1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2015 Intel Corporation. 3 * All rights reserved. 4 * 5 * Copyright (c) 2019-2021 Mellanox Technologies LTD. All rights reserved. 6 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 7 */ 8 9 #include "spdk/stdinc.h" 10 11 #include "spdk/config.h" 12 #include "spdk/env.h" 13 #include "spdk/fd.h" 14 #include "spdk/nvme.h" 15 #include "spdk/vmd.h" 16 #include "spdk/queue.h" 17 #include "spdk/string.h" 18 #include "spdk/nvme_intel.h" 19 #include "spdk/histogram_data.h" 20 #include "spdk/endian.h" 21 #include "spdk/dif.h" 22 #include "spdk/util.h" 23 #include "spdk/log.h" 24 #include "spdk/likely.h" 25 #include "spdk/sock.h" 26 #include "spdk/zipf.h" 27 #include "spdk/nvmf.h" 28 29 #ifdef SPDK_CONFIG_URING 30 #include <liburing.h> 31 #endif 32 33 #if HAVE_LIBAIO 34 #include <libaio.h> 35 #endif 36 37 struct ctrlr_entry { 38 struct spdk_nvme_ctrlr *ctrlr; 39 enum spdk_nvme_transport_type trtype; 40 struct spdk_nvme_intel_rw_latency_page *latency_page; 41 42 struct spdk_nvme_qpair **unused_qpairs; 43 44 TAILQ_ENTRY(ctrlr_entry) link; 45 char name[1024]; 46 }; 47 48 enum entry_type { 49 ENTRY_TYPE_NVME_NS, 50 ENTRY_TYPE_AIO_FILE, 51 ENTRY_TYPE_URING_FILE, 52 }; 53 54 struct ns_fn_table; 55 56 struct ns_entry { 57 enum entry_type type; 58 const struct ns_fn_table *fn_table; 59 60 union { 61 struct { 62 struct spdk_nvme_ctrlr *ctrlr; 63 struct spdk_nvme_ns *ns; 64 } nvme; 65 #ifdef SPDK_CONFIG_URING 66 struct { 67 int fd; 68 } uring; 69 #endif 70 #if HAVE_LIBAIO 71 struct { 72 int fd; 73 } aio; 74 #endif 75 } u; 76 77 TAILQ_ENTRY(ns_entry) link; 78 uint32_t io_size_blocks; 79 uint32_t num_io_requests; 80 uint64_t size_in_ios; 81 uint32_t block_size; 82 uint32_t md_size; 83 bool md_interleave; 84 unsigned int seed; 85 struct spdk_zipf *zipf; 86 bool pi_loc; 87 enum spdk_nvme_pi_type pi_type; 88 uint32_t io_flags; 89 char name[1024]; 90 }; 91 92 static const double g_latency_cutoffs[] = { 93 0.01, 94 0.10, 95 0.25, 96 0.50, 97 0.75, 98 0.90, 99 0.95, 100 0.98, 101 0.99, 102 0.995, 103 0.999, 104 0.9999, 105 0.99999, 106 0.999999, 107 0.9999999, 108 -1, 109 }; 110 111 struct ns_worker_stats { 112 uint64_t io_submitted; 113 uint64_t io_completed; 114 uint64_t last_io_completed; 115 uint64_t total_tsc; 116 uint64_t min_tsc; 117 uint64_t max_tsc; 118 uint64_t last_tsc; 119 uint64_t busy_tsc; 120 uint64_t idle_tsc; 121 uint64_t last_busy_tsc; 122 uint64_t last_idle_tsc; 123 }; 124 125 struct ns_worker_ctx { 126 struct ns_entry *entry; 127 struct ns_worker_stats stats; 128 uint64_t current_queue_depth; 129 uint64_t offset_in_ios; 130 bool is_draining; 131 132 union { 133 struct { 134 int num_active_qpairs; 135 int num_all_qpairs; 136 struct spdk_nvme_qpair **qpair; 137 struct spdk_nvme_poll_group *group; 138 int last_qpair; 139 } nvme; 140 141 #ifdef SPDK_CONFIG_URING 142 struct { 143 struct io_uring ring; 144 uint64_t io_inflight; 145 uint64_t io_pending; 146 struct io_uring_cqe **cqes; 147 148 } uring; 149 #endif 150 #if HAVE_LIBAIO 151 struct { 152 struct io_event *events; 153 io_context_t ctx; 154 } aio; 155 #endif 156 } u; 157 158 TAILQ_ENTRY(ns_worker_ctx) link; 159 160 TAILQ_HEAD(, perf_task) queued_tasks; 161 162 struct spdk_histogram_data *histogram; 163 int status; 164 }; 165 166 struct perf_task { 167 struct ns_worker_ctx *ns_ctx; 168 struct iovec *iovs; /* array of iovecs to transfer. */ 169 int iovcnt; /* Number of iovecs in iovs array. */ 170 int iovpos; /* Current iovec position. 
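* Advanced by nvme_perf_next_sge() and rewound by nvme_perf_reset_sgl() while the SGL callbacks walk the transfer.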
*/ 171 uint32_t iov_offset; /* Offset in current iovec. */ 172 struct iovec md_iov; 173 uint64_t submit_tsc; 174 bool is_read; 175 struct spdk_dif_ctx dif_ctx; 176 #if HAVE_LIBAIO 177 struct iocb iocb; 178 #endif 179 TAILQ_ENTRY(perf_task) link; 180 }; 181 182 struct worker_thread { 183 TAILQ_HEAD(, ns_worker_ctx) ns_ctx; 184 TAILQ_ENTRY(worker_thread) link; 185 unsigned lcore; 186 }; 187 188 struct ns_fn_table { 189 void (*setup_payload)(struct perf_task *task, uint8_t pattern); 190 191 int (*submit_io)(struct perf_task *task, struct ns_worker_ctx *ns_ctx, 192 struct ns_entry *entry, uint64_t offset_in_ios); 193 194 int64_t (*check_io)(struct ns_worker_ctx *ns_ctx); 195 196 void (*verify_io)(struct perf_task *task, struct ns_entry *entry); 197 198 int (*init_ns_worker_ctx)(struct ns_worker_ctx *ns_ctx); 199 200 void (*cleanup_ns_worker_ctx)(struct ns_worker_ctx *ns_ctx); 201 void (*dump_transport_stats)(uint32_t lcore, struct ns_worker_ctx *ns_ctx); 202 }; 203 204 static uint32_t g_io_unit_size = (UINT32_MAX & (~0x03)); 205 206 static int g_outstanding_commands; 207 208 static bool g_latency_ssd_tracking_enable; 209 static int g_latency_sw_tracking_level; 210 211 static bool g_vmd; 212 static const char *g_workload_type; 213 static TAILQ_HEAD(, ctrlr_entry) g_controllers = TAILQ_HEAD_INITIALIZER(g_controllers); 214 static TAILQ_HEAD(, ns_entry) g_namespaces = TAILQ_HEAD_INITIALIZER(g_namespaces); 215 static uint32_t g_num_namespaces; 216 static TAILQ_HEAD(, worker_thread) g_workers = TAILQ_HEAD_INITIALIZER(g_workers); 217 static uint32_t g_num_workers = 0; 218 static bool g_use_every_core = false; 219 static uint32_t g_main_core; 220 static pthread_barrier_t g_worker_sync_barrier; 221 222 static uint64_t g_tsc_rate; 223 224 static bool g_monitor_perf_cores = false; 225 226 static uint32_t g_io_align = 0x200; 227 static bool g_io_align_specified; 228 static uint32_t g_io_size_bytes; 229 static uint32_t g_max_io_md_size; 230 static uint32_t g_max_io_size_blocks; 231 static uint32_t g_metacfg_pract_flag; 232 static uint32_t g_metacfg_prchk_flags; 233 static int g_rw_percentage = -1; 234 static int g_is_random; 235 static uint32_t g_queue_depth; 236 static int g_nr_io_queues_per_ns = 1; 237 static int g_nr_unused_io_queues; 238 static int g_time_in_sec; 239 static uint64_t g_number_ios; 240 static uint64_t g_elapsed_time_in_usec; 241 static int g_warmup_time_in_sec; 242 static uint32_t g_max_completions; 243 static uint32_t g_disable_sq_cmb; 244 static bool g_use_uring; 245 static bool g_warn; 246 static bool g_header_digest; 247 static bool g_data_digest; 248 static bool g_no_shn_notification; 249 static bool g_mix_specified; 250 /* The flag is used to exit the program while keep alive fails on the transport */ 251 static bool g_exit; 252 /* Default to 10 seconds for the keep alive value. This value is arbitrary. */ 253 static uint32_t g_keep_alive_timeout_in_ms = 10000; 254 static bool g_continue_on_error = false; 255 static uint32_t g_quiet_count = 1; 256 static double g_zipf_theta; 257 /* Set default io_queue_size to UINT16_MAX, NVMe driver will then reduce this 258 * to MQES to maximize the io_queue_size as much as possible. 259 */ 260 static uint32_t g_io_queue_size = UINT16_MAX; 261 262 static uint32_t g_sock_zcopy_threshold; 263 static char *g_sock_threshold_impl; 264 265 static uint8_t g_transport_tos = 0; 266 267 static uint32_t g_rdma_srq_size; 268 uint8_t *g_psk = NULL; 269 270 /* When user specifies -Q, some error messages are rate limited. 
When rate 271 * limited, we only print the error message every g_quiet_count times the 272 * error occurs. 273 * 274 * Note: the __count is not thread safe, meaning the rate limiting will not 275 * be exact when running perf with multiple thread with lots of errors. 276 * Thread-local __count would mean rate-limiting per thread which doesn't 277 * seem as useful. 278 */ 279 #define RATELIMIT_LOG(...) \ 280 { \ 281 static uint64_t __count = 0; \ 282 if ((__count % g_quiet_count) == 0) { \ 283 if (__count > 0 && g_quiet_count > 1) { \ 284 fprintf(stderr, "Message suppressed %" PRIu32 " times: ", \ 285 g_quiet_count - 1); \ 286 } \ 287 fprintf(stderr, __VA_ARGS__); \ 288 } \ 289 __count++; \ 290 } 291 292 static bool g_dump_transport_stats; 293 static pthread_mutex_t g_stats_mutex; 294 295 #define MAX_ALLOWED_PCI_DEVICE_NUM 128 296 static struct spdk_pci_addr g_allowed_pci_addr[MAX_ALLOWED_PCI_DEVICE_NUM]; 297 298 struct trid_entry { 299 struct spdk_nvme_transport_id trid; 300 uint16_t nsid; 301 char hostnqn[SPDK_NVMF_NQN_MAX_LEN + 1]; 302 TAILQ_ENTRY(trid_entry) tailq; 303 }; 304 305 static TAILQ_HEAD(, trid_entry) g_trid_list = TAILQ_HEAD_INITIALIZER(g_trid_list); 306 307 static int g_file_optind; /* Index of first filename in argv */ 308 309 static inline void task_complete(struct perf_task *task); 310 311 static void 312 perf_set_sock_opts(const char *impl_name, const char *field, uint32_t val, const char *valstr) 313 { 314 struct spdk_sock_impl_opts sock_opts = {}; 315 size_t opts_size = sizeof(sock_opts); 316 int rc; 317 318 rc = spdk_sock_impl_get_opts(impl_name, &sock_opts, &opts_size); 319 if (rc != 0) { 320 if (errno == EINVAL) { 321 fprintf(stderr, "Unknown sock impl %s\n", impl_name); 322 } else { 323 fprintf(stderr, "Failed to get opts for sock impl %s: error %d (%s)\n", impl_name, errno, 324 strerror(errno)); 325 } 326 return; 327 } 328 329 if (opts_size != sizeof(sock_opts)) { 330 fprintf(stderr, "Warning: sock_opts size mismatch. 
Expected %zu, received %zu\n", 331 sizeof(sock_opts), opts_size); 332 opts_size = sizeof(sock_opts); 333 } 334 335 if (!field) { 336 fprintf(stderr, "Warning: no socket opts field specified\n"); 337 return; 338 } else if (strcmp(field, "enable_zerocopy_send_client") == 0) { 339 sock_opts.enable_zerocopy_send_client = val; 340 } else if (strcmp(field, "tls_version") == 0) { 341 sock_opts.tls_version = val; 342 } else if (strcmp(field, "ktls") == 0) { 343 sock_opts.enable_ktls = val; 344 } else if (strcmp(field, "psk_path") == 0) { 345 if (!valstr) { 346 fprintf(stderr, "No socket opts value specified\n"); 347 return; 348 } 349 g_psk = calloc(1, SPDK_TLS_PSK_MAX_LEN + 1); 350 if (g_psk == NULL) { 351 fprintf(stderr, "Failed to allocate memory for psk\n"); 352 return; 353 } 354 FILE *psk_file = fopen(valstr, "r"); 355 if (psk_file == NULL) { 356 fprintf(stderr, "Could not open PSK file\n"); 357 return; 358 } 359 if (fscanf(psk_file, "%" SPDK_STRINGIFY(SPDK_TLS_PSK_MAX_LEN) "s", g_psk) != 1) { 360 fprintf(stderr, "Could not retrieve PSK from file\n"); 361 fclose(psk_file); 362 return; 363 } 364 if (fclose(psk_file)) { 365 fprintf(stderr, "Failed to close PSK file\n"); 366 return; 367 } 368 } else if (strcmp(field, "zerocopy_threshold") == 0) { 369 sock_opts.zerocopy_threshold = val; 370 } else { 371 fprintf(stderr, "Warning: invalid or unprocessed socket opts field: %s\n", field); 372 return; 373 } 374 375 if (spdk_sock_impl_set_opts(impl_name, &sock_opts, opts_size)) { 376 fprintf(stderr, "Failed to set %s: %d for sock impl %s : error %d (%s)\n", field, val, impl_name, 377 errno, strerror(errno)); 378 } 379 } 380 381 static void 382 nvme_perf_reset_sgl(void *ref, uint32_t sgl_offset) 383 { 384 struct iovec *iov; 385 struct perf_task *task = (struct perf_task *)ref; 386 387 task->iov_offset = sgl_offset; 388 for (task->iovpos = 0; task->iovpos < task->iovcnt; task->iovpos++) { 389 iov = &task->iovs[task->iovpos]; 390 if (task->iov_offset < iov->iov_len) { 391 break; 392 } 393 394 task->iov_offset -= iov->iov_len; 395 } 396 } 397 398 static int 399 nvme_perf_next_sge(void *ref, void **address, uint32_t *length) 400 { 401 struct iovec *iov; 402 struct perf_task *task = (struct perf_task *)ref; 403 404 assert(task->iovpos < task->iovcnt); 405 406 iov = &task->iovs[task->iovpos]; 407 assert(task->iov_offset <= iov->iov_len); 408 409 *address = iov->iov_base + task->iov_offset; 410 *length = iov->iov_len - task->iov_offset; 411 task->iovpos++; 412 task->iov_offset = 0; 413 414 return 0; 415 } 416 417 static int 418 nvme_perf_allocate_iovs(struct perf_task *task, void *buf, uint32_t length) 419 { 420 int iovpos = 0; 421 struct iovec *iov; 422 uint32_t offset = 0; 423 424 task->iovcnt = SPDK_CEIL_DIV(length, (uint64_t)g_io_unit_size); 425 task->iovs = calloc(task->iovcnt, sizeof(struct iovec)); 426 if (!task->iovs) { 427 return -1; 428 } 429 430 while (length > 0) { 431 iov = &task->iovs[iovpos]; 432 iov->iov_len = spdk_min(length, g_io_unit_size); 433 iov->iov_base = buf + offset; 434 length -= iov->iov_len; 435 offset += iov->iov_len; 436 iovpos++; 437 } 438 439 return 0; 440 } 441 442 #ifdef SPDK_CONFIG_URING 443 444 static void 445 uring_setup_payload(struct perf_task *task, uint8_t pattern) 446 { 447 struct iovec *iov; 448 449 task->iovs = calloc(1, sizeof(struct iovec)); 450 if (!task->iovs) { 451 fprintf(stderr, "perf task failed to allocate iovs\n"); 452 exit(1); 453 } 454 task->iovcnt = 1; 455 456 iov = &task->iovs[0]; 457 iov->iov_base = spdk_dma_zmalloc(g_io_size_bytes, g_io_align, NULL); 
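/* Kernel I/O paths (io_uring here, libaio below) use a single iovec per task; spdk_dma_zmalloc() honors g_io_align, which register_file() raises to the device block size when required for O_DIRECT. */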
458 iov->iov_len = g_io_size_bytes; 459 if (iov->iov_base == NULL) { 460 fprintf(stderr, "spdk_dma_zmalloc() for task->iovs[0].iov_base failed\n"); 461 free(task->iovs); 462 exit(1); 463 } 464 memset(iov->iov_base, pattern, iov->iov_len); 465 } 466 467 static int 468 uring_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx, 469 struct ns_entry *entry, uint64_t offset_in_ios) 470 { 471 struct io_uring_sqe *sqe; 472 473 sqe = io_uring_get_sqe(&ns_ctx->u.uring.ring); 474 if (!sqe) { 475 fprintf(stderr, "Cannot get sqe\n"); 476 return -1; 477 } 478 479 if (task->is_read) { 480 io_uring_prep_readv(sqe, entry->u.uring.fd, task->iovs, 1, offset_in_ios * task->iovs[0].iov_len); 481 } else { 482 io_uring_prep_writev(sqe, entry->u.uring.fd, task->iovs, 1, offset_in_ios * task->iovs[0].iov_len); 483 } 484 485 io_uring_sqe_set_data(sqe, task); 486 ns_ctx->u.uring.io_pending++; 487 488 return 0; 489 } 490 491 static int64_t 492 uring_check_io(struct ns_worker_ctx *ns_ctx) 493 { 494 int i, to_complete, to_submit, count = 0, ret = 0; 495 struct perf_task *task; 496 497 to_submit = ns_ctx->u.uring.io_pending; 498 499 if (to_submit > 0) { 500 /* If there are I/O to submit, use io_uring_submit here. 501 * It will automatically call spdk_io_uring_enter appropriately. */ 502 ret = io_uring_submit(&ns_ctx->u.uring.ring); 503 if (ret < 0) { 504 ns_ctx->status = 1; 505 return -1; 506 } 507 ns_ctx->u.uring.io_pending = 0; 508 ns_ctx->u.uring.io_inflight += to_submit; 509 } 510 511 to_complete = ns_ctx->u.uring.io_inflight; 512 if (to_complete > 0) { 513 count = io_uring_peek_batch_cqe(&ns_ctx->u.uring.ring, ns_ctx->u.uring.cqes, to_complete); 514 ns_ctx->u.uring.io_inflight -= count; 515 for (i = 0; i < count; i++) { 516 int res; 517 518 assert(ns_ctx->u.uring.cqes[i] != NULL); 519 task = (struct perf_task *)ns_ctx->u.uring.cqes[i]->user_data; 520 res = ns_ctx->u.uring.cqes[i]->res; 521 if (res != (int)task->iovs[0].iov_len) { 522 fprintf(stderr, "cqe->status=%d, iov_len=%d\n", res, 523 (int)task->iovs[0].iov_len); 524 ns_ctx->status = 1; 525 if (res == -EIO) { 526 /* The block device has been removed. 527 * Stop trying to send I/O to it. 
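* Draining stops new submissions while the remaining in-flight completions are reaped.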
528 */ 529 ns_ctx->is_draining = true; 530 } 531 } 532 io_uring_cqe_seen(&ns_ctx->u.uring.ring, ns_ctx->u.uring.cqes[i]); 533 task_complete(task); 534 } 535 } 536 return count; 537 } 538 539 static void 540 uring_verify_io(struct perf_task *task, struct ns_entry *entry) 541 { 542 } 543 544 static int 545 uring_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 546 { 547 if (io_uring_queue_init(g_queue_depth, &ns_ctx->u.uring.ring, 0) < 0) { 548 SPDK_ERRLOG("uring I/O context setup failure\n"); 549 return -1; 550 } 551 552 ns_ctx->u.uring.cqes = calloc(g_queue_depth, sizeof(struct io_uring_cqe *)); 553 if (!ns_ctx->u.uring.cqes) { 554 io_uring_queue_exit(&ns_ctx->u.uring.ring); 555 return -1; 556 } 557 558 return 0; 559 } 560 561 static void 562 uring_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 563 { 564 io_uring_queue_exit(&ns_ctx->u.uring.ring); 565 free(ns_ctx->u.uring.cqes); 566 } 567 568 static const struct ns_fn_table uring_fn_table = { 569 .setup_payload = uring_setup_payload, 570 .submit_io = uring_submit_io, 571 .check_io = uring_check_io, 572 .verify_io = uring_verify_io, 573 .init_ns_worker_ctx = uring_init_ns_worker_ctx, 574 .cleanup_ns_worker_ctx = uring_cleanup_ns_worker_ctx, 575 }; 576 577 #endif 578 579 #ifdef HAVE_LIBAIO 580 static void 581 aio_setup_payload(struct perf_task *task, uint8_t pattern) 582 { 583 struct iovec *iov; 584 585 task->iovs = calloc(1, sizeof(struct iovec)); 586 if (!task->iovs) { 587 fprintf(stderr, "perf task failed to allocate iovs\n"); 588 exit(1); 589 } 590 task->iovcnt = 1; 591 592 iov = &task->iovs[0]; 593 iov->iov_base = spdk_dma_zmalloc(g_io_size_bytes, g_io_align, NULL); 594 iov->iov_len = g_io_size_bytes; 595 if (iov->iov_base == NULL) { 596 fprintf(stderr, "spdk_dma_zmalloc() for task->iovs[0].iov_base failed\n"); 597 free(task->iovs); 598 exit(1); 599 } 600 memset(iov->iov_base, pattern, iov->iov_len); 601 } 602 603 static int 604 aio_submit(io_context_t aio_ctx, struct iocb *iocb, int fd, enum io_iocb_cmd cmd, 605 struct iovec *iov, uint64_t offset, void *cb_ctx) 606 { 607 iocb->aio_fildes = fd; 608 iocb->aio_reqprio = 0; 609 iocb->aio_lio_opcode = cmd; 610 iocb->u.c.buf = iov->iov_base; 611 iocb->u.c.nbytes = iov->iov_len; 612 iocb->u.c.offset = offset * iov->iov_len; 613 iocb->data = cb_ctx; 614 615 if (io_submit(aio_ctx, 1, &iocb) < 0) { 616 printf("io_submit"); 617 return -1; 618 } 619 620 return 0; 621 } 622 623 static int 624 aio_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx, 625 struct ns_entry *entry, uint64_t offset_in_ios) 626 { 627 if (task->is_read) { 628 return aio_submit(ns_ctx->u.aio.ctx, &task->iocb, entry->u.aio.fd, IO_CMD_PREAD, 629 task->iovs, offset_in_ios, task); 630 } else { 631 return aio_submit(ns_ctx->u.aio.ctx, &task->iocb, entry->u.aio.fd, IO_CMD_PWRITE, 632 task->iovs, offset_in_ios, task); 633 } 634 } 635 636 static int64_t 637 aio_check_io(struct ns_worker_ctx *ns_ctx) 638 { 639 int count, i; 640 struct timespec timeout; 641 struct perf_task *task; 642 643 timeout.tv_sec = 0; 644 timeout.tv_nsec = 0; 645 646 count = io_getevents(ns_ctx->u.aio.ctx, 1, g_queue_depth, ns_ctx->u.aio.events, &timeout); 647 if (count < 0) { 648 fprintf(stderr, "io_getevents error\n"); 649 ns_ctx->status = 1; 650 return -1; 651 } 652 653 for (i = 0; i < count; i++) { 654 unsigned long res; 655 656 task = (struct perf_task *)ns_ctx->u.aio.events[i].data; 657 res = ns_ctx->u.aio.events[i].res; 658 if (res != (uint64_t)task->iovs[0].iov_len) { 659 fprintf(stderr, "event->res=%ld, iov_len=%lu\n", (long)res, 660 
(uint64_t)task->iovs[0].iov_len); 661 ns_ctx->status = 1; 662 if ((long)res == -EIO) { 663 /* The block device has been removed. Stop trying to send I/O to it. */ 664 ns_ctx->is_draining = true; 665 } 666 } 667 task_complete(ns_ctx->u.aio.events[i].data); 668 } 669 return count; 670 } 671 672 static void 673 aio_verify_io(struct perf_task *task, struct ns_entry *entry) 674 { 675 } 676 677 static int 678 aio_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 679 { 680 ns_ctx->u.aio.events = calloc(g_queue_depth, sizeof(struct io_event)); 681 if (!ns_ctx->u.aio.events) { 682 return -1; 683 } 684 ns_ctx->u.aio.ctx = 0; 685 if (io_setup(g_queue_depth, &ns_ctx->u.aio.ctx) < 0) { 686 free(ns_ctx->u.aio.events); 687 perror("io_setup"); 688 return -1; 689 } 690 return 0; 691 } 692 693 static void 694 aio_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 695 { 696 io_destroy(ns_ctx->u.aio.ctx); 697 free(ns_ctx->u.aio.events); 698 } 699 700 static const struct ns_fn_table aio_fn_table = { 701 .setup_payload = aio_setup_payload, 702 .submit_io = aio_submit_io, 703 .check_io = aio_check_io, 704 .verify_io = aio_verify_io, 705 .init_ns_worker_ctx = aio_init_ns_worker_ctx, 706 .cleanup_ns_worker_ctx = aio_cleanup_ns_worker_ctx, 707 }; 708 709 #endif /* HAVE_LIBAIO */ 710 711 #if defined(HAVE_LIBAIO) || defined(SPDK_CONFIG_URING) 712 713 static int 714 register_file(const char *path) 715 { 716 struct ns_entry *entry; 717 718 int flags, fd; 719 uint64_t size; 720 uint32_t blklen; 721 722 if (g_rw_percentage == 100) { 723 flags = O_RDONLY; 724 } else if (g_rw_percentage == 0) { 725 flags = O_WRONLY; 726 } else { 727 flags = O_RDWR; 728 } 729 730 flags |= O_DIRECT; 731 732 fd = open(path, flags); 733 if (fd < 0) { 734 fprintf(stderr, "Could not open device %s: %s\n", path, strerror(errno)); 735 return -1; 736 } 737 738 size = spdk_fd_get_size(fd); 739 if (size == 0) { 740 fprintf(stderr, "Could not determine size of device %s\n", path); 741 close(fd); 742 return -1; 743 } 744 745 blklen = spdk_fd_get_blocklen(fd); 746 if (blklen == 0) { 747 fprintf(stderr, "Could not determine block size of device %s\n", path); 748 close(fd); 749 return -1; 750 } 751 752 /* 753 * TODO: This should really calculate the LCM of the current g_io_align and blklen. 754 * For now, it's fairly safe to just assume all block sizes are powers of 2. 755 */ 756 if (g_io_align < blklen) { 757 if (g_io_align_specified) { 758 fprintf(stderr, "Wrong IO alignment (%u). 
aio requires block-sized alignment (%u)\n", g_io_align, 759 blklen); 760 close(fd); 761 return -1; 762 } 763 764 g_io_align = blklen; 765 } 766 767 entry = calloc(1, sizeof(struct ns_entry)); 768 if (entry == NULL) { 769 close(fd); 770 perror("ns_entry malloc"); 771 return -1; 772 } 773 774 if (g_use_uring) { 775 #ifdef SPDK_CONFIG_URING 776 entry->type = ENTRY_TYPE_URING_FILE; 777 entry->fn_table = &uring_fn_table; 778 entry->u.uring.fd = fd; 779 #endif 780 } else { 781 #if HAVE_LIBAIO 782 entry->type = ENTRY_TYPE_AIO_FILE; 783 entry->fn_table = &aio_fn_table; 784 entry->u.aio.fd = fd; 785 #endif 786 } 787 entry->size_in_ios = size / g_io_size_bytes; 788 entry->io_size_blocks = g_io_size_bytes / blklen; 789 790 if (g_is_random) { 791 entry->seed = rand(); 792 if (g_zipf_theta > 0) { 793 entry->zipf = spdk_zipf_create(entry->size_in_ios, g_zipf_theta, 0); 794 } 795 } 796 797 snprintf(entry->name, sizeof(entry->name), "%s", path); 798 799 g_num_namespaces++; 800 TAILQ_INSERT_TAIL(&g_namespaces, entry, link); 801 802 return 0; 803 } 804 805 static int 806 register_files(int argc, char **argv) 807 { 808 int i; 809 810 /* Treat everything after the options as files for AIO/URING */ 811 for (i = g_file_optind; i < argc; i++) { 812 if (register_file(argv[i]) != 0) { 813 return 1; 814 } 815 } 816 817 return 0; 818 } 819 #endif 820 821 static void io_complete(void *ctx, const struct spdk_nvme_cpl *cpl); 822 823 static void 824 nvme_setup_payload(struct perf_task *task, uint8_t pattern) 825 { 826 uint32_t max_io_size_bytes, max_io_md_size; 827 void *buf; 828 int rc; 829 830 /* maximum extended lba format size from all active namespace, 831 * it's same with g_io_size_bytes for namespace without metadata. 832 */ 833 max_io_size_bytes = g_io_size_bytes + g_max_io_md_size * g_max_io_size_blocks; 834 buf = spdk_dma_zmalloc(max_io_size_bytes, g_io_align, NULL); 835 if (buf == NULL) { 836 fprintf(stderr, "task->buf spdk_dma_zmalloc failed\n"); 837 exit(1); 838 } 839 memset(buf, pattern, max_io_size_bytes); 840 841 rc = nvme_perf_allocate_iovs(task, buf, max_io_size_bytes); 842 if (rc < 0) { 843 fprintf(stderr, "perf task failed to allocate iovs\n"); 844 spdk_dma_free(buf); 845 exit(1); 846 } 847 848 max_io_md_size = g_max_io_md_size * g_max_io_size_blocks; 849 if (max_io_md_size != 0) { 850 task->md_iov.iov_base = spdk_dma_zmalloc(max_io_md_size, g_io_align, NULL); 851 task->md_iov.iov_len = max_io_md_size; 852 if (task->md_iov.iov_base == NULL) { 853 fprintf(stderr, "task->md_buf spdk_dma_zmalloc failed\n"); 854 spdk_dma_free(task->iovs[0].iov_base); 855 free(task->iovs); 856 exit(1); 857 } 858 } 859 } 860 861 static int 862 nvme_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx, 863 struct ns_entry *entry, uint64_t offset_in_ios) 864 { 865 uint64_t lba; 866 int rc; 867 int qp_num; 868 struct spdk_dif_ctx_init_ext_opts dif_opts; 869 870 enum dif_mode { 871 DIF_MODE_NONE = 0, 872 DIF_MODE_DIF = 1, 873 DIF_MODE_DIX = 2, 874 } mode = DIF_MODE_NONE; 875 876 lba = offset_in_ios * entry->io_size_blocks; 877 878 if (entry->md_size != 0 && !(entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT)) { 879 if (entry->md_interleave) { 880 mode = DIF_MODE_DIF; 881 } else { 882 mode = DIF_MODE_DIX; 883 } 884 } 885 886 qp_num = ns_ctx->u.nvme.last_qpair; 887 ns_ctx->u.nvme.last_qpair++; 888 if (ns_ctx->u.nvme.last_qpair == ns_ctx->u.nvme.num_active_qpairs) { 889 ns_ctx->u.nvme.last_qpair = 0; 890 } 891 892 if (mode != DIF_MODE_NONE) { 893 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 894 
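/* The DIF context initialized here is used to generate PI on writes (spdk_dif_generate/spdk_dix_generate below) and to verify it after reads in nvme_verify_io(). */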
dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 895 rc = spdk_dif_ctx_init(&task->dif_ctx, entry->block_size, entry->md_size, 896 entry->md_interleave, entry->pi_loc, 897 (enum spdk_dif_type)entry->pi_type, entry->io_flags, 898 lba, 0xFFFF, (uint16_t)entry->io_size_blocks, 0, 0, &dif_opts); 899 if (rc != 0) { 900 fprintf(stderr, "Initialization of DIF context failed\n"); 901 exit(1); 902 } 903 } 904 905 if (task->is_read) { 906 if (task->iovcnt == 1) { 907 return spdk_nvme_ns_cmd_read_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair[qp_num], 908 task->iovs[0].iov_base, task->md_iov.iov_base, 909 lba, 910 entry->io_size_blocks, io_complete, 911 task, entry->io_flags, 912 task->dif_ctx.apptag_mask, task->dif_ctx.app_tag); 913 } else { 914 return spdk_nvme_ns_cmd_readv_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair[qp_num], 915 lba, entry->io_size_blocks, 916 io_complete, task, entry->io_flags, 917 nvme_perf_reset_sgl, nvme_perf_next_sge, 918 task->md_iov.iov_base, 919 task->dif_ctx.apptag_mask, task->dif_ctx.app_tag); 920 } 921 } else { 922 switch (mode) { 923 case DIF_MODE_DIF: 924 rc = spdk_dif_generate(task->iovs, task->iovcnt, entry->io_size_blocks, &task->dif_ctx); 925 if (rc != 0) { 926 fprintf(stderr, "Generation of DIF failed\n"); 927 return rc; 928 } 929 break; 930 case DIF_MODE_DIX: 931 rc = spdk_dix_generate(task->iovs, task->iovcnt, &task->md_iov, entry->io_size_blocks, 932 &task->dif_ctx); 933 if (rc != 0) { 934 fprintf(stderr, "Generation of DIX failed\n"); 935 return rc; 936 } 937 break; 938 default: 939 break; 940 } 941 942 if (task->iovcnt == 1) { 943 return spdk_nvme_ns_cmd_write_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair[qp_num], 944 task->iovs[0].iov_base, task->md_iov.iov_base, 945 lba, 946 entry->io_size_blocks, io_complete, 947 task, entry->io_flags, 948 task->dif_ctx.apptag_mask, task->dif_ctx.app_tag); 949 } else { 950 return spdk_nvme_ns_cmd_writev_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair[qp_num], 951 lba, entry->io_size_blocks, 952 io_complete, task, entry->io_flags, 953 nvme_perf_reset_sgl, nvme_perf_next_sge, 954 task->md_iov.iov_base, 955 task->dif_ctx.apptag_mask, task->dif_ctx.app_tag); 956 } 957 } 958 } 959 960 static void 961 perf_disconnect_cb(struct spdk_nvme_qpair *qpair, void *ctx) 962 { 963 struct ns_worker_ctx *ns_ctx = ctx; 964 965 ns_ctx->is_draining = true; 966 ns_ctx->status = 1; 967 } 968 969 static int64_t 970 nvme_check_io(struct ns_worker_ctx *ns_ctx) 971 { 972 int64_t rc; 973 974 rc = spdk_nvme_poll_group_process_completions(ns_ctx->u.nvme.group, g_max_completions, 975 perf_disconnect_cb); 976 if (rc < 0) { 977 fprintf(stderr, "NVMe io qpair process completion error\n"); 978 ns_ctx->status = 1; 979 return -1; 980 } 981 return rc; 982 } 983 984 static void 985 nvme_verify_io(struct perf_task *task, struct ns_entry *entry) 986 { 987 struct spdk_dif_error err_blk = {}; 988 int rc; 989 990 if (!task->is_read || (entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT)) { 991 return; 992 } 993 994 if (entry->md_interleave) { 995 rc = spdk_dif_verify(task->iovs, task->iovcnt, entry->io_size_blocks, &task->dif_ctx, 996 &err_blk); 997 if (rc != 0) { 998 fprintf(stderr, "DIF error detected. type=%d, offset=%" PRIu32 "\n", 999 err_blk.err_type, err_blk.err_offset); 1000 } 1001 } else { 1002 rc = spdk_dix_verify(task->iovs, task->iovcnt, &task->md_iov, entry->io_size_blocks, 1003 &task->dif_ctx, &err_blk); 1004 if (rc != 0) { 1005 fprintf(stderr, "DIX error detected. 
type=%d, offset=%" PRIu32 "\n", 1006 err_blk.err_type, err_blk.err_offset); 1007 } 1008 } 1009 } 1010 1011 /* 1012 * TODO: If a controller has multiple namespaces, they could all use the same queue. 1013 * For now, give each namespace/thread combination its own queue. 1014 */ 1015 static int 1016 nvme_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 1017 { 1018 const struct spdk_nvme_ctrlr_opts *ctrlr_opts; 1019 struct spdk_nvme_io_qpair_opts opts; 1020 struct ns_entry *entry = ns_ctx->entry; 1021 struct spdk_nvme_poll_group *group; 1022 struct spdk_nvme_qpair *qpair; 1023 uint64_t poll_timeout_tsc; 1024 int i, rc; 1025 1026 ns_ctx->u.nvme.num_active_qpairs = g_nr_io_queues_per_ns; 1027 ns_ctx->u.nvme.num_all_qpairs = g_nr_io_queues_per_ns + g_nr_unused_io_queues; 1028 ns_ctx->u.nvme.qpair = calloc(ns_ctx->u.nvme.num_all_qpairs, sizeof(struct spdk_nvme_qpair *)); 1029 if (!ns_ctx->u.nvme.qpair) { 1030 return -1; 1031 } 1032 1033 spdk_nvme_ctrlr_get_default_io_qpair_opts(entry->u.nvme.ctrlr, &opts, sizeof(opts)); 1034 if (opts.io_queue_requests < entry->num_io_requests) { 1035 opts.io_queue_requests = entry->num_io_requests; 1036 } 1037 opts.delay_cmd_submit = true; 1038 opts.create_only = true; 1039 1040 ctrlr_opts = spdk_nvme_ctrlr_get_opts(entry->u.nvme.ctrlr); 1041 opts.async_mode = !(spdk_nvme_ctrlr_get_transport_id(entry->u.nvme.ctrlr)->trtype == 1042 SPDK_NVME_TRANSPORT_PCIE 1043 && ns_ctx->u.nvme.num_all_qpairs > ctrlr_opts->admin_queue_size); 1044 1045 ns_ctx->u.nvme.group = spdk_nvme_poll_group_create(ns_ctx, NULL); 1046 if (ns_ctx->u.nvme.group == NULL) { 1047 goto poll_group_failed; 1048 } 1049 1050 group = ns_ctx->u.nvme.group; 1051 for (i = 0; i < ns_ctx->u.nvme.num_all_qpairs; i++) { 1052 ns_ctx->u.nvme.qpair[i] = spdk_nvme_ctrlr_alloc_io_qpair(entry->u.nvme.ctrlr, &opts, 1053 sizeof(opts)); 1054 qpair = ns_ctx->u.nvme.qpair[i]; 1055 if (!qpair) { 1056 printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n"); 1057 goto qpair_failed; 1058 } 1059 1060 if (spdk_nvme_poll_group_add(group, qpair)) { 1061 printf("ERROR: unable to add I/O qpair to poll group.\n"); 1062 spdk_nvme_ctrlr_free_io_qpair(qpair); 1063 goto qpair_failed; 1064 } 1065 1066 if (spdk_nvme_ctrlr_connect_io_qpair(entry->u.nvme.ctrlr, qpair)) { 1067 printf("ERROR: unable to connect I/O qpair.\n"); 1068 spdk_nvme_ctrlr_free_io_qpair(qpair); 1069 goto qpair_failed; 1070 } 1071 } 1072 1073 /* Busy poll here until all qpairs are connected - this ensures once we start 1074 * I/O we aren't still waiting for some qpairs to connect. Limit the poll to 1075 * 10 seconds though. 1076 */ 1077 poll_timeout_tsc = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 1078 rc = -EAGAIN; 1079 while (spdk_get_ticks() < poll_timeout_tsc && rc == -EAGAIN) { 1080 spdk_nvme_poll_group_process_completions(group, 0, perf_disconnect_cb); 1081 rc = spdk_nvme_poll_group_all_connected(group); 1082 if (rc == 0) { 1083 return 0; 1084 } 1085 } 1086 1087 /* If we reach here, it means we either timed out, or some connection failed. 
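* Fall through to the error path below, which frees every qpair created so far and then destroys the poll group.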
*/ 1088 assert(spdk_get_ticks() > poll_timeout_tsc || rc == -EIO); 1089 1090 qpair_failed: 1091 for (; i > 0; --i) { 1092 spdk_nvme_ctrlr_free_io_qpair(ns_ctx->u.nvme.qpair[i - 1]); 1093 } 1094 1095 spdk_nvme_poll_group_destroy(ns_ctx->u.nvme.group); 1096 poll_group_failed: 1097 free(ns_ctx->u.nvme.qpair); 1098 return -1; 1099 } 1100 1101 static void 1102 nvme_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 1103 { 1104 int i; 1105 1106 for (i = 0; i < ns_ctx->u.nvme.num_all_qpairs; i++) { 1107 spdk_nvme_ctrlr_free_io_qpair(ns_ctx->u.nvme.qpair[i]); 1108 } 1109 1110 spdk_nvme_poll_group_destroy(ns_ctx->u.nvme.group); 1111 free(ns_ctx->u.nvme.qpair); 1112 } 1113 1114 static void 1115 nvme_dump_rdma_statistics(struct spdk_nvme_transport_poll_group_stat *stat) 1116 { 1117 struct spdk_nvme_rdma_device_stat *device_stats; 1118 uint32_t i; 1119 1120 printf("RDMA transport:\n"); 1121 for (i = 0; i < stat->rdma.num_devices; i++) { 1122 device_stats = &stat->rdma.device_stats[i]; 1123 printf("\tdev name: %s\n", device_stats->name); 1124 printf("\tpolls: %"PRIu64"\n", device_stats->polls); 1125 printf("\tidle_polls: %"PRIu64"\n", device_stats->idle_polls); 1126 printf("\tcompletions: %"PRIu64"\n", device_stats->completions); 1127 printf("\tqueued_requests: %"PRIu64"\n", device_stats->queued_requests); 1128 printf("\ttotal_send_wrs: %"PRIu64"\n", device_stats->total_send_wrs); 1129 printf("\tsend_doorbell_updates: %"PRIu64"\n", device_stats->send_doorbell_updates); 1130 printf("\ttotal_recv_wrs: %"PRIu64"\n", device_stats->total_recv_wrs); 1131 printf("\trecv_doorbell_updates: %"PRIu64"\n", device_stats->recv_doorbell_updates); 1132 printf("\t---------------------------------\n"); 1133 } 1134 } 1135 1136 static void 1137 nvme_dump_pcie_statistics(struct spdk_nvme_transport_poll_group_stat *stat) 1138 { 1139 struct spdk_nvme_pcie_stat *pcie_stat; 1140 1141 pcie_stat = &stat->pcie; 1142 1143 printf("PCIE transport:\n"); 1144 printf("\tpolls: %"PRIu64"\n", pcie_stat->polls); 1145 printf("\tidle_polls: %"PRIu64"\n", pcie_stat->idle_polls); 1146 printf("\tcompletions: %"PRIu64"\n", pcie_stat->completions); 1147 printf("\tcq_mmio_doorbell_updates: %"PRIu64"\n", pcie_stat->cq_mmio_doorbell_updates); 1148 printf("\tcq_shadow_doorbell_updates: %"PRIu64"\n", pcie_stat->cq_shadow_doorbell_updates); 1149 printf("\tsubmitted_requests: %"PRIu64"\n", pcie_stat->submitted_requests); 1150 printf("\tsq_mmio_doorbell_updates: %"PRIu64"\n", pcie_stat->sq_mmio_doorbell_updates); 1151 printf("\tsq_shadow_doorbell_updates: %"PRIu64"\n", pcie_stat->sq_shadow_doorbell_updates); 1152 printf("\tqueued_requests: %"PRIu64"\n", pcie_stat->queued_requests); 1153 } 1154 1155 static void 1156 nvme_dump_tcp_statistics(struct spdk_nvme_transport_poll_group_stat *stat) 1157 { 1158 struct spdk_nvme_tcp_stat *tcp_stat; 1159 1160 tcp_stat = &stat->tcp; 1161 1162 printf("TCP transport:\n"); 1163 printf("\tpolls: %"PRIu64"\n", tcp_stat->polls); 1164 printf("\tidle_polls: %"PRIu64"\n", tcp_stat->idle_polls); 1165 printf("\tsock_completions: %"PRIu64"\n", tcp_stat->socket_completions); 1166 printf("\tnvme_completions: %"PRIu64"\n", tcp_stat->nvme_completions); 1167 printf("\tsubmitted_requests: %"PRIu64"\n", tcp_stat->submitted_requests); 1168 printf("\tqueued_requests: %"PRIu64"\n", tcp_stat->queued_requests); 1169 } 1170 1171 static void 1172 nvme_dump_transport_stats(uint32_t lcore, struct ns_worker_ctx *ns_ctx) 1173 { 1174 struct spdk_nvme_poll_group *group; 1175 struct spdk_nvme_poll_group_stat *stat = NULL; 1176 uint32_t i; 1177 int 
rc; 1178 1179 group = ns_ctx->u.nvme.group; 1180 if (group == NULL) { 1181 return; 1182 } 1183 1184 rc = spdk_nvme_poll_group_get_stats(group, &stat); 1185 if (rc) { 1186 fprintf(stderr, "Can't get transport stats, error %d\n", rc); 1187 return; 1188 } 1189 1190 printf("\n====================\n"); 1191 printf("lcore %u, ns %s statistics:\n", lcore, ns_ctx->entry->name); 1192 1193 for (i = 0; i < stat->num_transports; i++) { 1194 switch (stat->transport_stat[i]->trtype) { 1195 case SPDK_NVME_TRANSPORT_RDMA: 1196 nvme_dump_rdma_statistics(stat->transport_stat[i]); 1197 break; 1198 case SPDK_NVME_TRANSPORT_PCIE: 1199 nvme_dump_pcie_statistics(stat->transport_stat[i]); 1200 break; 1201 case SPDK_NVME_TRANSPORT_TCP: 1202 nvme_dump_tcp_statistics(stat->transport_stat[i]); 1203 break; 1204 default: 1205 fprintf(stderr, "Unknown transport statistics %d %s\n", stat->transport_stat[i]->trtype, 1206 spdk_nvme_transport_id_trtype_str(stat->transport_stat[i]->trtype)); 1207 } 1208 } 1209 1210 spdk_nvme_poll_group_free_stats(group, stat); 1211 } 1212 1213 static const struct ns_fn_table nvme_fn_table = { 1214 .setup_payload = nvme_setup_payload, 1215 .submit_io = nvme_submit_io, 1216 .check_io = nvme_check_io, 1217 .verify_io = nvme_verify_io, 1218 .init_ns_worker_ctx = nvme_init_ns_worker_ctx, 1219 .cleanup_ns_worker_ctx = nvme_cleanup_ns_worker_ctx, 1220 .dump_transport_stats = nvme_dump_transport_stats 1221 }; 1222 1223 static int 1224 build_nvme_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr) 1225 { 1226 const struct spdk_nvme_transport_id *trid; 1227 int res = 0; 1228 1229 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 1230 1231 switch (trid->trtype) { 1232 case SPDK_NVME_TRANSPORT_PCIE: 1233 res = snprintf(name, length, "PCIE (%s)", trid->traddr); 1234 break; 1235 case SPDK_NVME_TRANSPORT_RDMA: 1236 res = snprintf(name, length, "RDMA (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); 1237 break; 1238 case SPDK_NVME_TRANSPORT_TCP: 1239 res = snprintf(name, length, "TCP (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); 1240 break; 1241 case SPDK_NVME_TRANSPORT_VFIOUSER: 1242 res = snprintf(name, length, "VFIOUSER (%s)", trid->traddr); 1243 break; 1244 case SPDK_NVME_TRANSPORT_CUSTOM: 1245 res = snprintf(name, length, "CUSTOM (%s)", trid->traddr); 1246 break; 1247 1248 default: 1249 fprintf(stderr, "Unknown transport type %d\n", trid->trtype); 1250 break; 1251 } 1252 return res; 1253 } 1254 1255 static void 1256 build_nvme_ns_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) 1257 { 1258 int res = 0; 1259 1260 res = build_nvme_name(name, length, ctrlr); 1261 if (res > 0) { 1262 snprintf(name + res, length - res, " NSID %u", nsid); 1263 } 1264 1265 } 1266 1267 static void 1268 register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns) 1269 { 1270 struct ns_entry *entry; 1271 const struct spdk_nvme_ctrlr_data *cdata; 1272 uint32_t max_xfer_size, entries, sector_size; 1273 uint64_t ns_size; 1274 struct spdk_nvme_io_qpair_opts opts; 1275 1276 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 1277 1278 if (!spdk_nvme_ns_is_active(ns)) { 1279 printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n", 1280 cdata->mn, cdata->sn, 1281 spdk_nvme_ns_get_id(ns)); 1282 g_warn = true; 1283 return; 1284 } 1285 1286 ns_size = spdk_nvme_ns_get_size(ns); 1287 sector_size = spdk_nvme_ns_get_sector_size(ns); 1288 1289 if (ns_size < g_io_size_bytes || sector_size > g_io_size_bytes) { 1290 printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid " 1291 "ns size 
%" PRIu64 " / block size %u for I/O size %u\n", 1292 cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns), 1293 ns_size, spdk_nvme_ns_get_sector_size(ns), g_io_size_bytes); 1294 g_warn = true; 1295 return; 1296 } 1297 1298 max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 1299 spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); 1300 /* NVMe driver may add additional entries based on 1301 * stripe size and maximum transfer size, we assume 1302 * 1 more entry be used for stripe. 1303 */ 1304 entries = (g_io_size_bytes - 1) / max_xfer_size + 2; 1305 if ((g_queue_depth * entries) > opts.io_queue_size) { 1306 printf("Controller IO queue size %u, less than required.\n", 1307 opts.io_queue_size); 1308 printf("Consider using lower queue depth or smaller IO size, because " 1309 "IO requests may be queued at the NVMe driver.\n"); 1310 } 1311 /* For requests which have children requests, parent request itself 1312 * will also occupy 1 entry. 1313 */ 1314 entries += 1; 1315 1316 entry = calloc(1, sizeof(struct ns_entry)); 1317 if (entry == NULL) { 1318 perror("ns_entry malloc"); 1319 exit(1); 1320 } 1321 1322 entry->type = ENTRY_TYPE_NVME_NS; 1323 entry->fn_table = &nvme_fn_table; 1324 entry->u.nvme.ctrlr = ctrlr; 1325 entry->u.nvme.ns = ns; 1326 entry->num_io_requests = entries * spdk_divide_round_up(g_queue_depth, g_nr_io_queues_per_ns); 1327 1328 entry->size_in_ios = ns_size / g_io_size_bytes; 1329 entry->io_size_blocks = g_io_size_bytes / sector_size; 1330 1331 if (g_is_random) { 1332 entry->seed = rand(); 1333 if (g_zipf_theta > 0) { 1334 entry->zipf = spdk_zipf_create(entry->size_in_ios, g_zipf_theta, 0); 1335 } 1336 } 1337 1338 entry->block_size = spdk_nvme_ns_get_extended_sector_size(ns); 1339 entry->md_size = spdk_nvme_ns_get_md_size(ns); 1340 entry->md_interleave = spdk_nvme_ns_supports_extended_lba(ns); 1341 entry->pi_loc = spdk_nvme_ns_get_data(ns)->dps.md_start; 1342 entry->pi_type = spdk_nvme_ns_get_pi_type(ns); 1343 1344 if (spdk_nvme_ns_get_flags(ns) & SPDK_NVME_NS_DPS_PI_SUPPORTED) { 1345 entry->io_flags = g_metacfg_pract_flag | g_metacfg_prchk_flags; 1346 } 1347 1348 /* If metadata size = 8 bytes, PI is stripped (read) or inserted (write), 1349 * and so reduce metadata size from block size. (If metadata size > 8 bytes, 1350 * PI is passed (read) or replaced (write). So block size is not necessary 1351 * to change.) 1352 */ 1353 if ((entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT) && (entry->md_size == 8)) { 1354 entry->block_size = spdk_nvme_ns_get_sector_size(ns); 1355 } 1356 1357 if (g_io_size_bytes % entry->block_size != 0) { 1358 printf("WARNING: IO size %u (-o) is not a multiple of nsid %u sector size %u." 
1359 " Removing this ns from test\n", g_io_size_bytes, spdk_nvme_ns_get_id(ns), entry->block_size); 1360 g_warn = true; 1361 spdk_zipf_free(&entry->zipf); 1362 free(entry); 1363 return; 1364 } 1365 1366 if (g_max_io_md_size < entry->md_size) { 1367 g_max_io_md_size = entry->md_size; 1368 } 1369 1370 if (g_max_io_size_blocks < entry->io_size_blocks) { 1371 g_max_io_size_blocks = entry->io_size_blocks; 1372 } 1373 1374 build_nvme_ns_name(entry->name, sizeof(entry->name), ctrlr, spdk_nvme_ns_get_id(ns)); 1375 1376 g_num_namespaces++; 1377 TAILQ_INSERT_TAIL(&g_namespaces, entry, link); 1378 } 1379 1380 static void 1381 unregister_namespaces(void) 1382 { 1383 struct ns_entry *entry, *tmp; 1384 1385 TAILQ_FOREACH_SAFE(entry, &g_namespaces, link, tmp) { 1386 TAILQ_REMOVE(&g_namespaces, entry, link); 1387 spdk_zipf_free(&entry->zipf); 1388 if (g_use_uring) { 1389 #ifdef SPDK_CONFIG_URING 1390 close(entry->u.uring.fd); 1391 #endif 1392 } else { 1393 #if HAVE_LIBAIO 1394 close(entry->u.aio.fd); 1395 #endif 1396 } 1397 free(entry); 1398 } 1399 } 1400 1401 static void 1402 enable_latency_tracking_complete(void *cb_arg, const struct spdk_nvme_cpl *cpl) 1403 { 1404 if (spdk_nvme_cpl_is_error(cpl)) { 1405 printf("enable_latency_tracking_complete failed\n"); 1406 } 1407 g_outstanding_commands--; 1408 } 1409 1410 static void 1411 set_latency_tracking_feature(struct spdk_nvme_ctrlr *ctrlr, bool enable) 1412 { 1413 int res; 1414 union spdk_nvme_intel_feat_latency_tracking latency_tracking; 1415 1416 if (enable) { 1417 latency_tracking.bits.enable = 0x01; 1418 } else { 1419 latency_tracking.bits.enable = 0x00; 1420 } 1421 1422 res = spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING, 1423 latency_tracking.raw, 0, NULL, 0, enable_latency_tracking_complete, NULL); 1424 if (res) { 1425 printf("fail to allocate nvme request.\n"); 1426 return; 1427 } 1428 g_outstanding_commands++; 1429 1430 while (g_outstanding_commands) { 1431 spdk_nvme_ctrlr_process_admin_completions(ctrlr); 1432 } 1433 } 1434 1435 static void 1436 register_ctrlr(struct spdk_nvme_ctrlr *ctrlr, struct trid_entry *trid_entry) 1437 { 1438 struct spdk_nvme_ns *ns; 1439 struct ctrlr_entry *entry = malloc(sizeof(struct ctrlr_entry)); 1440 uint32_t nsid; 1441 1442 if (entry == NULL) { 1443 perror("ctrlr_entry malloc"); 1444 exit(1); 1445 } 1446 1447 entry->latency_page = spdk_dma_zmalloc(sizeof(struct spdk_nvme_intel_rw_latency_page), 1448 4096, NULL); 1449 if (entry->latency_page == NULL) { 1450 printf("Allocation error (latency page)\n"); 1451 exit(1); 1452 } 1453 1454 build_nvme_name(entry->name, sizeof(entry->name), ctrlr); 1455 1456 entry->ctrlr = ctrlr; 1457 entry->trtype = trid_entry->trid.trtype; 1458 TAILQ_INSERT_TAIL(&g_controllers, entry, link); 1459 1460 if (g_latency_ssd_tracking_enable && 1461 spdk_nvme_ctrlr_is_feature_supported(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) { 1462 set_latency_tracking_feature(ctrlr, true); 1463 } 1464 1465 if (trid_entry->nsid == 0) { 1466 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 1467 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 1468 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); 1469 if (ns == NULL) { 1470 continue; 1471 } 1472 register_ns(ctrlr, ns); 1473 } 1474 } else { 1475 ns = spdk_nvme_ctrlr_get_ns(ctrlr, trid_entry->nsid); 1476 if (!ns) { 1477 perror("Namespace does not exist."); 1478 exit(1); 1479 } 1480 1481 register_ns(ctrlr, ns); 1482 } 1483 } 1484 1485 static inline void 1486 submit_single_io(struct perf_task *task) 1487 { 1488 
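/* Pick the next offset (zipf, uniform random, or sequential with wrap-around), choose read vs. write from g_rw_percentage, and hand the task to the backend's submit_io() callback. */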
uint64_t offset_in_ios; 1489 int rc; 1490 struct ns_worker_ctx *ns_ctx = task->ns_ctx; 1491 struct ns_entry *entry = ns_ctx->entry; 1492 1493 assert(!ns_ctx->is_draining); 1494 1495 if (entry->zipf) { 1496 offset_in_ios = spdk_zipf_generate(entry->zipf); 1497 } else if (g_is_random) { 1498 offset_in_ios = rand_r(&entry->seed) % entry->size_in_ios; 1499 } else { 1500 offset_in_ios = ns_ctx->offset_in_ios++; 1501 if (ns_ctx->offset_in_ios == entry->size_in_ios) { 1502 ns_ctx->offset_in_ios = 0; 1503 } 1504 } 1505 1506 task->submit_tsc = spdk_get_ticks(); 1507 1508 if ((g_rw_percentage == 100) || 1509 (g_rw_percentage != 0 && ((rand_r(&entry->seed) % 100) < g_rw_percentage))) { 1510 task->is_read = true; 1511 } else { 1512 task->is_read = false; 1513 } 1514 1515 rc = entry->fn_table->submit_io(task, ns_ctx, entry, offset_in_ios); 1516 1517 if (spdk_unlikely(rc != 0)) { 1518 if (g_continue_on_error) { 1519 /* We can't just resubmit here or we can get in a loop that 1520 * stack overflows. */ 1521 TAILQ_INSERT_TAIL(&ns_ctx->queued_tasks, task, link); 1522 } else { 1523 RATELIMIT_LOG("starting I/O failed: %d\n", rc); 1524 spdk_dma_free(task->iovs[0].iov_base); 1525 free(task->iovs); 1526 spdk_dma_free(task->md_iov.iov_base); 1527 task->ns_ctx->status = 1; 1528 free(task); 1529 } 1530 } else { 1531 ns_ctx->current_queue_depth++; 1532 ns_ctx->stats.io_submitted++; 1533 } 1534 1535 if (spdk_unlikely(g_number_ios && ns_ctx->stats.io_submitted >= g_number_ios)) { 1536 ns_ctx->is_draining = true; 1537 } 1538 } 1539 1540 static inline void 1541 task_complete(struct perf_task *task) 1542 { 1543 struct ns_worker_ctx *ns_ctx; 1544 uint64_t tsc_diff; 1545 struct ns_entry *entry; 1546 1547 ns_ctx = task->ns_ctx; 1548 entry = ns_ctx->entry; 1549 ns_ctx->current_queue_depth--; 1550 ns_ctx->stats.io_completed++; 1551 tsc_diff = spdk_get_ticks() - task->submit_tsc; 1552 ns_ctx->stats.total_tsc += tsc_diff; 1553 if (spdk_unlikely(ns_ctx->stats.min_tsc > tsc_diff)) { 1554 ns_ctx->stats.min_tsc = tsc_diff; 1555 } 1556 if (spdk_unlikely(ns_ctx->stats.max_tsc < tsc_diff)) { 1557 ns_ctx->stats.max_tsc = tsc_diff; 1558 } 1559 if (spdk_unlikely(g_latency_sw_tracking_level > 0)) { 1560 spdk_histogram_data_tally(ns_ctx->histogram, tsc_diff); 1561 } 1562 1563 if (spdk_unlikely(entry->md_size > 0)) { 1564 /* add application level verification for end-to-end data protection */ 1565 entry->fn_table->verify_io(task, entry); 1566 } 1567 1568 /* 1569 * is_draining indicates when time has expired or io_submitted exceeded 1570 * g_number_ios for the test run and we are just waiting for the previously 1571 * submitted I/O to complete. In this case, do not submit a new I/O to 1572 * replace the one just completed. 
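* In the draining case the task and its data/metadata buffers are freed here, since nothing will resubmit them.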
1573 */ 1574 if (spdk_unlikely(ns_ctx->is_draining)) { 1575 spdk_dma_free(task->iovs[0].iov_base); 1576 free(task->iovs); 1577 spdk_dma_free(task->md_iov.iov_base); 1578 free(task); 1579 } else { 1580 submit_single_io(task); 1581 } 1582 } 1583 1584 static void 1585 io_complete(void *ctx, const struct spdk_nvme_cpl *cpl) 1586 { 1587 struct perf_task *task = ctx; 1588 1589 if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) { 1590 if (task->is_read) { 1591 RATELIMIT_LOG("Read completed with error (sct=%d, sc=%d)\n", 1592 cpl->status.sct, cpl->status.sc); 1593 } else { 1594 RATELIMIT_LOG("Write completed with error (sct=%d, sc=%d)\n", 1595 cpl->status.sct, cpl->status.sc); 1596 } 1597 if (!g_continue_on_error) { 1598 if (cpl->status.sct == SPDK_NVME_SCT_GENERIC && 1599 cpl->status.sc == SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT) { 1600 /* The namespace was hotplugged. Stop trying to send I/O to it. */ 1601 task->ns_ctx->is_draining = true; 1602 } 1603 1604 task->ns_ctx->status = 1; 1605 } 1606 } 1607 1608 task_complete(task); 1609 } 1610 1611 static struct perf_task * 1612 allocate_task(struct ns_worker_ctx *ns_ctx, int queue_depth) 1613 { 1614 struct perf_task *task; 1615 1616 task = calloc(1, sizeof(*task)); 1617 if (task == NULL) { 1618 fprintf(stderr, "Out of memory allocating tasks\n"); 1619 exit(1); 1620 } 1621 1622 ns_ctx->entry->fn_table->setup_payload(task, queue_depth % 8 + 1); 1623 1624 task->ns_ctx = ns_ctx; 1625 1626 return task; 1627 } 1628 1629 static void 1630 submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth) 1631 { 1632 struct perf_task *task; 1633 1634 while (queue_depth-- > 0) { 1635 task = allocate_task(ns_ctx, queue_depth); 1636 submit_single_io(task); 1637 } 1638 } 1639 1640 static int 1641 init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 1642 { 1643 TAILQ_INIT(&ns_ctx->queued_tasks); 1644 return ns_ctx->entry->fn_table->init_ns_worker_ctx(ns_ctx); 1645 } 1646 1647 static void 1648 cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 1649 { 1650 struct perf_task *task, *ttask; 1651 1652 TAILQ_FOREACH_SAFE(task, &ns_ctx->queued_tasks, link, ttask) { 1653 TAILQ_REMOVE(&ns_ctx->queued_tasks, task, link); 1654 task_complete(task); 1655 } 1656 ns_ctx->entry->fn_table->cleanup_ns_worker_ctx(ns_ctx); 1657 } 1658 1659 static void 1660 print_periodic_performance(bool warmup) 1661 { 1662 uint64_t io_this_second; 1663 double mb_this_second; 1664 struct worker_thread *worker; 1665 struct ns_worker_ctx *ns_ctx; 1666 uint64_t busy_tsc; 1667 uint64_t idle_tsc; 1668 uint64_t core_busy_tsc = 0; 1669 uint64_t core_idle_tsc = 0; 1670 double core_busy_perc = 0; 1671 1672 if (!isatty(STDOUT_FILENO)) { 1673 /* Don't print periodic stats if output is not going 1674 * to a terminal. 
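* The stats line below ends with a carriage return so it is redrawn in place, which only renders sensibly on a TTY.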
1675 */ 1676 return; 1677 } 1678 io_this_second = 0; 1679 TAILQ_FOREACH(worker, &g_workers, link) { 1680 busy_tsc = 0; 1681 idle_tsc = 0; 1682 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1683 io_this_second += ns_ctx->stats.io_completed - ns_ctx->stats.last_io_completed; 1684 ns_ctx->stats.last_io_completed = ns_ctx->stats.io_completed; 1685 1686 if (g_monitor_perf_cores) { 1687 busy_tsc += ns_ctx->stats.busy_tsc - ns_ctx->stats.last_busy_tsc; 1688 idle_tsc += ns_ctx->stats.idle_tsc - ns_ctx->stats.last_idle_tsc; 1689 ns_ctx->stats.last_busy_tsc = ns_ctx->stats.busy_tsc; 1690 ns_ctx->stats.last_idle_tsc = ns_ctx->stats.idle_tsc; 1691 } 1692 } 1693 if (g_monitor_perf_cores) { 1694 core_busy_tsc += busy_tsc; 1695 core_idle_tsc += idle_tsc; 1696 } 1697 } 1698 mb_this_second = (double)io_this_second * g_io_size_bytes / (1024 * 1024); 1699 1700 printf("%s%9ju IOPS, %8.2f MiB/s", warmup ? "[warmup] " : "", io_this_second, mb_this_second); 1701 if (g_monitor_perf_cores) { 1702 core_busy_perc = (double)core_busy_tsc / (core_idle_tsc + core_busy_tsc) * 100; 1703 printf("%3d Core(s): %6.2f%% Busy", g_num_workers, core_busy_perc); 1704 } 1705 printf("\r"); 1706 fflush(stdout); 1707 } 1708 1709 static void 1710 perf_dump_transport_statistics(struct worker_thread *worker) 1711 { 1712 struct ns_worker_ctx *ns_ctx; 1713 1714 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1715 if (ns_ctx->entry->fn_table->dump_transport_stats) { 1716 ns_ctx->entry->fn_table->dump_transport_stats(worker->lcore, ns_ctx); 1717 } 1718 } 1719 } 1720 1721 static int 1722 work_fn(void *arg) 1723 { 1724 uint64_t tsc_start, tsc_end, tsc_current, tsc_next_print; 1725 struct worker_thread *worker = (struct worker_thread *) arg; 1726 struct ns_worker_ctx *ns_ctx = NULL; 1727 uint32_t unfinished_ns_ctx; 1728 bool warmup = false; 1729 int rc; 1730 int64_t check_rc; 1731 uint64_t check_now; 1732 TAILQ_HEAD(, perf_task) swap; 1733 struct perf_task *task; 1734 1735 /* Allocate queue pairs for each namespace. */ 1736 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1737 if (init_ns_worker_ctx(ns_ctx) != 0) { 1738 printf("ERROR: init_ns_worker_ctx() failed\n"); 1739 /* Wait on barrier to avoid blocking of successful workers */ 1740 pthread_barrier_wait(&g_worker_sync_barrier); 1741 ns_ctx->status = 1; 1742 return 1; 1743 } 1744 } 1745 1746 rc = pthread_barrier_wait(&g_worker_sync_barrier); 1747 if (rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD) { 1748 printf("ERROR: failed to wait on thread sync barrier\n"); 1749 ns_ctx->status = 1; 1750 return 1; 1751 } 1752 1753 tsc_start = spdk_get_ticks(); 1754 tsc_current = tsc_start; 1755 tsc_next_print = tsc_current + g_tsc_rate; 1756 1757 if (g_warmup_time_in_sec) { 1758 warmup = true; 1759 tsc_end = tsc_current + g_warmup_time_in_sec * g_tsc_rate; 1760 } else { 1761 tsc_end = tsc_current + g_time_in_sec * g_tsc_rate; 1762 } 1763 1764 /* Submit initial I/O for each namespace. */ 1765 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1766 submit_io(ns_ctx, g_queue_depth); 1767 } 1768 1769 while (spdk_likely(!g_exit)) { 1770 bool all_draining = true; 1771 1772 /* 1773 * Check for completed I/O for each controller. A new 1774 * I/O will be submitted in the io_complete callback 1775 * to replace each I/O that is completed. 
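* This keeps each namespace at a steady queue depth of g_queue_depth until draining starts.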
1776 */ 1777 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1778 if (g_continue_on_error && !ns_ctx->is_draining) { 1779 /* Submit any I/O that is queued up */ 1780 TAILQ_INIT(&swap); 1781 TAILQ_SWAP(&swap, &ns_ctx->queued_tasks, perf_task, link); 1782 while (!TAILQ_EMPTY(&swap)) { 1783 task = TAILQ_FIRST(&swap); 1784 TAILQ_REMOVE(&swap, task, link); 1785 if (ns_ctx->is_draining) { 1786 TAILQ_INSERT_TAIL(&ns_ctx->queued_tasks, 1787 task, link); 1788 continue; 1789 } 1790 submit_single_io(task); 1791 } 1792 } 1793 1794 check_now = spdk_get_ticks(); 1795 check_rc = ns_ctx->entry->fn_table->check_io(ns_ctx); 1796 1797 if (check_rc > 0) { 1798 ns_ctx->stats.busy_tsc += check_now - ns_ctx->stats.last_tsc; 1799 } else { 1800 ns_ctx->stats.idle_tsc += check_now - ns_ctx->stats.last_tsc; 1801 } 1802 ns_ctx->stats.last_tsc = check_now; 1803 1804 if (!ns_ctx->is_draining) { 1805 all_draining = false; 1806 } 1807 } 1808 1809 if (spdk_unlikely(all_draining)) { 1810 break; 1811 } 1812 1813 tsc_current = spdk_get_ticks(); 1814 1815 if (worker->lcore == g_main_core && tsc_current > tsc_next_print) { 1816 tsc_next_print += g_tsc_rate; 1817 print_periodic_performance(warmup); 1818 } 1819 1820 if (tsc_current > tsc_end) { 1821 if (warmup) { 1822 /* Update test start and end time, clear statistics */ 1823 tsc_start = spdk_get_ticks(); 1824 tsc_end = tsc_start + g_time_in_sec * g_tsc_rate; 1825 1826 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1827 memset(&ns_ctx->stats, 0, sizeof(ns_ctx->stats)); 1828 ns_ctx->stats.min_tsc = UINT64_MAX; 1829 spdk_histogram_data_reset(ns_ctx->histogram); 1830 } 1831 1832 if (worker->lcore == g_main_core && isatty(STDOUT_FILENO)) { 1833 /* warmup stage prints a longer string to stdout, need to erase it */ 1834 printf("%c[2K", 27); 1835 } 1836 1837 warmup = false; 1838 } else { 1839 break; 1840 } 1841 } 1842 } 1843 1844 /* Capture the actual elapsed time when we break out of the main loop. This will account 1845 * for cases where we exit prematurely due to a signal. We only need to capture it on 1846 * one core, so use the main core. 
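* print_performance() later divides per-namespace completion counts by g_elapsed_time_in_usec to report IOPS and bandwidth.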
1847 */ 1848 if (worker->lcore == g_main_core) { 1849 g_elapsed_time_in_usec = (tsc_current - tsc_start) * SPDK_SEC_TO_USEC / g_tsc_rate; 1850 } 1851 1852 /* drain the io of each ns_ctx in round robin to make the fairness */ 1853 do { 1854 unfinished_ns_ctx = 0; 1855 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1856 /* first time will enter into this if case */ 1857 if (!ns_ctx->is_draining) { 1858 ns_ctx->is_draining = true; 1859 } 1860 1861 if (ns_ctx->current_queue_depth > 0) { 1862 ns_ctx->entry->fn_table->check_io(ns_ctx); 1863 if (ns_ctx->current_queue_depth > 0) { 1864 unfinished_ns_ctx++; 1865 } 1866 } 1867 } 1868 } while (unfinished_ns_ctx > 0); 1869 1870 if (g_dump_transport_stats) { 1871 pthread_mutex_lock(&g_stats_mutex); 1872 perf_dump_transport_statistics(worker); 1873 pthread_mutex_unlock(&g_stats_mutex); 1874 } 1875 1876 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1877 cleanup_ns_worker_ctx(ns_ctx); 1878 } 1879 1880 return 0; 1881 } 1882 1883 static void 1884 usage(char *program_name) 1885 { 1886 printf("%s options", program_name); 1887 #if defined(SPDK_CONFIG_URING) || defined(HAVE_LIBAIO) 1888 printf(" [Kernel device(s)]..."); 1889 #endif 1890 printf("\n\n"); 1891 printf("==== BASIC OPTIONS ====\n\n"); 1892 printf("\t-q, --io-depth <val> io depth\n"); 1893 printf("\t-o, --io-size <val> io size in bytes\n"); 1894 printf("\t-w, --io-pattern <pattern> io pattern type, must be one of\n"); 1895 printf("\t\t(read, write, randread, randwrite, rw, randrw)\n"); 1896 printf("\t-M, --rwmixread <0-100> rwmixread (100 for reads, 0 for writes)\n"); 1897 printf("\t-t, --time <sec> time in seconds\n"); 1898 printf("\t-a, --warmup-time <sec> warmup time in seconds\n"); 1899 printf("\t-c, --core-mask <mask> core mask for I/O submission/completion.\n"); 1900 printf("\t\t(default: 1)\n"); 1901 printf("\t-r, --transport <fmt> Transport ID for local PCIe NVMe or NVMeoF\n"); 1902 printf("\t\t Format: 'key:value [key:value] ...'\n"); 1903 printf("\t\t Keys:\n"); 1904 printf("\t\t trtype Transport type (e.g. PCIe, RDMA)\n"); 1905 printf("\t\t adrfam Address family (e.g. IPv4, IPv6)\n"); 1906 printf("\t\t traddr Transport address (e.g. 0000:04:00.0 for PCIe or 192.168.100.8 for RDMA)\n"); 1907 printf("\t\t trsvcid Transport service identifier (e.g. 4420)\n"); 1908 printf("\t\t subnqn Subsystem NQN (default: %s)\n", SPDK_NVMF_DISCOVERY_NQN); 1909 printf("\t\t ns NVMe namespace ID (all active namespaces are used by default)\n"); 1910 printf("\t\t hostnqn Host NQN\n"); 1911 printf("\t\t Example: -r 'trtype:PCIe traddr:0000:04:00.0' for PCIe or\n"); 1912 printf("\t\t -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420' for NVMeoF\n"); 1913 printf("\t\t Note: can be specified multiple times to test multiple disks/targets.\n"); 1914 printf("\n"); 1915 1916 printf("==== ADVANCED OPTIONS ====\n\n"); 1917 printf("\t--use-every-core for each namespace, I/Os are submitted from all cores\n"); 1918 printf("\t--io-queue-size <val> size of NVMe IO queue. Default: maximum allowed by controller\n"); 1919 printf("\t-O, --io-unit-size io unit size in bytes (4-byte aligned) for SPDK driver. default: same as io size\n"); 1920 printf("\t-P, --num-qpairs <val> number of io queues per namespace. default: 1\n"); 1921 printf("\t-U, --num-unused-qpairs <val> number of unused io queues per controller. default: 0\n"); 1922 printf("\t-A, --buffer-alignment IO buffer alignment. 
Must be power of 2 and not less than cache line (%u)\n", 1923 SPDK_CACHE_LINE_SIZE); 1924 printf("\t-s, --hugemem-size <MB> DPDK huge memory size in MB.\n"); 1925 printf("\t-g, --mem-single-seg use single file descriptor for DPDK memory segments\n"); 1926 printf("\t-C, --max-completion-per-poll <val> max completions per poll\n"); 1927 printf("\t\t(default: 0 - unlimited)\n"); 1928 printf("\t-i, --shmem-grp-id <id> shared memory group ID\n"); 1929 printf("\t-d, --number-ios <val> number of I/O to perform per thread on each namespace. Note: this is additional exit criteria.\n"); 1930 printf("\t\t(default: 0 - unlimited)\n"); 1931 printf("\t-e, --metadata <fmt> metadata configuration\n"); 1932 printf("\t\t Keys:\n"); 1933 printf("\t\t PRACT Protection Information Action bit (PRACT=1 or PRACT=0)\n"); 1934 printf("\t\t PRCHK Control of Protection Information Checking (PRCHK=GUARD|REFTAG|APPTAG)\n"); 1935 printf("\t\t Example: -e 'PRACT=0,PRCHK=GUARD|REFTAG|APPTAG'\n"); 1936 printf("\t\t -e 'PRACT=1,PRCHK=GUARD'\n"); 1937 printf("\t-F, --zipf <theta> use zipf distribution for random I/O\n"); 1938 #ifdef SPDK_CONFIG_URING 1939 printf("\t-R, --enable-uring enable using liburing to drive kernel devices (Default: libaio)\n"); 1940 #endif 1941 printf("\t--iova-mode <mode> specify DPDK IOVA mode: va|pa\n"); 1942 printf("\t--no-huge, SPDK is run without hugepages\n"); 1943 printf("\n"); 1944 1945 printf("==== PCIe OPTIONS ====\n\n"); 1946 printf("\t-b, --allowed-pci-addr <addr> allowed local PCIe device address\n"); 1947 printf("\t\t Example: -b 0000:d8:00.0 -b 0000:d9:00.0\n"); 1948 printf("\t-V, --enable-vmd enable VMD enumeration\n"); 1949 printf("\t-D, --disable-sq-cmb disable submission queue in controller memory buffer, default: enabled\n"); 1950 printf("\n"); 1951 1952 printf("==== TCP OPTIONS ====\n\n"); 1953 printf("\t-S, --default-sock-impl <impl> set the default sock impl, e.g. \"posix\"\n"); 1954 printf("\t--disable-ktls disable Kernel TLS. Only valid for ssl impl. Default for ssl impl\n"); 1955 printf("\t--enable-ktls enable Kernel TLS. Only valid for ssl impl\n"); 1956 printf("\t--tls-version <val> TLS version to use. Only valid for ssl impl. Default: 0 (auto-negotiation)\n"); 1957 printf("\t--psk-path <val> Path to PSK file (only applies when sock_impl == ssl)\n"); 1958 printf("\t--psk-identity <val> Default PSK ID, e.g. psk.spdk.io (only applies when sock_impl == ssl)\n"); 1959 printf("\t--zerocopy-threshold <val> data is sent with MSG_ZEROCOPY if size is greater than this val. Default: 0 to disable it\n"); 1960 printf("\t--zerocopy-threshold-sock-impl <impl> specify the sock implementation to set zerocopy_threshold\n"); 1961 printf("\t-z, --disable-zcopy <impl> disable zero copy send for the given sock implementation. Default for posix impl\n"); 1962 printf("\t-Z, --enable-zcopy <impl> enable zero copy send for the given sock implementation\n"); 1963 printf("\t-k, --keepalive <ms> keep alive timeout period in millisecond\n"); 1964 printf("\t-H, --enable-tcp-hdgst enable header digest for TCP transport, default: disabled\n"); 1965 printf("\t-I, --enable-tcp-ddgst enable data digest for TCP transport, default: disabled\n"); 1966 printf("\n"); 1967 1968 printf("==== RDMA OPTIONS ====\n\n"); 1969 printf("\t--transport-tos <val> specify the type of service for RDMA transport. Default: 0 (disabled)\n"); 1970 printf("\t--rdma-srq-size <val> The size of a shared rdma receive queue. 
Default: 0 (disabled)\n"); 1971 printf("\t-k, --keepalive <ms> keep alive timeout period in millisecond\n"); 1972 printf("\n"); 1973 1974 printf("==== LOGGING ====\n\n"); 1975 printf("\t-L, --enable-sw-latency-tracking enable latency tracking via sw, default: disabled\n"); 1976 printf("\t\t-L for latency summary, -LL for detailed histogram\n"); 1977 printf("\t-l, --enable-ssd-latency-tracking enable latency tracking via ssd (if supported), default: disabled\n"); 1978 printf("\t-N, --no-shst-notification no shutdown notification process for controllers, default: disabled\n"); 1979 printf("\t-Q, --continue-on-error <val> Do not stop on error. Log I/O errors every N times (default: 1)\n"); 1980 spdk_log_usage(stdout, "\t-T"); 1981 printf("\t-m, --cpu-usage display real-time overall cpu usage on used cores\n"); 1982 #ifdef DEBUG 1983 printf("\t-G, --enable-debug enable debug logging\n"); 1984 #else 1985 printf("\t-G, --enable-debug enable debug logging (flag disabled, must reconfigure with --enable-debug)\n"); 1986 #endif 1987 printf("\t--transport-stats dump transport statistics\n"); 1988 printf("\n\n"); 1989 } 1990 1991 static void 1992 check_cutoff(void *ctx, uint64_t start, uint64_t end, uint64_t count, 1993 uint64_t total, uint64_t so_far) 1994 { 1995 double so_far_pct; 1996 double **cutoff = ctx; 1997 1998 if (count == 0) { 1999 return; 2000 } 2001 2002 so_far_pct = (double)so_far / total; 2003 while (so_far_pct >= **cutoff && **cutoff > 0) { 2004 printf("%9.5f%% : %9.3fus\n", **cutoff * 100, (double)end * 1000 * 1000 / g_tsc_rate); 2005 (*cutoff)++; 2006 } 2007 } 2008 2009 static void 2010 print_bucket(void *ctx, uint64_t start, uint64_t end, uint64_t count, 2011 uint64_t total, uint64_t so_far) 2012 { 2013 double so_far_pct; 2014 2015 if (count == 0) { 2016 return; 2017 } 2018 2019 so_far_pct = (double)so_far * 100 / total; 2020 printf("%9.3f - %9.3f: %9.4f%% (%9ju)\n", 2021 (double)start * 1000 * 1000 / g_tsc_rate, 2022 (double)end * 1000 * 1000 / g_tsc_rate, 2023 so_far_pct, count); 2024 } 2025 2026 static void 2027 print_performance(void) 2028 { 2029 uint64_t total_io_completed, total_io_tsc; 2030 double io_per_second, mb_per_second, average_latency, min_latency, max_latency; 2031 double sum_ave_latency, min_latency_so_far, max_latency_so_far; 2032 double total_io_per_second, total_mb_per_second; 2033 int ns_count; 2034 struct worker_thread *worker; 2035 struct ns_worker_ctx *ns_ctx; 2036 uint32_t max_strlen; 2037 2038 total_io_per_second = 0; 2039 total_mb_per_second = 0; 2040 total_io_completed = 0; 2041 total_io_tsc = 0; 2042 min_latency_so_far = (double)UINT64_MAX; 2043 max_latency_so_far = 0; 2044 ns_count = 0; 2045 2046 max_strlen = 0; 2047 TAILQ_FOREACH(worker, &g_workers, link) { 2048 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 2049 max_strlen = spdk_max(strlen(ns_ctx->entry->name), max_strlen); 2050 } 2051 } 2052 2053 printf("========================================================\n"); 2054 printf("%*s\n", max_strlen + 60, "Latency(us)"); 2055 printf("%-*s: %10s %10s %10s %10s %10s\n", 2056 max_strlen + 13, "Device Information", "IOPS", "MiB/s", "Average", "min", "max"); 2057 2058 TAILQ_FOREACH(worker, &g_workers, link) { 2059 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 2060 if (ns_ctx->stats.io_completed != 0) { 2061 io_per_second = (double)ns_ctx->stats.io_completed * 1000 * 1000 / g_elapsed_time_in_usec; 2062 mb_per_second = io_per_second * g_io_size_bytes / (1024 * 1024); 2063 average_latency = ((double)ns_ctx->stats.total_tsc / ns_ctx->stats.io_completed) * 1000 
* 1000 / 2064 g_tsc_rate; 2065 min_latency = (double)ns_ctx->stats.min_tsc * 1000 * 1000 / g_tsc_rate; 2066 if (min_latency < min_latency_so_far) { 2067 min_latency_so_far = min_latency; 2068 } 2069 2070 max_latency = (double)ns_ctx->stats.max_tsc * 1000 * 1000 / g_tsc_rate; 2071 if (max_latency > max_latency_so_far) { 2072 max_latency_so_far = max_latency; 2073 } 2074 2075 printf("%-*.*s from core %2u: %10.2f %10.2f %10.2f %10.2f %10.2f\n", 2076 max_strlen, max_strlen, ns_ctx->entry->name, worker->lcore, 2077 io_per_second, mb_per_second, 2078 average_latency, min_latency, max_latency); 2079 total_io_per_second += io_per_second; 2080 total_mb_per_second += mb_per_second; 2081 total_io_completed += ns_ctx->stats.io_completed; 2082 total_io_tsc += ns_ctx->stats.total_tsc; 2083 ns_count++; 2084 } 2085 } 2086 } 2087 2088 if (ns_count != 0 && total_io_completed) { 2089 sum_ave_latency = ((double)total_io_tsc / total_io_completed) * 1000 * 1000 / g_tsc_rate; 2090 printf("========================================================\n"); 2091 printf("%-*s: %10.2f %10.2f %10.2f %10.2f %10.2f\n", 2092 max_strlen + 13, "Total", total_io_per_second, total_mb_per_second, 2093 sum_ave_latency, min_latency_so_far, max_latency_so_far); 2094 printf("\n"); 2095 } 2096 2097 if (g_latency_sw_tracking_level == 0 || total_io_completed == 0) { 2098 return; 2099 } 2100 2101 TAILQ_FOREACH(worker, &g_workers, link) { 2102 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 2103 const double *cutoff = g_latency_cutoffs; 2104 2105 printf("Summary latency data for %-43.43s from core %u:\n", ns_ctx->entry->name, worker->lcore); 2106 printf("=================================================================================\n"); 2107 2108 spdk_histogram_data_iterate(ns_ctx->histogram, check_cutoff, &cutoff); 2109 2110 printf("\n"); 2111 } 2112 } 2113 2114 if (g_latency_sw_tracking_level == 1) { 2115 return; 2116 } 2117 2118 TAILQ_FOREACH(worker, &g_workers, link) { 2119 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 2120 printf("Latency histogram for %-43.43s from core %u:\n", ns_ctx->entry->name, worker->lcore); 2121 printf("==============================================================================\n"); 2122 printf(" Range in us Cumulative IO count\n"); 2123 2124 spdk_histogram_data_iterate(ns_ctx->histogram, print_bucket, NULL); 2125 printf("\n"); 2126 } 2127 } 2128 2129 } 2130 2131 static void 2132 print_latency_page(struct ctrlr_entry *entry) 2133 { 2134 int i; 2135 2136 printf("\n"); 2137 printf("%s\n", entry->name); 2138 printf("--------------------------------------------------------\n"); 2139 2140 for (i = 0; i < 32; i++) { 2141 if (entry->latency_page->buckets_32us[i]) { 2142 printf("Bucket %dus - %dus: %d\n", i * 32, (i + 1) * 32, entry->latency_page->buckets_32us[i]); 2143 } 2144 } 2145 for (i = 0; i < 31; i++) { 2146 if (entry->latency_page->buckets_1ms[i]) { 2147 printf("Bucket %dms - %dms: %d\n", i + 1, i + 2, entry->latency_page->buckets_1ms[i]); 2148 } 2149 } 2150 for (i = 0; i < 31; i++) { 2151 if (entry->latency_page->buckets_32ms[i]) 2152 printf("Bucket %dms - %dms: %d\n", (i + 1) * 32, (i + 2) * 32, 2153 entry->latency_page->buckets_32ms[i]); 2154 } 2155 } 2156 2157 static void 2158 print_latency_statistics(const char *op_name, enum spdk_nvme_intel_log_page log_page) 2159 { 2160 struct ctrlr_entry *ctrlr; 2161 2162 printf("%s Latency Statistics:\n", op_name); 2163 printf("========================================================\n"); 2164 TAILQ_FOREACH(ctrlr, &g_controllers, link) { 2165 if 
(spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) { 2166 if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr->ctrlr, log_page, SPDK_NVME_GLOBAL_NS_TAG, 2167 ctrlr->latency_page, sizeof(struct spdk_nvme_intel_rw_latency_page), 0, 2168 enable_latency_tracking_complete, 2169 NULL)) { 2170 printf("nvme_ctrlr_cmd_get_log_page() failed\n"); 2171 exit(1); 2172 } 2173 2174 g_outstanding_commands++; 2175 } else { 2176 printf("Controller %s: %s latency statistics not supported\n", ctrlr->name, op_name); 2177 } 2178 } 2179 2180 while (g_outstanding_commands) { 2181 TAILQ_FOREACH(ctrlr, &g_controllers, link) { 2182 spdk_nvme_ctrlr_process_admin_completions(ctrlr->ctrlr); 2183 } 2184 } 2185 2186 TAILQ_FOREACH(ctrlr, &g_controllers, link) { 2187 if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) { 2188 print_latency_page(ctrlr); 2189 } 2190 } 2191 printf("\n"); 2192 } 2193 2194 static void 2195 print_stats(void) 2196 { 2197 print_performance(); 2198 if (g_latency_ssd_tracking_enable) { 2199 if (g_rw_percentage != 0) { 2200 print_latency_statistics("Read", SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY); 2201 } 2202 if (g_rw_percentage != 100) { 2203 print_latency_statistics("Write", SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY); 2204 } 2205 } 2206 } 2207 2208 static void 2209 unregister_trids(void) 2210 { 2211 struct trid_entry *trid_entry, *tmp; 2212 2213 TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, tmp) { 2214 TAILQ_REMOVE(&g_trid_list, trid_entry, tailq); 2215 free(trid_entry); 2216 } 2217 } 2218 2219 static int 2220 add_trid(const char *trid_str) 2221 { 2222 struct trid_entry *trid_entry; 2223 struct spdk_nvme_transport_id *trid; 2224 char *ns; 2225 char *hostnqn; 2226 2227 trid_entry = calloc(1, sizeof(*trid_entry)); 2228 if (trid_entry == NULL) { 2229 return -1; 2230 } 2231 2232 trid = &trid_entry->trid; 2233 trid->trtype = SPDK_NVME_TRANSPORT_PCIE; 2234 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 2235 2236 if (spdk_nvme_transport_id_parse(trid, trid_str) != 0) { 2237 fprintf(stderr, "Invalid transport ID format '%s'\n", trid_str); 2238 free(trid_entry); 2239 return 1; 2240 } 2241 2242 ns = strcasestr(trid_str, "ns:"); 2243 if (ns) { 2244 char nsid_str[6]; /* 5 digits maximum in an nsid */ 2245 int len; 2246 int nsid; 2247 2248 ns += 3; 2249 2250 len = strcspn(ns, " \t\n"); 2251 if (len > 5) { 2252 fprintf(stderr, "NVMe namespace IDs must be 5 digits or less\n"); 2253 free(trid_entry); 2254 return 1; 2255 } 2256 2257 memcpy(nsid_str, ns, len); 2258 nsid_str[len] = '\0'; 2259 2260 nsid = spdk_strtol(nsid_str, 10); 2261 if (nsid <= 0 || nsid > 65535) { 2262 fprintf(stderr, "NVMe namespace IDs must be less than 65536 and greater than 0\n"); 2263 free(trid_entry); 2264 return 1; 2265 } 2266 2267 trid_entry->nsid = (uint16_t)nsid; 2268 } 2269 2270 hostnqn = strcasestr(trid_str, "hostnqn:"); 2271 if (hostnqn) { 2272 size_t len; 2273 2274 hostnqn += strlen("hostnqn:"); 2275 2276 len = strcspn(hostnqn, " \t\n"); 2277 if (len > (sizeof(trid_entry->hostnqn) - 1)) { 2278 fprintf(stderr, "Host NQN is too long\n"); 2279 free(trid_entry); 2280 return 1; 2281 } 2282 2283 memcpy(trid_entry->hostnqn, hostnqn, len); 2284 trid_entry->hostnqn[len] = '\0'; 2285 } 2286 2287 TAILQ_INSERT_TAIL(&g_trid_list, trid_entry, tailq); 2288 return 0; 2289 } 2290 2291 static int 2292 add_allowed_pci_device(const char *bdf_str, struct spdk_env_opts *env_opts) 2293 { 2294 int rc; 2295 2296 if (env_opts->num_pci_addr >= MAX_ALLOWED_PCI_DEVICE_NUM) { 2297 fprintf(stderr, "Currently we only 
support allowed PCI device num=%d\n", 2298 MAX_ALLOWED_PCI_DEVICE_NUM); 2299 return -1; 2300 } 2301 2302 rc = spdk_pci_addr_parse(&env_opts->pci_allowed[env_opts->num_pci_addr], bdf_str); 2303 if (rc < 0) { 2304 fprintf(stderr, "Failed to parse the given bdf_str=%s\n", bdf_str); 2305 return -1; 2306 } 2307 2308 env_opts->num_pci_addr++; 2309 return 0; 2310 } 2311 2312 static size_t 2313 parse_next_key(const char **str, char *key, char *val, size_t key_buf_size, 2314 size_t val_buf_size) 2315 { 2316 const char *sep; 2317 const char *separator = ", \t\n"; 2318 size_t key_len, val_len; 2319 2320 *str += strspn(*str, separator); 2321 2322 sep = strchr(*str, '='); 2323 if (!sep) { 2324 fprintf(stderr, "Key without '=' separator\n"); 2325 return 0; 2326 } 2327 2328 key_len = sep - *str; 2329 if (key_len >= key_buf_size) { 2330 fprintf(stderr, "Key length %zu is greater than maximum allowed %zu\n", 2331 key_len, key_buf_size - 1); 2332 return 0; 2333 } 2334 2335 memcpy(key, *str, key_len); 2336 key[key_len] = '\0'; 2337 2338 *str += key_len + 1; /* Skip key */ 2339 val_len = strcspn(*str, separator); 2340 if (val_len == 0) { 2341 fprintf(stderr, "Key without value\n"); 2342 return 0; 2343 } 2344 2345 if (val_len >= val_buf_size) { 2346 fprintf(stderr, "Value length %zu is greater than maximum allowed %zu\n", 2347 val_len, val_buf_size - 1); 2348 return 0; 2349 } 2350 2351 memcpy(val, *str, val_len); 2352 val[val_len] = '\0'; 2353 2354 *str += val_len; 2355 2356 return val_len; 2357 } 2358 2359 static int 2360 parse_metadata(const char *metacfg_str) 2361 { 2362 const char *str; 2363 size_t val_len; 2364 char key[32]; 2365 char val[1024]; 2366 2367 if (metacfg_str == NULL) { 2368 return -EINVAL; 2369 } 2370 2371 str = metacfg_str; 2372 2373 while (*str != '\0') { 2374 val_len = parse_next_key(&str, key, val, sizeof(key), sizeof(val)); 2375 if (val_len == 0) { 2376 fprintf(stderr, "Failed to parse metadata\n"); 2377 return -EINVAL; 2378 } 2379 2380 if (strcmp(key, "PRACT") == 0) { 2381 if (*val == '1') { 2382 g_metacfg_pract_flag = SPDK_NVME_IO_FLAGS_PRACT; 2383 } 2384 } else if (strcmp(key, "PRCHK") == 0) { 2385 if (strstr(val, "GUARD") != NULL) { 2386 g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_GUARD; 2387 } 2388 if (strstr(val, "REFTAG") != NULL) { 2389 g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG; 2390 } 2391 if (strstr(val, "APPTAG") != NULL) { 2392 g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_APPTAG; 2393 } 2394 } else { 2395 fprintf(stderr, "Unknown key '%s'\n", key); 2396 } 2397 } 2398 2399 return 0; 2400 } 2401 2402 #define PERF_GETOPT_SHORT "a:b:c:d:e:gi:lmo:q:r:k:s:t:w:z:A:C:DF:GHILM:NO:P:Q:RS:T:U:VZ:" 2403 2404 static const struct option g_perf_cmdline_opts[] = { 2405 #define PERF_WARMUP_TIME 'a' 2406 {"warmup-time", required_argument, NULL, PERF_WARMUP_TIME}, 2407 #define PERF_ALLOWED_PCI_ADDR 'b' 2408 {"allowed-pci-addr", required_argument, NULL, PERF_ALLOWED_PCI_ADDR}, 2409 #define PERF_CORE_MASK 'c' 2410 {"core-mask", required_argument, NULL, PERF_CORE_MASK}, 2411 #define PERF_METADATA 'e' 2412 {"metadata", required_argument, NULL, PERF_METADATA}, 2413 #define PERF_MEM_SINGL_SEG 'g' 2414 {"mem-single-seg", no_argument, NULL, PERF_MEM_SINGL_SEG}, 2415 #define PERF_SHMEM_GROUP_ID 'i' 2416 {"shmem-grp-id", required_argument, NULL, PERF_SHMEM_GROUP_ID}, 2417 #define PERF_ENABLE_SSD_LATENCY_TRACING 'l' 2418 {"enable-ssd-latency-tracking", no_argument, NULL, PERF_ENABLE_SSD_LATENCY_TRACING}, 2419 #define PERF_CPU_USAGE 'm' 2420 {"cpu-usage", no_argument, NULL, 
PERF_CPU_USAGE}, 2421 #define PERF_IO_SIZE 'o' 2422 {"io-size", required_argument, NULL, PERF_IO_SIZE}, 2423 #define PERF_IO_DEPTH 'q' 2424 {"io-depth", required_argument, NULL, PERF_IO_DEPTH}, 2425 #define PERF_TRANSPORT 'r' 2426 {"transport", required_argument, NULL, PERF_TRANSPORT}, 2427 #define PERF_KEEPALIVE 'k' 2428 {"keepalive", required_argument, NULL, PERF_KEEPALIVE}, 2429 #define PERF_HUGEMEM_SIZE 's' 2430 {"hugemem-size", required_argument, NULL, PERF_HUGEMEM_SIZE}, 2431 #define PERF_TIME 't' 2432 {"time", required_argument, NULL, PERF_TIME}, 2433 #define PERF_NUMBER_IOS 'd' 2434 {"number-ios", required_argument, NULL, PERF_NUMBER_IOS}, 2435 #define PERF_IO_PATTERN 'w' 2436 {"io-pattern", required_argument, NULL, PERF_IO_PATTERN}, 2437 #define PERF_DISABLE_ZCOPY 'z' 2438 {"disable-zcopy", required_argument, NULL, PERF_DISABLE_ZCOPY}, 2439 #define PERF_BUFFER_ALIGNMENT 'A' 2440 {"buffer-alignment", required_argument, NULL, PERF_BUFFER_ALIGNMENT}, 2441 #define PERF_MAX_COMPLETIONS_PER_POLL 'C' 2442 {"max-completion-per-poll", required_argument, NULL, PERF_MAX_COMPLETIONS_PER_POLL}, 2443 #define PERF_DISABLE_SQ_CMB 'D' 2444 {"disable-sq-cmb", no_argument, NULL, PERF_DISABLE_SQ_CMB}, 2445 #define PERF_ZIPF 'F' 2446 {"zipf", required_argument, NULL, PERF_ZIPF}, 2447 #define PERF_ENABLE_DEBUG 'G' 2448 {"enable-debug", no_argument, NULL, PERF_ENABLE_DEBUG}, 2449 #define PERF_ENABLE_TCP_HDGST 'H' 2450 {"enable-tcp-hdgst", no_argument, NULL, PERF_ENABLE_TCP_HDGST}, 2451 #define PERF_ENABLE_TCP_DDGST 'I' 2452 {"enable-tcp-ddgst", no_argument, NULL, PERF_ENABLE_TCP_DDGST}, 2453 #define PERF_ENABLE_SW_LATENCY_TRACING 'L' 2454 {"enable-sw-latency-tracking", no_argument, NULL, PERF_ENABLE_SW_LATENCY_TRACING}, 2455 #define PERF_RW_MIXREAD 'M' 2456 {"rwmixread", required_argument, NULL, PERF_RW_MIXREAD}, 2457 #define PERF_NO_SHST_NOTIFICATION 'N' 2458 {"no-shst-notification", no_argument, NULL, PERF_NO_SHST_NOTIFICATION}, 2459 #define PERF_IO_UNIT_SIZE 'O' 2460 {"io-unit-size", required_argument, NULL, PERF_IO_UNIT_SIZE}, 2461 #define PERF_IO_QUEUES_PER_NS 'P' 2462 {"num-qpairs", required_argument, NULL, PERF_IO_QUEUES_PER_NS}, 2463 #define PERF_CONTINUE_ON_ERROR 'Q' 2464 {"continue-on-error", required_argument, NULL, PERF_CONTINUE_ON_ERROR}, 2465 #define PERF_ENABLE_URING 'R' 2466 {"enable-uring", no_argument, NULL, PERF_ENABLE_URING}, 2467 #define PERF_DEFAULT_SOCK_IMPL 'S' 2468 {"default-sock-impl", required_argument, NULL, PERF_DEFAULT_SOCK_IMPL}, 2469 #define PERF_LOG_FLAG 'T' 2470 {"logflag", required_argument, NULL, PERF_LOG_FLAG}, 2471 #define PERF_NUM_UNUSED_IO_QPAIRS 'U' 2472 {"num-unused-qpairs", required_argument, NULL, PERF_NUM_UNUSED_IO_QPAIRS}, 2473 #define PERF_ENABLE_VMD 'V' 2474 {"enable-vmd", no_argument, NULL, PERF_ENABLE_VMD}, 2475 #define PERF_ENABLE_ZCOPY 'Z' 2476 {"enable-zcopy", required_argument, NULL, PERF_ENABLE_ZCOPY}, 2477 #define PERF_TRANSPORT_STATISTICS 257 2478 {"transport-stats", no_argument, NULL, PERF_TRANSPORT_STATISTICS}, 2479 #define PERF_IOVA_MODE 258 2480 {"iova-mode", required_argument, NULL, PERF_IOVA_MODE}, 2481 #define PERF_IO_QUEUE_SIZE 259 2482 {"io-queue-size", required_argument, NULL, PERF_IO_QUEUE_SIZE}, 2483 #define PERF_DISABLE_KTLS 260 2484 {"disable-ktls", no_argument, NULL, PERF_DISABLE_KTLS}, 2485 #define PERF_ENABLE_KTLS 261 2486 {"enable-ktls", no_argument, NULL, PERF_ENABLE_KTLS}, 2487 #define PERF_TLS_VERSION 262 2488 {"tls-version", required_argument, NULL, PERF_TLS_VERSION}, 2489 #define PERF_PSK_PATH 263 2490 {"psk-path", 
required_argument, NULL, PERF_PSK_PATH}, 2491 #define PERF_PSK_IDENTITY 264 2492 {"psk-identity ", required_argument, NULL, PERF_PSK_IDENTITY}, 2493 #define PERF_ZEROCOPY_THRESHOLD 265 2494 {"zerocopy-threshold", required_argument, NULL, PERF_ZEROCOPY_THRESHOLD}, 2495 #define PERF_SOCK_IMPL 266 2496 {"zerocopy-threshold-sock-impl", required_argument, NULL, PERF_SOCK_IMPL}, 2497 #define PERF_TRANSPORT_TOS 267 2498 {"transport-tos", required_argument, NULL, PERF_TRANSPORT_TOS}, 2499 #define PERF_RDMA_SRQ_SIZE 268 2500 {"rdma-srq-size", required_argument, NULL, PERF_RDMA_SRQ_SIZE}, 2501 #define PERF_USE_EVERY_CORE 269 2502 {"use-every-core", no_argument, NULL, PERF_USE_EVERY_CORE}, 2503 #define PERF_NO_HUGE 270 2504 {"no-huge", no_argument, NULL, PERF_NO_HUGE}, 2505 /* Should be the last element */ 2506 {0, 0, 0, 0} 2507 }; 2508 2509 static int 2510 parse_args(int argc, char **argv, struct spdk_env_opts *env_opts) 2511 { 2512 int op, long_idx; 2513 long int val; 2514 uint64_t val_u64; 2515 int rc; 2516 char *endptr; 2517 bool ssl_used = false; 2518 char *sock_impl = "posix"; 2519 2520 while ((op = getopt_long(argc, argv, PERF_GETOPT_SHORT, g_perf_cmdline_opts, &long_idx)) != -1) { 2521 switch (op) { 2522 case PERF_WARMUP_TIME: 2523 case PERF_SHMEM_GROUP_ID: 2524 case PERF_MAX_COMPLETIONS_PER_POLL: 2525 case PERF_IO_QUEUES_PER_NS: 2526 case PERF_IO_DEPTH: 2527 case PERF_KEEPALIVE: 2528 case PERF_TIME: 2529 case PERF_RW_MIXREAD: 2530 case PERF_NUM_UNUSED_IO_QPAIRS: 2531 case PERF_CONTINUE_ON_ERROR: 2532 case PERF_IO_QUEUE_SIZE: 2533 case PERF_RDMA_SRQ_SIZE: 2534 val = spdk_strtol(optarg, 10); 2535 if (val < 0) { 2536 fprintf(stderr, "Converting a string to integer failed\n"); 2537 return val; 2538 } 2539 switch (op) { 2540 case PERF_WARMUP_TIME: 2541 g_warmup_time_in_sec = val; 2542 break; 2543 case PERF_SHMEM_GROUP_ID: 2544 env_opts->shm_id = val; 2545 break; 2546 case PERF_MAX_COMPLETIONS_PER_POLL: 2547 g_max_completions = val; 2548 break; 2549 case PERF_IO_QUEUES_PER_NS: 2550 g_nr_io_queues_per_ns = val; 2551 break; 2552 case PERF_IO_DEPTH: 2553 g_queue_depth = val; 2554 break; 2555 case PERF_KEEPALIVE: 2556 g_keep_alive_timeout_in_ms = val; 2557 break; 2558 case PERF_TIME: 2559 g_time_in_sec = val; 2560 break; 2561 case PERF_RW_MIXREAD: 2562 g_rw_percentage = val; 2563 g_mix_specified = true; 2564 break; 2565 case PERF_CONTINUE_ON_ERROR: 2566 g_quiet_count = val; 2567 g_continue_on_error = true; 2568 break; 2569 case PERF_NUM_UNUSED_IO_QPAIRS: 2570 g_nr_unused_io_queues = val; 2571 break; 2572 case PERF_IO_QUEUE_SIZE: 2573 g_io_queue_size = val; 2574 break; 2575 case PERF_RDMA_SRQ_SIZE: 2576 g_rdma_srq_size = val; 2577 break; 2578 } 2579 break; 2580 case PERF_IO_SIZE: 2581 case PERF_IO_UNIT_SIZE: 2582 case PERF_ZEROCOPY_THRESHOLD: 2583 case PERF_BUFFER_ALIGNMENT: 2584 case PERF_HUGEMEM_SIZE: 2585 case PERF_NUMBER_IOS: 2586 rc = spdk_parse_capacity(optarg, &val_u64, NULL); 2587 if (rc != 0) { 2588 fprintf(stderr, "Converting a string to integer failed\n"); 2589 return 1; 2590 } 2591 switch (op) { 2592 case PERF_IO_SIZE: 2593 g_io_size_bytes = (uint32_t)val_u64; 2594 break; 2595 case PERF_IO_UNIT_SIZE: 2596 g_io_unit_size = (uint32_t)val_u64; 2597 break; 2598 case PERF_ZEROCOPY_THRESHOLD: 2599 g_sock_zcopy_threshold = (uint32_t)val_u64; 2600 break; 2601 case PERF_BUFFER_ALIGNMENT: 2602 g_io_align = (uint32_t)val_u64; 2603 if (!spdk_u32_is_pow2(g_io_align) || g_io_align < SPDK_CACHE_LINE_SIZE) { 2604 fprintf(stderr, "Wrong alignment %u. 
Must be power of 2 and not less than cache line (%u)\n", 2605 g_io_align, SPDK_CACHE_LINE_SIZE); 2606 usage(argv[0]); 2607 return 1; 2608 } 2609 g_io_align_specified = true; 2610 break; 2611 case PERF_HUGEMEM_SIZE: 2612 env_opts->mem_size = (int)val_u64; 2613 break; 2614 case PERF_NUMBER_IOS: 2615 g_number_ios = val_u64; 2616 break; 2617 } 2618 break; 2619 case PERF_ZIPF: 2620 errno = 0; 2621 g_zipf_theta = strtod(optarg, &endptr); 2622 if (errno || optarg == endptr || g_zipf_theta < 0) { 2623 fprintf(stderr, "Illegal zipf theta value %s\n", optarg); 2624 return 1; 2625 } 2626 break; 2627 case PERF_ALLOWED_PCI_ADDR: 2628 if (add_allowed_pci_device(optarg, env_opts)) { 2629 usage(argv[0]); 2630 return 1; 2631 } 2632 break; 2633 case PERF_CORE_MASK: 2634 env_opts->core_mask = optarg; 2635 break; 2636 case PERF_METADATA: 2637 if (parse_metadata(optarg)) { 2638 usage(argv[0]); 2639 return 1; 2640 } 2641 break; 2642 case PERF_MEM_SINGL_SEG: 2643 env_opts->hugepage_single_segments = true; 2644 break; 2645 case PERF_ENABLE_SSD_LATENCY_TRACING: 2646 g_latency_ssd_tracking_enable = true; 2647 break; 2648 case PERF_CPU_USAGE: 2649 g_monitor_perf_cores = true; 2650 break; 2651 case PERF_TRANSPORT: 2652 if (add_trid(optarg)) { 2653 usage(argv[0]); 2654 return 1; 2655 } 2656 break; 2657 case PERF_IO_PATTERN: 2658 g_workload_type = optarg; 2659 break; 2660 case PERF_DISABLE_SQ_CMB: 2661 g_disable_sq_cmb = 1; 2662 break; 2663 case PERF_ENABLE_DEBUG: 2664 #ifndef DEBUG 2665 fprintf(stderr, "%s must be configured with --enable-debug for -G flag\n", 2666 argv[0]); 2667 usage(argv[0]); 2668 return 1; 2669 #else 2670 spdk_log_set_flag("nvme"); 2671 spdk_log_set_print_level(SPDK_LOG_DEBUG); 2672 break; 2673 #endif 2674 case PERF_ENABLE_TCP_HDGST: 2675 g_header_digest = 1; 2676 break; 2677 case PERF_ENABLE_TCP_DDGST: 2678 g_data_digest = 1; 2679 break; 2680 case PERF_ENABLE_SW_LATENCY_TRACING: 2681 g_latency_sw_tracking_level++; 2682 break; 2683 case PERF_NO_SHST_NOTIFICATION: 2684 g_no_shn_notification = true; 2685 break; 2686 case PERF_ENABLE_URING: 2687 #ifndef SPDK_CONFIG_URING 2688 fprintf(stderr, "%s must be rebuilt with CONFIG_URING=y for -R flag.\n", 2689 argv[0]); 2690 usage(argv[0]); 2691 return 0; 2692 #endif 2693 g_use_uring = true; 2694 break; 2695 case PERF_LOG_FLAG: 2696 rc = spdk_log_set_flag(optarg); 2697 if (rc < 0) { 2698 fprintf(stderr, "unknown flag\n"); 2699 usage(argv[0]); 2700 exit(EXIT_FAILURE); 2701 } 2702 #ifdef DEBUG 2703 spdk_log_set_print_level(SPDK_LOG_DEBUG); 2704 #endif 2705 break; 2706 case PERF_ENABLE_VMD: 2707 g_vmd = true; 2708 break; 2709 case PERF_DISABLE_KTLS: 2710 ssl_used = true; 2711 perf_set_sock_opts("ssl", "ktls", 0, NULL); 2712 break; 2713 case PERF_ENABLE_KTLS: 2714 ssl_used = true; 2715 perf_set_sock_opts("ssl", "ktls", 1, NULL); 2716 break; 2717 case PERF_TLS_VERSION: 2718 ssl_used = true; 2719 val = spdk_strtol(optarg, 10); 2720 if (val < 0) { 2721 fprintf(stderr, "Illegal tls version value %s\n", optarg); 2722 return val; 2723 } 2724 perf_set_sock_opts("ssl", "tls_version", val, NULL); 2725 break; 2726 case PERF_PSK_PATH: 2727 ssl_used = true; 2728 perf_set_sock_opts("ssl", "psk_path", 0, optarg); 2729 break; 2730 case PERF_PSK_IDENTITY: 2731 ssl_used = true; 2732 perf_set_sock_opts("ssl", "psk_identity", 0, optarg); 2733 break; 2734 case PERF_DISABLE_ZCOPY: 2735 perf_set_sock_opts(optarg, "enable_zerocopy_send_client", 0, NULL); 2736 break; 2737 case PERF_ENABLE_ZCOPY: 2738 perf_set_sock_opts(optarg, "enable_zerocopy_send_client", 1, NULL); 2739 break; 2740
case PERF_USE_EVERY_CORE: 2741 g_use_every_core = true; 2742 break; 2743 case PERF_DEFAULT_SOCK_IMPL: 2744 sock_impl = optarg; 2745 rc = spdk_sock_set_default_impl(optarg); 2746 if (rc) { 2747 fprintf(stderr, "Failed to set sock impl %s, err %d (%s)\n", optarg, errno, strerror(errno)); 2748 return 1; 2749 } 2750 break; 2751 case PERF_TRANSPORT_STATISTICS: 2752 g_dump_transport_stats = true; 2753 break; 2754 case PERF_IOVA_MODE: 2755 env_opts->iova_mode = optarg; 2756 break; 2757 case PERF_SOCK_IMPL: 2758 g_sock_threshold_impl = optarg; 2759 break; 2760 case PERF_TRANSPORT_TOS: 2761 val = spdk_strtol(optarg, 10); 2762 if (val < 0) { 2763 fprintf(stderr, "Invalid TOS value\n"); 2764 return 1; 2765 } 2766 g_transport_tos = val; 2767 break; 2768 case PERF_NO_HUGE: 2769 env_opts->no_huge = true; 2770 break; 2771 default: 2772 usage(argv[0]); 2773 return 1; 2774 } 2775 } 2776 2777 if (!g_nr_io_queues_per_ns) { 2778 usage(argv[0]); 2779 return 1; 2780 } 2781 2782 if (!g_queue_depth) { 2783 fprintf(stderr, "missing -q (--io-depth) operand\n"); 2784 usage(argv[0]); 2785 return 1; 2786 } 2787 if (!g_io_size_bytes) { 2788 fprintf(stderr, "missing -o (--io-size) operand\n"); 2789 usage(argv[0]); 2790 return 1; 2791 } 2792 if (!g_io_unit_size || g_io_unit_size % 4) { 2793 fprintf(stderr, "io unit size can not be 0 or non 4-byte aligned\n"); 2794 return 1; 2795 } 2796 if (!g_workload_type) { 2797 fprintf(stderr, "missing -w (--io-pattern) operand\n"); 2798 usage(argv[0]); 2799 return 1; 2800 } 2801 if (!g_time_in_sec) { 2802 fprintf(stderr, "missing -t (--time) operand\n"); 2803 usage(argv[0]); 2804 return 1; 2805 } 2806 if (!g_quiet_count) { 2807 fprintf(stderr, "-Q (--continue-on-error) value must be greater than 0\n"); 2808 usage(argv[0]); 2809 return 1; 2810 } 2811 2812 if (strncmp(g_workload_type, "rand", 4) == 0) { 2813 g_is_random = 1; 2814 g_workload_type = &g_workload_type[4]; 2815 } 2816 2817 if (ssl_used && strncmp(sock_impl, "ssl", 3) != 0) { 2818 fprintf(stderr, "sock impl is not SSL but tried to use one of the SSL only options\n"); 2819 usage(argv[0]); 2820 return 1; 2821 } 2822 2823 2824 if (strcmp(g_workload_type, "read") == 0 || strcmp(g_workload_type, "write") == 0) { 2825 g_rw_percentage = strcmp(g_workload_type, "read") == 0 ? 100 : 0; 2826 if (g_mix_specified) { 2827 fprintf(stderr, "Ignoring -M (--rwmixread) option... 
Please use -M option" 2828 " only when using rw or randrw.\n"); 2829 } 2830 } else if (strcmp(g_workload_type, "rw") == 0) { 2831 if (g_rw_percentage < 0 || g_rw_percentage > 100) { 2832 fprintf(stderr, 2833 "-M (--rwmixread) must be specified to value from 0 to 100 " 2834 "for rw or randrw.\n"); 2835 return 1; 2836 } 2837 } else { 2838 fprintf(stderr, 2839 "-w (--io-pattern) io pattern type must be one of\n" 2840 "(read, write, randread, randwrite, rw, randrw)\n"); 2841 return 1; 2842 } 2843 2844 if (g_sock_zcopy_threshold > 0) { 2845 if (!g_sock_threshold_impl) { 2846 fprintf(stderr, 2847 "--zerocopy-threshold must be set with sock implementation specified(--zerocopy-threshold-sock-impl <impl>)\n"); 2848 return 1; 2849 } 2850 2851 perf_set_sock_opts(g_sock_threshold_impl, "zerocopy_threshold", g_sock_zcopy_threshold, NULL); 2852 } 2853 2854 if (g_number_ios && g_warmup_time_in_sec) { 2855 fprintf(stderr, "-d (--number-ios) with -a (--warmup-time) is not supported\n"); 2856 return 1; 2857 } 2858 2859 if (g_number_ios && g_number_ios < g_queue_depth) { 2860 fprintf(stderr, "-d (--number-ios) less than -q (--io-depth) is not supported\n"); 2861 return 1; 2862 } 2863 2864 if (g_rdma_srq_size != 0) { 2865 struct spdk_nvme_transport_opts opts; 2866 2867 spdk_nvme_transport_get_opts(&opts, sizeof(opts)); 2868 opts.rdma_srq_size = g_rdma_srq_size; 2869 2870 rc = spdk_nvme_transport_set_opts(&opts, sizeof(opts)); 2871 if (rc != 0) { 2872 fprintf(stderr, "Failed to set NVMe transport options.\n"); 2873 return 1; 2874 } 2875 } 2876 2877 if (TAILQ_EMPTY(&g_trid_list)) { 2878 /* If no transport IDs specified, default to enumerating all local PCIe devices */ 2879 add_trid("trtype:PCIe"); 2880 } else { 2881 struct trid_entry *trid_entry, *trid_entry_tmp; 2882 2883 env_opts->no_pci = true; 2884 /* check whether there is local PCIe type */ 2885 TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, trid_entry_tmp) { 2886 if (trid_entry->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 2887 env_opts->no_pci = false; 2888 break; 2889 } 2890 } 2891 } 2892 2893 g_file_optind = optind; 2894 2895 return 0; 2896 } 2897 2898 static int 2899 register_workers(void) 2900 { 2901 uint32_t i; 2902 struct worker_thread *worker; 2903 2904 SPDK_ENV_FOREACH_CORE(i) { 2905 worker = calloc(1, sizeof(*worker)); 2906 if (worker == NULL) { 2907 fprintf(stderr, "Unable to allocate worker\n"); 2908 return -1; 2909 } 2910 2911 TAILQ_INIT(&worker->ns_ctx); 2912 worker->lcore = i; 2913 TAILQ_INSERT_TAIL(&g_workers, worker, link); 2914 g_num_workers++; 2915 } 2916 2917 return 0; 2918 } 2919 2920 static void 2921 unregister_workers(void) 2922 { 2923 struct worker_thread *worker, *tmp_worker; 2924 struct ns_worker_ctx *ns_ctx, *tmp_ns_ctx; 2925 2926 /* Free namespace context and worker thread */ 2927 TAILQ_FOREACH_SAFE(worker, &g_workers, link, tmp_worker) { 2928 TAILQ_REMOVE(&g_workers, worker, link); 2929 2930 TAILQ_FOREACH_SAFE(ns_ctx, &worker->ns_ctx, link, tmp_ns_ctx) { 2931 TAILQ_REMOVE(&worker->ns_ctx, ns_ctx, link); 2932 spdk_histogram_data_free(ns_ctx->histogram); 2933 free(ns_ctx); 2934 } 2935 2936 free(worker); 2937 } 2938 } 2939 2940 static bool 2941 probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 2942 struct spdk_nvme_ctrlr_opts *opts) 2943 { 2944 struct trid_entry *trid_entry = cb_ctx; 2945 2946 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2947 if (g_disable_sq_cmb) { 2948 opts->use_cmb_sqs = false; 2949 } 2950 if (g_no_shn_notification) { 2951 opts->no_shn_notification = true; 2952 } 2953 } 2954 2955 if 
(trid->trtype != trid_entry->trid.trtype && 2956 strcasecmp(trid->trstring, trid_entry->trid.trstring)) { 2957 return false; 2958 } 2959 2960 opts->io_queue_size = g_io_queue_size; 2961 2962 /* Set the header and data_digest */ 2963 opts->header_digest = g_header_digest; 2964 opts->data_digest = g_data_digest; 2965 opts->keep_alive_timeout_ms = g_keep_alive_timeout_in_ms; 2966 memcpy(opts->hostnqn, trid_entry->hostnqn, sizeof(opts->hostnqn)); 2967 2968 opts->transport_tos = g_transport_tos; 2969 if (opts->num_io_queues < g_num_workers * g_nr_io_queues_per_ns) { 2970 opts->num_io_queues = g_num_workers * g_nr_io_queues_per_ns; 2971 } 2972 2973 if (g_psk != NULL) { 2974 memcpy(opts->psk, g_psk, strlen(g_psk)); 2975 } 2976 2977 return true; 2978 } 2979 2980 static void 2981 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 2982 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 2983 { 2984 struct trid_entry *trid_entry = cb_ctx; 2985 struct spdk_pci_addr pci_addr; 2986 struct spdk_pci_device *pci_dev; 2987 struct spdk_pci_id pci_id; 2988 2989 if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) { 2990 printf("Attached to NVMe over Fabrics controller at %s:%s: %s\n", 2991 trid->traddr, trid->trsvcid, 2992 trid->subnqn); 2993 } else { 2994 if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) { 2995 return; 2996 } 2997 2998 pci_dev = spdk_nvme_ctrlr_get_pci_device(ctrlr); 2999 if (!pci_dev) { 3000 return; 3001 } 3002 3003 pci_id = spdk_pci_device_get_id(pci_dev); 3004 3005 printf("Attached to NVMe Controller at %s [%04x:%04x]\n", 3006 trid->traddr, 3007 pci_id.vendor_id, pci_id.device_id); 3008 } 3009 3010 register_ctrlr(ctrlr, trid_entry); 3011 } 3012 3013 static int 3014 register_controllers(void) 3015 { 3016 struct trid_entry *trid_entry; 3017 3018 printf("Initializing NVMe Controllers\n"); 3019 3020 if (g_vmd && spdk_vmd_init()) { 3021 fprintf(stderr, "Failed to initialize VMD." 
3022 " Some NVMe devices may be unavailable.\n"); 3023 } 3024 3025 TAILQ_FOREACH(trid_entry, &g_trid_list, tailq) { 3026 if (spdk_nvme_probe(&trid_entry->trid, trid_entry, probe_cb, attach_cb, NULL) != 0) { 3027 fprintf(stderr, "spdk_nvme_probe() failed for transport address '%s'\n", 3028 trid_entry->trid.traddr); 3029 return -1; 3030 } 3031 } 3032 3033 return 0; 3034 } 3035 3036 static void 3037 unregister_controllers(void) 3038 { 3039 struct ctrlr_entry *entry, *tmp; 3040 struct spdk_nvme_detach_ctx *detach_ctx = NULL; 3041 3042 TAILQ_FOREACH_SAFE(entry, &g_controllers, link, tmp) { 3043 TAILQ_REMOVE(&g_controllers, entry, link); 3044 3045 spdk_dma_free(entry->latency_page); 3046 if (g_latency_ssd_tracking_enable && 3047 spdk_nvme_ctrlr_is_feature_supported(entry->ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) { 3048 set_latency_tracking_feature(entry->ctrlr, false); 3049 } 3050 3051 if (g_nr_unused_io_queues) { 3052 int i; 3053 3054 for (i = 0; i < g_nr_unused_io_queues; i++) { 3055 spdk_nvme_ctrlr_free_io_qpair(entry->unused_qpairs[i]); 3056 } 3057 3058 free(entry->unused_qpairs); 3059 } 3060 3061 spdk_nvme_detach_async(entry->ctrlr, &detach_ctx); 3062 free(entry); 3063 } 3064 3065 if (detach_ctx) { 3066 spdk_nvme_detach_poll(detach_ctx); 3067 } 3068 3069 if (g_vmd) { 3070 spdk_vmd_fini(); 3071 } 3072 } 3073 3074 static int 3075 allocate_ns_worker(struct ns_entry *entry, struct worker_thread *worker) 3076 { 3077 struct ns_worker_ctx *ns_ctx; 3078 3079 ns_ctx = calloc(1, sizeof(struct ns_worker_ctx)); 3080 if (!ns_ctx) { 3081 return -1; 3082 } 3083 3084 printf("Associating %s with lcore %d\n", entry->name, worker->lcore); 3085 ns_ctx->stats.min_tsc = UINT64_MAX; 3086 ns_ctx->entry = entry; 3087 ns_ctx->histogram = spdk_histogram_data_alloc(); 3088 TAILQ_INSERT_TAIL(&worker->ns_ctx, ns_ctx, link); 3089 3090 return 0; 3091 } 3092 3093 static int 3094 associate_workers_with_ns(void) 3095 { 3096 struct ns_entry *entry = TAILQ_FIRST(&g_namespaces); 3097 struct worker_thread *worker = TAILQ_FIRST(&g_workers); 3098 int i, count; 3099 3100 /* Each core contains a single worker, and namespaces are associated as follows: 3101 * --use-every-core not specified (default): 3102 * 1) equal workers and namespaces - each worker is associated with a single namespace 3103 * 2) more workers than namespaces - each namespace is associated with one or more workers 3104 * 3) more namespaces than workers - each worker is associated with one or more namespaces 3105 * --use-every-core option enabled - every worker is associated with all namespaces 3106 */ 3107 if (g_use_every_core) { 3108 TAILQ_FOREACH(worker, &g_workers, link) { 3109 TAILQ_FOREACH(entry, &g_namespaces, link) { 3110 if (allocate_ns_worker(entry, worker) != 0) { 3111 return -1; 3112 } 3113 } 3114 } 3115 return 0; 3116 } 3117 3118 count = g_num_namespaces > g_num_workers ? 
g_num_namespaces : g_num_workers; 3119 3120 for (i = 0; i < count; i++) { 3121 if (entry == NULL) { 3122 break; 3123 } 3124 3125 if (allocate_ns_worker(entry, worker) != 0) { 3126 return -1; 3127 } 3128 3129 worker = TAILQ_NEXT(worker, link); 3130 if (worker == NULL) { 3131 worker = TAILQ_FIRST(&g_workers); 3132 } 3133 3134 entry = TAILQ_NEXT(entry, link); 3135 if (entry == NULL) { 3136 entry = TAILQ_FIRST(&g_namespaces); 3137 } 3138 3139 } 3140 3141 return 0; 3142 } 3143 3144 static void * 3145 nvme_poll_ctrlrs(void *arg) 3146 { 3147 struct ctrlr_entry *entry; 3148 int oldstate; 3149 int rc; 3150 3151 spdk_unaffinitize_thread(); 3152 3153 while (true) { 3154 pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate); 3155 3156 TAILQ_FOREACH(entry, &g_controllers, link) { 3157 if (entry->trtype != SPDK_NVME_TRANSPORT_PCIE) { 3158 rc = spdk_nvme_ctrlr_process_admin_completions(entry->ctrlr); 3159 if (spdk_unlikely(rc < 0 && !g_exit)) { 3160 g_exit = true; 3161 } 3162 } 3163 } 3164 3165 pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate); 3166 3167 /* This is a pthread cancellation point and cannot be removed. */ 3168 sleep(1); 3169 } 3170 3171 return NULL; 3172 } 3173 3174 static void 3175 sig_handler(int signo) 3176 { 3177 g_exit = true; 3178 } 3179 3180 static int 3181 setup_sig_handlers(void) 3182 { 3183 struct sigaction sigact = {}; 3184 int rc; 3185 3186 sigemptyset(&sigact.sa_mask); 3187 sigact.sa_handler = sig_handler; 3188 rc = sigaction(SIGINT, &sigact, NULL); 3189 if (rc < 0) { 3190 fprintf(stderr, "sigaction(SIGINT) failed, errno %d (%s)\n", errno, strerror(errno)); 3191 return -1; 3192 } 3193 3194 rc = sigaction(SIGTERM, &sigact, NULL); 3195 if (rc < 0) { 3196 fprintf(stderr, "sigaction(SIGTERM) failed, errno %d (%s)\n", errno, strerror(errno)); 3197 return -1; 3198 } 3199 3200 return 0; 3201 } 3202 3203 int 3204 main(int argc, char **argv) 3205 { 3206 int rc; 3207 struct worker_thread *worker, *main_worker; 3208 struct ns_worker_ctx *ns_ctx; 3209 struct spdk_env_opts opts; 3210 pthread_t thread_id = 0; 3211 3212 /* Use the runtime PID to set the random seed */ 3213 srand(getpid()); 3214 3215 spdk_env_opts_init(&opts); 3216 opts.name = "perf"; 3217 opts.pci_allowed = g_allowed_pci_addr; 3218 rc = parse_args(argc, argv, &opts); 3219 if (rc != 0) { 3220 free(g_psk); 3221 return rc; 3222 } 3223 /* Transport statistics are printed from each thread. 
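 * (each worker locks g_stats_mutex in work_fn before calling perf_dump_transport_statistics).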
3224 * To avoid mess in terminal, init and use mutex */ 3225 rc = pthread_mutex_init(&g_stats_mutex, NULL); 3226 if (rc != 0) { 3227 fprintf(stderr, "Failed to init mutex\n"); 3228 free(g_psk); 3229 return -1; 3230 } 3231 if (spdk_env_init(&opts) < 0) { 3232 fprintf(stderr, "Unable to initialize SPDK env\n"); 3233 unregister_trids(); 3234 pthread_mutex_destroy(&g_stats_mutex); 3235 free(g_psk); 3236 return -1; 3237 } 3238 3239 rc = setup_sig_handlers(); 3240 if (rc != 0) { 3241 rc = -1; 3242 goto cleanup; 3243 } 3244 3245 g_tsc_rate = spdk_get_ticks_hz(); 3246 3247 if (register_workers() != 0) { 3248 rc = -1; 3249 goto cleanup; 3250 } 3251 3252 #if defined(HAVE_LIBAIO) || defined(SPDK_CONFIG_URING) 3253 if (register_files(argc, argv) != 0) { 3254 rc = -1; 3255 goto cleanup; 3256 } 3257 #endif 3258 3259 if (register_controllers() != 0) { 3260 rc = -1; 3261 goto cleanup; 3262 } 3263 3264 if (g_warn) { 3265 printf("WARNING: Some requested NVMe devices were skipped\n"); 3266 } 3267 3268 if (g_num_namespaces == 0) { 3269 fprintf(stderr, "No valid NVMe controllers or AIO or URING devices found\n"); 3270 goto cleanup; 3271 } 3272 3273 if (g_num_workers > 1 && g_quiet_count > 1) { 3274 fprintf(stderr, "Error message rate-limiting enabled across multiple threads.\n"); 3275 fprintf(stderr, "Error suppression count may not be exact.\n"); 3276 } 3277 3278 rc = pthread_create(&thread_id, NULL, &nvme_poll_ctrlrs, NULL); 3279 if (rc != 0) { 3280 fprintf(stderr, "Unable to spawn a thread to poll admin queues.\n"); 3281 goto cleanup; 3282 } 3283 3284 if (associate_workers_with_ns() != 0) { 3285 rc = -1; 3286 goto cleanup; 3287 } 3288 3289 rc = pthread_barrier_init(&g_worker_sync_barrier, NULL, g_num_workers); 3290 if (rc != 0) { 3291 fprintf(stderr, "Unable to initialize thread sync barrier\n"); 3292 goto cleanup; 3293 } 3294 3295 printf("Initialization complete. Launching workers.\n"); 3296 3297 /* Launch all of the secondary workers */ 3298 g_main_core = spdk_env_get_current_core(); 3299 main_worker = NULL; 3300 TAILQ_FOREACH(worker, &g_workers, link) { 3301 if (worker->lcore != g_main_core) { 3302 spdk_env_thread_launch_pinned(worker->lcore, work_fn, worker); 3303 } else { 3304 assert(main_worker == NULL); 3305 main_worker = worker; 3306 } 3307 } 3308 3309 assert(main_worker != NULL); 3310 work_fn(main_worker); 3311 3312 spdk_env_thread_wait_all(); 3313 3314 print_stats(); 3315 3316 pthread_barrier_destroy(&g_worker_sync_barrier); 3317 3318 cleanup: 3319 if (thread_id && pthread_cancel(thread_id) == 0) { 3320 pthread_join(thread_id, NULL); 3321 } 3322 3323 /* Collect errors from all workers and namespaces */ 3324 TAILQ_FOREACH(worker, &g_workers, link) { 3325 if (rc != 0) { 3326 break; 3327 } 3328 3329 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 3330 if (ns_ctx->status != 0) { 3331 rc = ns_ctx->status; 3332 break; 3333 } 3334 } 3335 } 3336 3337 unregister_trids(); 3338 unregister_namespaces(); 3339 unregister_controllers(); 3340 unregister_workers(); 3341 3342 spdk_env_fini(); 3343 3344 free(g_psk); 3345 3346 pthread_mutex_destroy(&g_stats_mutex); 3347 3348 if (rc != 0) { 3349 fprintf(stderr, "%s: errors occurred\n", argv[0]); 3350 } 3351 3352 return rc; 3353 } 3354
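/*
 * Illustrative invocations only; the device addresses, core mask and run times below are
 * placeholders taken from the examples in usage(), and the built binary name may differ
 * from "perf". Every option used here is described in usage() above.
 *
 *   Local PCIe NVMe, 4KiB random reads, queue depth 128, 60 seconds, cores 0-1:
 *     ./perf -q 128 -o 4096 -w randread -t 60 -c 0x3 -r 'trtype:PCIe traddr:0000:04:00.0'
 *
 *   NVMe-oF over RDMA, 4KiB 70/30 random read/write mix with a detailed latency histogram:
 *     ./perf -q 32 -o 4096 -w randrw -M 70 -t 300 -LL \
 *            -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420'
 */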