1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2015 Intel Corporation. 3 * All rights reserved. 4 * 5 * Copyright (c) 2019-2021 Mellanox Technologies LTD. All rights reserved. 6 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 7 */ 8 9 #include "spdk/stdinc.h" 10 11 #include "spdk/config.h" 12 #include "spdk/env.h" 13 #include "spdk/fd.h" 14 #include "spdk/nvme.h" 15 #include "spdk/vmd.h" 16 #include "spdk/queue.h" 17 #include "spdk/string.h" 18 #include "spdk/nvme_intel.h" 19 #include "spdk/histogram_data.h" 20 #include "spdk/endian.h" 21 #include "spdk/dif.h" 22 #include "spdk/util.h" 23 #include "spdk/log.h" 24 #include "spdk/likely.h" 25 #include "spdk/sock.h" 26 #include "spdk/zipf.h" 27 #include "spdk/nvmf.h" 28 #include "spdk/keyring.h" 29 #include "spdk/module/keyring/file.h" 30 31 #ifdef SPDK_CONFIG_URING 32 #include <liburing.h> 33 #endif 34 35 #if HAVE_LIBAIO 36 #include <libaio.h> 37 #endif 38 39 #define HELP_RETURN_CODE UINT16_MAX 40 41 struct ctrlr_entry { 42 struct spdk_nvme_ctrlr *ctrlr; 43 enum spdk_nvme_transport_type trtype; 44 struct spdk_nvme_intel_rw_latency_page *latency_page; 45 46 struct spdk_nvme_qpair **unused_qpairs; 47 48 TAILQ_ENTRY(ctrlr_entry) link; 49 char name[1024]; 50 }; 51 52 enum entry_type { 53 ENTRY_TYPE_NVME_NS, 54 ENTRY_TYPE_AIO_FILE, 55 ENTRY_TYPE_URING_FILE, 56 }; 57 58 struct ns_fn_table; 59 60 struct ns_entry { 61 enum entry_type type; 62 const struct ns_fn_table *fn_table; 63 64 union { 65 struct { 66 struct spdk_nvme_ctrlr *ctrlr; 67 struct spdk_nvme_ns *ns; 68 } nvme; 69 #ifdef SPDK_CONFIG_URING 70 struct { 71 int fd; 72 } uring; 73 #endif 74 #if HAVE_LIBAIO 75 struct { 76 int fd; 77 } aio; 78 #endif 79 } u; 80 81 TAILQ_ENTRY(ns_entry) link; 82 uint32_t io_size_blocks; 83 uint32_t num_io_requests; 84 uint64_t size_in_ios; 85 uint32_t block_size; 86 uint32_t md_size; 87 bool md_interleave; 88 unsigned int seed; 89 struct spdk_zipf *zipf; 90 bool pi_loc; 91 enum spdk_nvme_pi_type pi_type; 92 uint32_t io_flags; 93 char name[1024]; 94 }; 95 96 static const double g_latency_cutoffs[] = { 97 0.01, 98 0.10, 99 0.25, 100 0.50, 101 0.75, 102 0.90, 103 0.95, 104 0.98, 105 0.99, 106 0.995, 107 0.999, 108 0.9999, 109 0.99999, 110 0.999999, 111 0.9999999, 112 -1, 113 }; 114 115 struct ns_worker_stats { 116 uint64_t io_submitted; 117 uint64_t io_completed; 118 uint64_t last_io_completed; 119 uint64_t total_tsc; 120 uint64_t min_tsc; 121 uint64_t max_tsc; 122 uint64_t last_tsc; 123 uint64_t busy_tsc; 124 uint64_t idle_tsc; 125 uint64_t last_busy_tsc; 126 uint64_t last_idle_tsc; 127 }; 128 129 struct ns_worker_ctx { 130 struct ns_entry *entry; 131 struct ns_worker_stats stats; 132 uint64_t current_queue_depth; 133 uint64_t offset_in_ios; 134 bool is_draining; 135 136 union { 137 struct { 138 int num_active_qpairs; 139 int num_all_qpairs; 140 struct spdk_nvme_qpair **qpair; 141 struct spdk_nvme_poll_group *group; 142 int last_qpair; 143 } nvme; 144 145 #ifdef SPDK_CONFIG_URING 146 struct { 147 struct io_uring ring; 148 uint64_t io_inflight; 149 uint64_t io_pending; 150 struct io_uring_cqe **cqes; 151 152 } uring; 153 #endif 154 #if HAVE_LIBAIO 155 struct { 156 struct io_event *events; 157 io_context_t ctx; 158 } aio; 159 #endif 160 } u; 161 162 TAILQ_ENTRY(ns_worker_ctx) link; 163 164 TAILQ_HEAD(, perf_task) queued_tasks; 165 166 struct spdk_histogram_data *histogram; 167 int status; 168 }; 169 170 struct perf_task { 171 struct ns_worker_ctx *ns_ctx; 172 struct iovec *iovs; /* array of iovecs to transfer. 
*/ 173 int iovcnt; /* Number of iovecs in iovs array. */ 174 int iovpos; /* Current iovec position. */ 175 uint32_t iov_offset; /* Offset in current iovec. */ 176 struct iovec md_iov; 177 uint64_t submit_tsc; 178 bool is_read; 179 struct spdk_dif_ctx dif_ctx; 180 #if HAVE_LIBAIO 181 struct iocb iocb; 182 #endif 183 TAILQ_ENTRY(perf_task) link; 184 }; 185 186 struct worker_thread { 187 TAILQ_HEAD(, ns_worker_ctx) ns_ctx; 188 TAILQ_ENTRY(worker_thread) link; 189 unsigned lcore; 190 }; 191 192 struct ns_fn_table { 193 void (*setup_payload)(struct perf_task *task, uint8_t pattern); 194 195 int (*submit_io)(struct perf_task *task, struct ns_worker_ctx *ns_ctx, 196 struct ns_entry *entry, uint64_t offset_in_ios); 197 198 int64_t (*check_io)(struct ns_worker_ctx *ns_ctx); 199 200 void (*verify_io)(struct perf_task *task, struct ns_entry *entry); 201 202 int (*init_ns_worker_ctx)(struct ns_worker_ctx *ns_ctx); 203 204 void (*cleanup_ns_worker_ctx)(struct ns_worker_ctx *ns_ctx); 205 void (*dump_transport_stats)(uint32_t lcore, struct ns_worker_ctx *ns_ctx); 206 }; 207 208 static uint32_t g_io_unit_size = (UINT32_MAX & (~0x03)); 209 210 static int g_outstanding_commands; 211 212 static bool g_latency_ssd_tracking_enable; 213 static int g_latency_sw_tracking_level; 214 215 static bool g_vmd; 216 static const char *g_workload_type; 217 static TAILQ_HEAD(, ctrlr_entry) g_controllers = TAILQ_HEAD_INITIALIZER(g_controllers); 218 static TAILQ_HEAD(, ns_entry) g_namespaces = TAILQ_HEAD_INITIALIZER(g_namespaces); 219 static uint32_t g_num_namespaces; 220 static TAILQ_HEAD(, worker_thread) g_workers = TAILQ_HEAD_INITIALIZER(g_workers); 221 static uint32_t g_num_workers = 0; 222 static bool g_use_every_core = false; 223 static uint32_t g_main_core; 224 static pthread_barrier_t g_worker_sync_barrier; 225 226 static uint64_t g_tsc_rate; 227 228 static bool g_monitor_perf_cores = false; 229 230 static uint32_t g_io_align = 0x200; 231 static bool g_io_align_specified; 232 static uint32_t g_io_size_bytes; 233 static uint32_t g_max_io_md_size; 234 static uint32_t g_max_io_size_blocks; 235 static uint32_t g_metacfg_pract_flag; 236 static uint32_t g_metacfg_prchk_flags; 237 static int g_rw_percentage = -1; 238 static int g_is_random; 239 static uint32_t g_queue_depth; 240 static int g_nr_io_queues_per_ns = 1; 241 static int g_nr_unused_io_queues; 242 static int g_time_in_sec; 243 static uint64_t g_number_ios; 244 static uint64_t g_elapsed_time_in_usec; 245 static int g_warmup_time_in_sec; 246 static uint32_t g_max_completions; 247 static uint32_t g_disable_sq_cmb; 248 static bool g_use_uring; 249 static bool g_warn; 250 static bool g_header_digest; 251 static bool g_data_digest; 252 static bool g_no_shn_notification; 253 static bool g_mix_specified; 254 /* The flag is used to exit the program while keep alive fails on the transport */ 255 static bool g_exit; 256 /* Default to 10 seconds for the keep alive value. This value is arbitrary. */ 257 static uint32_t g_keep_alive_timeout_in_ms = 10000; 258 static bool g_continue_on_error = false; 259 static uint32_t g_quiet_count = 1; 260 static double g_zipf_theta; 261 /* Set default io_queue_size to UINT16_MAX, NVMe driver will then reduce this 262 * to MQES to maximize the io_queue_size as much as possible. 
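 * (MQES is the Maximum Queue Entries Supported field in the controller's CAP
 * register, so the effective queue size is still capped per controller.)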
263 */ 264 static uint32_t g_io_queue_size = UINT16_MAX; 265 266 static uint32_t g_sock_zcopy_threshold; 267 static char *g_sock_threshold_impl; 268 269 static uint8_t g_transport_tos = 0; 270 271 static uint32_t g_rdma_srq_size; 272 static struct spdk_key *g_psk = NULL; 273 274 /* When user specifies -Q, some error messages are rate limited. When rate 275 * limited, we only print the error message every g_quiet_count times the 276 * error occurs. 277 * 278 * Note: the __count is not thread safe, meaning the rate limiting will not 279 * be exact when running perf with multiple thread with lots of errors. 280 * Thread-local __count would mean rate-limiting per thread which doesn't 281 * seem as useful. 282 */ 283 #define RATELIMIT_LOG(...) \ 284 { \ 285 static uint64_t __count = 0; \ 286 if ((__count % g_quiet_count) == 0) { \ 287 if (__count > 0 && g_quiet_count > 1) { \ 288 fprintf(stderr, "Message suppressed %" PRIu32 " times: ", \ 289 g_quiet_count - 1); \ 290 } \ 291 fprintf(stderr, __VA_ARGS__); \ 292 } \ 293 __count++; \ 294 } 295 296 static bool g_dump_transport_stats; 297 static pthread_mutex_t g_stats_mutex; 298 299 #define MAX_ALLOWED_PCI_DEVICE_NUM 128 300 static struct spdk_pci_addr g_allowed_pci_addr[MAX_ALLOWED_PCI_DEVICE_NUM]; 301 302 struct trid_entry { 303 struct spdk_nvme_transport_id trid; 304 uint16_t nsid; 305 char hostnqn[SPDK_NVMF_NQN_MAX_LEN + 1]; 306 TAILQ_ENTRY(trid_entry) tailq; 307 }; 308 309 static TAILQ_HEAD(, trid_entry) g_trid_list = TAILQ_HEAD_INITIALIZER(g_trid_list); 310 311 static int g_file_optind; /* Index of first filename in argv */ 312 313 static inline void task_complete(struct perf_task *task); 314 315 static void 316 perf_set_sock_opts(const char *impl_name, const char *field, uint32_t val, const char *valstr) 317 { 318 struct spdk_sock_impl_opts sock_opts = {}; 319 size_t opts_size = sizeof(sock_opts); 320 int rc; 321 322 rc = spdk_sock_impl_get_opts(impl_name, &sock_opts, &opts_size); 323 if (rc != 0) { 324 if (errno == EINVAL) { 325 fprintf(stderr, "Unknown sock impl %s\n", impl_name); 326 } else { 327 fprintf(stderr, "Failed to get opts for sock impl %s: error %d (%s)\n", impl_name, errno, 328 strerror(errno)); 329 } 330 return; 331 } 332 333 if (opts_size != sizeof(sock_opts)) { 334 fprintf(stderr, "Warning: sock_opts size mismatch. 
Expected %zu, received %zu\n", 335 sizeof(sock_opts), opts_size); 336 opts_size = sizeof(sock_opts); 337 } 338 339 if (!field) { 340 fprintf(stderr, "Warning: no socket opts field specified\n"); 341 return; 342 } else if (strcmp(field, "enable_zerocopy_send_client") == 0) { 343 sock_opts.enable_zerocopy_send_client = val; 344 } else if (strcmp(field, "tls_version") == 0) { 345 sock_opts.tls_version = val; 346 } else if (strcmp(field, "ktls") == 0) { 347 sock_opts.enable_ktls = val; 348 } else if (strcmp(field, "zerocopy_threshold") == 0) { 349 sock_opts.zerocopy_threshold = val; 350 } else { 351 fprintf(stderr, "Warning: invalid or unprocessed socket opts field: %s\n", field); 352 return; 353 } 354 355 if (spdk_sock_impl_set_opts(impl_name, &sock_opts, opts_size)) { 356 fprintf(stderr, "Failed to set %s: %d for sock impl %s : error %d (%s)\n", field, val, impl_name, 357 errno, strerror(errno)); 358 } 359 } 360 361 static void 362 nvme_perf_reset_sgl(void *ref, uint32_t sgl_offset) 363 { 364 struct iovec *iov; 365 struct perf_task *task = (struct perf_task *)ref; 366 367 task->iov_offset = sgl_offset; 368 for (task->iovpos = 0; task->iovpos < task->iovcnt; task->iovpos++) { 369 iov = &task->iovs[task->iovpos]; 370 if (task->iov_offset < iov->iov_len) { 371 break; 372 } 373 374 task->iov_offset -= iov->iov_len; 375 } 376 } 377 378 static int 379 nvme_perf_next_sge(void *ref, void **address, uint32_t *length) 380 { 381 struct iovec *iov; 382 struct perf_task *task = (struct perf_task *)ref; 383 384 assert(task->iovpos < task->iovcnt); 385 386 iov = &task->iovs[task->iovpos]; 387 assert(task->iov_offset <= iov->iov_len); 388 389 *address = iov->iov_base + task->iov_offset; 390 *length = iov->iov_len - task->iov_offset; 391 task->iovpos++; 392 task->iov_offset = 0; 393 394 return 0; 395 } 396 397 static int 398 nvme_perf_allocate_iovs(struct perf_task *task, void *buf, uint32_t length) 399 { 400 int iovpos = 0; 401 struct iovec *iov; 402 uint32_t offset = 0; 403 404 task->iovcnt = SPDK_CEIL_DIV(length, (uint64_t)g_io_unit_size); 405 task->iovs = calloc(task->iovcnt, sizeof(struct iovec)); 406 if (!task->iovs) { 407 return -1; 408 } 409 410 while (length > 0) { 411 iov = &task->iovs[iovpos]; 412 iov->iov_len = spdk_min(length, g_io_unit_size); 413 iov->iov_base = buf + offset; 414 length -= iov->iov_len; 415 offset += iov->iov_len; 416 iovpos++; 417 } 418 419 return 0; 420 } 421 422 #ifdef SPDK_CONFIG_URING 423 424 static void 425 uring_setup_payload(struct perf_task *task, uint8_t pattern) 426 { 427 struct iovec *iov; 428 429 task->iovs = calloc(1, sizeof(struct iovec)); 430 if (!task->iovs) { 431 fprintf(stderr, "perf task failed to allocate iovs\n"); 432 exit(1); 433 } 434 task->iovcnt = 1; 435 436 iov = &task->iovs[0]; 437 iov->iov_base = spdk_dma_zmalloc(g_io_size_bytes, g_io_align, NULL); 438 iov->iov_len = g_io_size_bytes; 439 if (iov->iov_base == NULL) { 440 fprintf(stderr, "spdk_dma_zmalloc() for task->iovs[0].iov_base failed\n"); 441 free(task->iovs); 442 exit(1); 443 } 444 memset(iov->iov_base, pattern, iov->iov_len); 445 } 446 447 static int 448 uring_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx, 449 struct ns_entry *entry, uint64_t offset_in_ios) 450 { 451 struct io_uring_sqe *sqe; 452 453 sqe = io_uring_get_sqe(&ns_ctx->u.uring.ring); 454 if (!sqe) { 455 fprintf(stderr, "Cannot get sqe\n"); 456 return -1; 457 } 458 459 if (task->is_read) { 460 io_uring_prep_readv(sqe, entry->u.uring.fd, task->iovs, 1, offset_in_ios * task->iovs[0].iov_len); 461 } else { 462 
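		/* Write path mirrors the read path above: queue a vectored write of the
		 * task's single iovec at byte offset offset_in_ios * transfer size. */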
io_uring_prep_writev(sqe, entry->u.uring.fd, task->iovs, 1, offset_in_ios * task->iovs[0].iov_len); 463 } 464 465 io_uring_sqe_set_data(sqe, task); 466 ns_ctx->u.uring.io_pending++; 467 468 return 0; 469 } 470 471 static int64_t 472 uring_check_io(struct ns_worker_ctx *ns_ctx) 473 { 474 int i, to_complete, to_submit, count = 0, ret = 0; 475 struct perf_task *task; 476 477 to_submit = ns_ctx->u.uring.io_pending; 478 479 if (to_submit > 0) { 480 /* If there are I/O to submit, use io_uring_submit here. 481 * It will automatically call spdk_io_uring_enter appropriately. */ 482 ret = io_uring_submit(&ns_ctx->u.uring.ring); 483 if (ret < 0) { 484 ns_ctx->status = 1; 485 return -1; 486 } 487 ns_ctx->u.uring.io_pending = 0; 488 ns_ctx->u.uring.io_inflight += to_submit; 489 } 490 491 to_complete = ns_ctx->u.uring.io_inflight; 492 if (to_complete > 0) { 493 count = io_uring_peek_batch_cqe(&ns_ctx->u.uring.ring, ns_ctx->u.uring.cqes, to_complete); 494 ns_ctx->u.uring.io_inflight -= count; 495 for (i = 0; i < count; i++) { 496 int res; 497 498 assert(ns_ctx->u.uring.cqes[i] != NULL); 499 task = (struct perf_task *)ns_ctx->u.uring.cqes[i]->user_data; 500 res = ns_ctx->u.uring.cqes[i]->res; 501 if (res != (int)task->iovs[0].iov_len) { 502 fprintf(stderr, "cqe->status=%d, iov_len=%d\n", res, 503 (int)task->iovs[0].iov_len); 504 ns_ctx->status = 1; 505 if (res == -EIO) { 506 /* The block device has been removed. 507 * Stop trying to send I/O to it. 508 */ 509 ns_ctx->is_draining = true; 510 } 511 } 512 io_uring_cqe_seen(&ns_ctx->u.uring.ring, ns_ctx->u.uring.cqes[i]); 513 task_complete(task); 514 } 515 } 516 return count; 517 } 518 519 static void 520 uring_verify_io(struct perf_task *task, struct ns_entry *entry) 521 { 522 } 523 524 static int 525 uring_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 526 { 527 if (io_uring_queue_init(g_queue_depth, &ns_ctx->u.uring.ring, 0) < 0) { 528 SPDK_ERRLOG("uring I/O context setup failure\n"); 529 return -1; 530 } 531 532 ns_ctx->u.uring.cqes = calloc(g_queue_depth, sizeof(struct io_uring_cqe *)); 533 if (!ns_ctx->u.uring.cqes) { 534 io_uring_queue_exit(&ns_ctx->u.uring.ring); 535 return -1; 536 } 537 538 return 0; 539 } 540 541 static void 542 uring_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 543 { 544 io_uring_queue_exit(&ns_ctx->u.uring.ring); 545 free(ns_ctx->u.uring.cqes); 546 } 547 548 static const struct ns_fn_table uring_fn_table = { 549 .setup_payload = uring_setup_payload, 550 .submit_io = uring_submit_io, 551 .check_io = uring_check_io, 552 .verify_io = uring_verify_io, 553 .init_ns_worker_ctx = uring_init_ns_worker_ctx, 554 .cleanup_ns_worker_ctx = uring_cleanup_ns_worker_ctx, 555 }; 556 557 #endif 558 559 #ifdef HAVE_LIBAIO 560 static void 561 aio_setup_payload(struct perf_task *task, uint8_t pattern) 562 { 563 struct iovec *iov; 564 565 task->iovs = calloc(1, sizeof(struct iovec)); 566 if (!task->iovs) { 567 fprintf(stderr, "perf task failed to allocate iovs\n"); 568 exit(1); 569 } 570 task->iovcnt = 1; 571 572 iov = &task->iovs[0]; 573 iov->iov_base = spdk_dma_zmalloc(g_io_size_bytes, g_io_align, NULL); 574 iov->iov_len = g_io_size_bytes; 575 if (iov->iov_base == NULL) { 576 fprintf(stderr, "spdk_dma_zmalloc() for task->iovs[0].iov_base failed\n"); 577 free(task->iovs); 578 exit(1); 579 } 580 memset(iov->iov_base, pattern, iov->iov_len); 581 } 582 583 static int 584 aio_submit(io_context_t aio_ctx, struct iocb *iocb, int fd, enum io_iocb_cmd cmd, 585 struct iovec *iov, uint64_t offset, void *cb_ctx) 586 { 587 iocb->aio_fildes = fd; 588 
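	/* Fill in the iocb directly; the opcode (IO_CMD_PREAD or IO_CMD_PWRITE) comes
	 * from the caller and the byte offset is offset_in_ios scaled by the transfer
	 * size below. */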
iocb->aio_reqprio = 0; 589 iocb->aio_lio_opcode = cmd; 590 iocb->u.c.buf = iov->iov_base; 591 iocb->u.c.nbytes = iov->iov_len; 592 iocb->u.c.offset = offset * iov->iov_len; 593 iocb->data = cb_ctx; 594 595 if (io_submit(aio_ctx, 1, &iocb) < 0) { 596 printf("io_submit"); 597 return -1; 598 } 599 600 return 0; 601 } 602 603 static int 604 aio_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx, 605 struct ns_entry *entry, uint64_t offset_in_ios) 606 { 607 if (task->is_read) { 608 return aio_submit(ns_ctx->u.aio.ctx, &task->iocb, entry->u.aio.fd, IO_CMD_PREAD, 609 task->iovs, offset_in_ios, task); 610 } else { 611 return aio_submit(ns_ctx->u.aio.ctx, &task->iocb, entry->u.aio.fd, IO_CMD_PWRITE, 612 task->iovs, offset_in_ios, task); 613 } 614 } 615 616 static int64_t 617 aio_check_io(struct ns_worker_ctx *ns_ctx) 618 { 619 int count, i; 620 struct timespec timeout; 621 struct perf_task *task; 622 623 timeout.tv_sec = 0; 624 timeout.tv_nsec = 0; 625 626 count = io_getevents(ns_ctx->u.aio.ctx, 1, g_queue_depth, ns_ctx->u.aio.events, &timeout); 627 if (count < 0) { 628 fprintf(stderr, "io_getevents error\n"); 629 ns_ctx->status = 1; 630 return -1; 631 } 632 633 for (i = 0; i < count; i++) { 634 unsigned long res; 635 636 task = (struct perf_task *)ns_ctx->u.aio.events[i].data; 637 res = ns_ctx->u.aio.events[i].res; 638 if (res != (uint64_t)task->iovs[0].iov_len) { 639 fprintf(stderr, "event->res=%ld, iov_len=%lu\n", (long)res, 640 (uint64_t)task->iovs[0].iov_len); 641 ns_ctx->status = 1; 642 if ((long)res == -EIO) { 643 /* The block device has been removed. Stop trying to send I/O to it. */ 644 ns_ctx->is_draining = true; 645 } 646 } 647 task_complete(ns_ctx->u.aio.events[i].data); 648 } 649 return count; 650 } 651 652 static void 653 aio_verify_io(struct perf_task *task, struct ns_entry *entry) 654 { 655 } 656 657 static int 658 aio_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 659 { 660 ns_ctx->u.aio.events = calloc(g_queue_depth, sizeof(struct io_event)); 661 if (!ns_ctx->u.aio.events) { 662 return -1; 663 } 664 ns_ctx->u.aio.ctx = 0; 665 if (io_setup(g_queue_depth, &ns_ctx->u.aio.ctx) < 0) { 666 free(ns_ctx->u.aio.events); 667 perror("io_setup"); 668 return -1; 669 } 670 return 0; 671 } 672 673 static void 674 aio_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 675 { 676 io_destroy(ns_ctx->u.aio.ctx); 677 free(ns_ctx->u.aio.events); 678 } 679 680 static const struct ns_fn_table aio_fn_table = { 681 .setup_payload = aio_setup_payload, 682 .submit_io = aio_submit_io, 683 .check_io = aio_check_io, 684 .verify_io = aio_verify_io, 685 .init_ns_worker_ctx = aio_init_ns_worker_ctx, 686 .cleanup_ns_worker_ctx = aio_cleanup_ns_worker_ctx, 687 }; 688 689 #endif /* HAVE_LIBAIO */ 690 691 #if defined(HAVE_LIBAIO) || defined(SPDK_CONFIG_URING) 692 693 static int 694 register_file(const char *path) 695 { 696 struct ns_entry *entry; 697 698 int flags, fd; 699 uint64_t size; 700 uint32_t blklen; 701 702 if (g_rw_percentage == 100) { 703 flags = O_RDONLY; 704 } else if (g_rw_percentage == 0) { 705 flags = O_WRONLY; 706 } else { 707 flags = O_RDWR; 708 } 709 710 flags |= O_DIRECT; 711 712 fd = open(path, flags); 713 if (fd < 0) { 714 fprintf(stderr, "Could not open device %s: %s\n", path, strerror(errno)); 715 return -1; 716 } 717 718 size = spdk_fd_get_size(fd); 719 if (size == 0) { 720 fprintf(stderr, "Could not determine size of device %s\n", path); 721 close(fd); 722 return -1; 723 } 724 725 blklen = spdk_fd_get_blocklen(fd); 726 if (blklen == 0) { 727 fprintf(stderr, "Could not determine 
block size of device %s\n", path); 728 close(fd); 729 return -1; 730 } 731 732 /* 733 * TODO: This should really calculate the LCM of the current g_io_align and blklen. 734 * For now, it's fairly safe to just assume all block sizes are powers of 2. 735 */ 736 if (g_io_align < blklen) { 737 if (g_io_align_specified) { 738 fprintf(stderr, "Wrong IO alignment (%u). aio requires block-sized alignment (%u)\n", g_io_align, 739 blklen); 740 close(fd); 741 return -1; 742 } 743 744 g_io_align = blklen; 745 } 746 747 entry = calloc(1, sizeof(struct ns_entry)); 748 if (entry == NULL) { 749 close(fd); 750 perror("ns_entry malloc"); 751 return -1; 752 } 753 754 if (g_use_uring) { 755 #ifdef SPDK_CONFIG_URING 756 entry->type = ENTRY_TYPE_URING_FILE; 757 entry->fn_table = &uring_fn_table; 758 entry->u.uring.fd = fd; 759 #endif 760 } else { 761 #if HAVE_LIBAIO 762 entry->type = ENTRY_TYPE_AIO_FILE; 763 entry->fn_table = &aio_fn_table; 764 entry->u.aio.fd = fd; 765 #endif 766 } 767 entry->size_in_ios = size / g_io_size_bytes; 768 entry->io_size_blocks = g_io_size_bytes / blklen; 769 770 if (g_is_random) { 771 entry->seed = rand(); 772 if (g_zipf_theta > 0) { 773 entry->zipf = spdk_zipf_create(entry->size_in_ios, g_zipf_theta, 0); 774 } 775 } 776 777 snprintf(entry->name, sizeof(entry->name), "%s", path); 778 779 g_num_namespaces++; 780 TAILQ_INSERT_TAIL(&g_namespaces, entry, link); 781 782 return 0; 783 } 784 785 static int 786 register_files(int argc, char **argv) 787 { 788 int i; 789 790 /* Treat everything after the options as files for AIO/URING */ 791 for (i = g_file_optind; i < argc; i++) { 792 if (register_file(argv[i]) != 0) { 793 return 1; 794 } 795 } 796 797 return 0; 798 } 799 #endif 800 801 static void io_complete(void *ctx, const struct spdk_nvme_cpl *cpl); 802 803 static void 804 nvme_setup_payload(struct perf_task *task, uint8_t pattern) 805 { 806 struct spdk_nvme_ctrlr *ctrlr; 807 uint32_t max_io_size_bytes, max_io_md_size; 808 int32_t numa_id; 809 void *buf; 810 int rc; 811 812 ctrlr = task->ns_ctx->entry->u.nvme.ctrlr; 813 numa_id = spdk_nvme_ctrlr_get_numa_id(ctrlr); 814 815 /* maximum extended lba format size from all active namespace, 816 * it's same with g_io_size_bytes for namespace without metadata. 
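 * This is the worst-case buffer size across all registered namespaces (data plus
 * interleaved metadata), so the allocation below is always large enough.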
817 */ 818 max_io_size_bytes = g_io_size_bytes + g_max_io_md_size * g_max_io_size_blocks; 819 buf = spdk_dma_zmalloc_socket(max_io_size_bytes, g_io_align, NULL, numa_id); 820 if (buf == NULL) { 821 fprintf(stderr, "task->buf spdk_dma_zmalloc failed\n"); 822 exit(1); 823 } 824 memset(buf, pattern, max_io_size_bytes); 825 826 rc = nvme_perf_allocate_iovs(task, buf, max_io_size_bytes); 827 if (rc < 0) { 828 fprintf(stderr, "perf task failed to allocate iovs\n"); 829 spdk_dma_free(buf); 830 exit(1); 831 } 832 833 max_io_md_size = g_max_io_md_size * g_max_io_size_blocks; 834 if (max_io_md_size != 0) { 835 task->md_iov.iov_base = spdk_dma_zmalloc(max_io_md_size, g_io_align, NULL); 836 task->md_iov.iov_len = max_io_md_size; 837 if (task->md_iov.iov_base == NULL) { 838 fprintf(stderr, "task->md_buf spdk_dma_zmalloc failed\n"); 839 spdk_dma_free(task->iovs[0].iov_base); 840 free(task->iovs); 841 exit(1); 842 } 843 } 844 } 845 846 static int 847 nvme_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx, 848 struct ns_entry *entry, uint64_t offset_in_ios) 849 { 850 uint64_t lba; 851 int rc; 852 int qp_num; 853 struct spdk_dif_ctx_init_ext_opts dif_opts; 854 855 enum dif_mode { 856 DIF_MODE_NONE = 0, 857 DIF_MODE_DIF = 1, 858 DIF_MODE_DIX = 2, 859 } mode = DIF_MODE_NONE; 860 861 lba = offset_in_ios * entry->io_size_blocks; 862 863 if (entry->md_size != 0 && !(entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT)) { 864 if (entry->md_interleave) { 865 mode = DIF_MODE_DIF; 866 } else { 867 mode = DIF_MODE_DIX; 868 } 869 } 870 871 qp_num = ns_ctx->u.nvme.last_qpair; 872 ns_ctx->u.nvme.last_qpair++; 873 if (ns_ctx->u.nvme.last_qpair == ns_ctx->u.nvme.num_active_qpairs) { 874 ns_ctx->u.nvme.last_qpair = 0; 875 } 876 877 if (mode != DIF_MODE_NONE) { 878 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 879 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 880 rc = spdk_dif_ctx_init(&task->dif_ctx, entry->block_size, entry->md_size, 881 entry->md_interleave, entry->pi_loc, 882 (enum spdk_dif_type)entry->pi_type, entry->io_flags, 883 lba, 0xFFFF, (uint16_t)entry->io_size_blocks, 0, 0, &dif_opts); 884 if (rc != 0) { 885 fprintf(stderr, "Initialization of DIF context failed\n"); 886 exit(1); 887 } 888 } 889 890 if (task->is_read) { 891 if (task->iovcnt == 1) { 892 return spdk_nvme_ns_cmd_read_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair[qp_num], 893 task->iovs[0].iov_base, task->md_iov.iov_base, 894 lba, 895 entry->io_size_blocks, io_complete, 896 task, entry->io_flags, 897 task->dif_ctx.apptag_mask, task->dif_ctx.app_tag); 898 } else { 899 return spdk_nvme_ns_cmd_readv_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair[qp_num], 900 lba, entry->io_size_blocks, 901 io_complete, task, entry->io_flags, 902 nvme_perf_reset_sgl, nvme_perf_next_sge, 903 task->md_iov.iov_base, 904 task->dif_ctx.apptag_mask, task->dif_ctx.app_tag); 905 } 906 } else { 907 switch (mode) { 908 case DIF_MODE_DIF: 909 rc = spdk_dif_generate(task->iovs, task->iovcnt, entry->io_size_blocks, &task->dif_ctx); 910 if (rc != 0) { 911 fprintf(stderr, "Generation of DIF failed\n"); 912 return rc; 913 } 914 break; 915 case DIF_MODE_DIX: 916 rc = spdk_dix_generate(task->iovs, task->iovcnt, &task->md_iov, entry->io_size_blocks, 917 &task->dif_ctx); 918 if (rc != 0) { 919 fprintf(stderr, "Generation of DIX failed\n"); 920 return rc; 921 } 922 break; 923 default: 924 break; 925 } 926 927 if (task->iovcnt == 1) { 928 return spdk_nvme_ns_cmd_write_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair[qp_num], 929 task->iovs[0].iov_base, task->md_iov.iov_base, 
930 lba, 931 entry->io_size_blocks, io_complete, 932 task, entry->io_flags, 933 task->dif_ctx.apptag_mask, task->dif_ctx.app_tag); 934 } else { 935 return spdk_nvme_ns_cmd_writev_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair[qp_num], 936 lba, entry->io_size_blocks, 937 io_complete, task, entry->io_flags, 938 nvme_perf_reset_sgl, nvme_perf_next_sge, 939 task->md_iov.iov_base, 940 task->dif_ctx.apptag_mask, task->dif_ctx.app_tag); 941 } 942 } 943 } 944 945 static void 946 perf_disconnect_cb(struct spdk_nvme_qpair *qpair, void *ctx) 947 { 948 struct ns_worker_ctx *ns_ctx = ctx; 949 950 ns_ctx->is_draining = true; 951 ns_ctx->status = 1; 952 } 953 954 static int64_t 955 nvme_check_io(struct ns_worker_ctx *ns_ctx) 956 { 957 int64_t rc; 958 959 rc = spdk_nvme_poll_group_process_completions(ns_ctx->u.nvme.group, g_max_completions, 960 perf_disconnect_cb); 961 if (rc < 0) { 962 fprintf(stderr, "NVMe io qpair process completion error\n"); 963 ns_ctx->status = 1; 964 return -1; 965 } 966 return rc; 967 } 968 969 static void 970 nvme_verify_io(struct perf_task *task, struct ns_entry *entry) 971 { 972 struct spdk_dif_error err_blk = {}; 973 int rc; 974 975 if (!task->is_read || (entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT)) { 976 return; 977 } 978 979 if (entry->md_interleave) { 980 rc = spdk_dif_verify(task->iovs, task->iovcnt, entry->io_size_blocks, &task->dif_ctx, 981 &err_blk); 982 if (rc != 0) { 983 fprintf(stderr, "DIF error detected. type=%d, offset=%" PRIu32 "\n", 984 err_blk.err_type, err_blk.err_offset); 985 } 986 } else { 987 rc = spdk_dix_verify(task->iovs, task->iovcnt, &task->md_iov, entry->io_size_blocks, 988 &task->dif_ctx, &err_blk); 989 if (rc != 0) { 990 fprintf(stderr, "DIX error detected. type=%d, offset=%" PRIu32 "\n", 991 err_blk.err_type, err_blk.err_offset); 992 } 993 } 994 } 995 996 /* 997 * TODO: If a controller has multiple namespaces, they could all use the same queue. 998 * For now, give each namespace/thread combination its own queue. 
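 * Each ns_worker_ctx therefore allocates g_nr_io_queues_per_ns active qpairs plus
 * g_nr_unused_io_queues idle ones, all attached to its own poll group.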
999 */ 1000 static int 1001 nvme_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 1002 { 1003 const struct spdk_nvme_ctrlr_opts *ctrlr_opts; 1004 struct spdk_nvme_io_qpair_opts opts; 1005 struct ns_entry *entry = ns_ctx->entry; 1006 struct spdk_nvme_poll_group *group; 1007 struct spdk_nvme_qpair *qpair; 1008 uint64_t poll_timeout_tsc; 1009 int i, rc; 1010 1011 ns_ctx->u.nvme.num_active_qpairs = g_nr_io_queues_per_ns; 1012 ns_ctx->u.nvme.num_all_qpairs = g_nr_io_queues_per_ns + g_nr_unused_io_queues; 1013 ns_ctx->u.nvme.qpair = calloc(ns_ctx->u.nvme.num_all_qpairs, sizeof(struct spdk_nvme_qpair *)); 1014 if (!ns_ctx->u.nvme.qpair) { 1015 return -1; 1016 } 1017 1018 spdk_nvme_ctrlr_get_default_io_qpair_opts(entry->u.nvme.ctrlr, &opts, sizeof(opts)); 1019 if (opts.io_queue_requests < entry->num_io_requests) { 1020 opts.io_queue_requests = entry->num_io_requests; 1021 } 1022 opts.delay_cmd_submit = true; 1023 opts.create_only = true; 1024 1025 ctrlr_opts = spdk_nvme_ctrlr_get_opts(entry->u.nvme.ctrlr); 1026 opts.async_mode = !(spdk_nvme_ctrlr_get_transport_id(entry->u.nvme.ctrlr)->trtype == 1027 SPDK_NVME_TRANSPORT_PCIE 1028 && ns_ctx->u.nvme.num_all_qpairs > ctrlr_opts->admin_queue_size); 1029 1030 ns_ctx->u.nvme.group = spdk_nvme_poll_group_create(ns_ctx, NULL); 1031 if (ns_ctx->u.nvme.group == NULL) { 1032 goto poll_group_failed; 1033 } 1034 1035 group = ns_ctx->u.nvme.group; 1036 for (i = 0; i < ns_ctx->u.nvme.num_all_qpairs; i++) { 1037 ns_ctx->u.nvme.qpair[i] = spdk_nvme_ctrlr_alloc_io_qpair(entry->u.nvme.ctrlr, &opts, 1038 sizeof(opts)); 1039 qpair = ns_ctx->u.nvme.qpair[i]; 1040 if (!qpair) { 1041 printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n"); 1042 goto qpair_failed; 1043 } 1044 1045 if (spdk_nvme_poll_group_add(group, qpair)) { 1046 printf("ERROR: unable to add I/O qpair to poll group.\n"); 1047 spdk_nvme_ctrlr_free_io_qpair(qpair); 1048 goto qpair_failed; 1049 } 1050 1051 if (spdk_nvme_ctrlr_connect_io_qpair(entry->u.nvme.ctrlr, qpair)) { 1052 printf("ERROR: unable to connect I/O qpair.\n"); 1053 spdk_nvme_ctrlr_free_io_qpair(qpair); 1054 goto qpair_failed; 1055 } 1056 } 1057 1058 /* Busy poll here until all qpairs are connected - this ensures once we start 1059 * I/O we aren't still waiting for some qpairs to connect. Limit the poll to 1060 * 10 seconds though. 1061 */ 1062 poll_timeout_tsc = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 1063 rc = -EAGAIN; 1064 while (spdk_get_ticks() < poll_timeout_tsc && rc == -EAGAIN) { 1065 spdk_nvme_poll_group_process_completions(group, 0, perf_disconnect_cb); 1066 rc = spdk_nvme_poll_group_all_connected(group); 1067 if (rc == 0) { 1068 return 0; 1069 } 1070 } 1071 1072 /* If we reach here, it means we either timed out, or some connection failed. 
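 * Fall through to the error path so any qpairs that did connect are freed.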
*/ 1073 assert(spdk_get_ticks() > poll_timeout_tsc || rc == -EIO); 1074 1075 qpair_failed: 1076 for (; i > 0; --i) { 1077 spdk_nvme_ctrlr_free_io_qpair(ns_ctx->u.nvme.qpair[i - 1]); 1078 } 1079 1080 spdk_nvme_poll_group_destroy(ns_ctx->u.nvme.group); 1081 poll_group_failed: 1082 free(ns_ctx->u.nvme.qpair); 1083 return -1; 1084 } 1085 1086 static void 1087 nvme_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 1088 { 1089 int i; 1090 1091 for (i = 0; i < ns_ctx->u.nvme.num_all_qpairs; i++) { 1092 spdk_nvme_ctrlr_free_io_qpair(ns_ctx->u.nvme.qpair[i]); 1093 } 1094 1095 spdk_nvme_poll_group_destroy(ns_ctx->u.nvme.group); 1096 free(ns_ctx->u.nvme.qpair); 1097 } 1098 1099 static void 1100 nvme_dump_rdma_statistics(struct spdk_nvme_transport_poll_group_stat *stat) 1101 { 1102 struct spdk_nvme_rdma_device_stat *device_stats; 1103 uint32_t i; 1104 1105 printf("RDMA transport:\n"); 1106 for (i = 0; i < stat->rdma.num_devices; i++) { 1107 device_stats = &stat->rdma.device_stats[i]; 1108 printf("\tdev name: %s\n", device_stats->name); 1109 printf("\tpolls: %"PRIu64"\n", device_stats->polls); 1110 printf("\tidle_polls: %"PRIu64"\n", device_stats->idle_polls); 1111 printf("\tcompletions: %"PRIu64"\n", device_stats->completions); 1112 printf("\tqueued_requests: %"PRIu64"\n", device_stats->queued_requests); 1113 printf("\ttotal_send_wrs: %"PRIu64"\n", device_stats->total_send_wrs); 1114 printf("\tsend_doorbell_updates: %"PRIu64"\n", device_stats->send_doorbell_updates); 1115 printf("\ttotal_recv_wrs: %"PRIu64"\n", device_stats->total_recv_wrs); 1116 printf("\trecv_doorbell_updates: %"PRIu64"\n", device_stats->recv_doorbell_updates); 1117 printf("\t---------------------------------\n"); 1118 } 1119 } 1120 1121 static void 1122 nvme_dump_pcie_statistics(struct spdk_nvme_transport_poll_group_stat *stat) 1123 { 1124 struct spdk_nvme_pcie_stat *pcie_stat; 1125 1126 pcie_stat = &stat->pcie; 1127 1128 printf("PCIE transport:\n"); 1129 printf("\tpolls: %"PRIu64"\n", pcie_stat->polls); 1130 printf("\tidle_polls: %"PRIu64"\n", pcie_stat->idle_polls); 1131 printf("\tcompletions: %"PRIu64"\n", pcie_stat->completions); 1132 printf("\tcq_mmio_doorbell_updates: %"PRIu64"\n", pcie_stat->cq_mmio_doorbell_updates); 1133 printf("\tcq_shadow_doorbell_updates: %"PRIu64"\n", pcie_stat->cq_shadow_doorbell_updates); 1134 printf("\tsubmitted_requests: %"PRIu64"\n", pcie_stat->submitted_requests); 1135 printf("\tsq_mmio_doorbell_updates: %"PRIu64"\n", pcie_stat->sq_mmio_doorbell_updates); 1136 printf("\tsq_shadow_doorbell_updates: %"PRIu64"\n", pcie_stat->sq_shadow_doorbell_updates); 1137 printf("\tqueued_requests: %"PRIu64"\n", pcie_stat->queued_requests); 1138 } 1139 1140 static void 1141 nvme_dump_tcp_statistics(struct spdk_nvme_transport_poll_group_stat *stat) 1142 { 1143 struct spdk_nvme_tcp_stat *tcp_stat; 1144 1145 tcp_stat = &stat->tcp; 1146 1147 printf("TCP transport:\n"); 1148 printf("\tpolls: %"PRIu64"\n", tcp_stat->polls); 1149 printf("\tidle_polls: %"PRIu64"\n", tcp_stat->idle_polls); 1150 printf("\tsock_completions: %"PRIu64"\n", tcp_stat->socket_completions); 1151 printf("\tnvme_completions: %"PRIu64"\n", tcp_stat->nvme_completions); 1152 printf("\tsubmitted_requests: %"PRIu64"\n", tcp_stat->submitted_requests); 1153 printf("\tqueued_requests: %"PRIu64"\n", tcp_stat->queued_requests); 1154 } 1155 1156 static void 1157 nvme_dump_transport_stats(uint32_t lcore, struct ns_worker_ctx *ns_ctx) 1158 { 1159 struct spdk_nvme_poll_group *group; 1160 struct spdk_nvme_poll_group_stat *stat = NULL; 1161 uint32_t i; 1162 int 
rc; 1163 1164 group = ns_ctx->u.nvme.group; 1165 if (group == NULL) { 1166 return; 1167 } 1168 1169 rc = spdk_nvme_poll_group_get_stats(group, &stat); 1170 if (rc) { 1171 fprintf(stderr, "Can't get transport stats, error %d\n", rc); 1172 return; 1173 } 1174 1175 printf("\n====================\n"); 1176 printf("lcore %u, ns %s statistics:\n", lcore, ns_ctx->entry->name); 1177 1178 for (i = 0; i < stat->num_transports; i++) { 1179 switch (stat->transport_stat[i]->trtype) { 1180 case SPDK_NVME_TRANSPORT_RDMA: 1181 nvme_dump_rdma_statistics(stat->transport_stat[i]); 1182 break; 1183 case SPDK_NVME_TRANSPORT_PCIE: 1184 nvme_dump_pcie_statistics(stat->transport_stat[i]); 1185 break; 1186 case SPDK_NVME_TRANSPORT_TCP: 1187 nvme_dump_tcp_statistics(stat->transport_stat[i]); 1188 break; 1189 default: 1190 fprintf(stderr, "Unknown transport statistics %d %s\n", stat->transport_stat[i]->trtype, 1191 spdk_nvme_transport_id_trtype_str(stat->transport_stat[i]->trtype)); 1192 } 1193 } 1194 1195 spdk_nvme_poll_group_free_stats(group, stat); 1196 } 1197 1198 static const struct ns_fn_table nvme_fn_table = { 1199 .setup_payload = nvme_setup_payload, 1200 .submit_io = nvme_submit_io, 1201 .check_io = nvme_check_io, 1202 .verify_io = nvme_verify_io, 1203 .init_ns_worker_ctx = nvme_init_ns_worker_ctx, 1204 .cleanup_ns_worker_ctx = nvme_cleanup_ns_worker_ctx, 1205 .dump_transport_stats = nvme_dump_transport_stats 1206 }; 1207 1208 static int 1209 build_nvme_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr) 1210 { 1211 const struct spdk_nvme_transport_id *trid; 1212 int res = 0; 1213 1214 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 1215 1216 switch (trid->trtype) { 1217 case SPDK_NVME_TRANSPORT_PCIE: 1218 res = snprintf(name, length, "PCIE (%s)", trid->traddr); 1219 break; 1220 case SPDK_NVME_TRANSPORT_RDMA: 1221 res = snprintf(name, length, "RDMA (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); 1222 break; 1223 case SPDK_NVME_TRANSPORT_TCP: 1224 res = snprintf(name, length, "TCP (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); 1225 break; 1226 case SPDK_NVME_TRANSPORT_VFIOUSER: 1227 res = snprintf(name, length, "VFIOUSER (%s)", trid->traddr); 1228 break; 1229 case SPDK_NVME_TRANSPORT_CUSTOM: 1230 res = snprintf(name, length, "CUSTOM (%s)", trid->traddr); 1231 break; 1232 1233 default: 1234 fprintf(stderr, "Unknown transport type %d\n", trid->trtype); 1235 break; 1236 } 1237 return res; 1238 } 1239 1240 static void 1241 build_nvme_ns_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) 1242 { 1243 int res = 0; 1244 1245 res = build_nvme_name(name, length, ctrlr); 1246 if (res > 0) { 1247 snprintf(name + res, length - res, " NSID %u", nsid); 1248 } 1249 1250 } 1251 1252 static void 1253 register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns) 1254 { 1255 struct ns_entry *entry; 1256 const struct spdk_nvme_ctrlr_data *cdata; 1257 uint32_t max_xfer_size, entries, sector_size; 1258 uint64_t ns_size; 1259 struct spdk_nvme_io_qpair_opts opts; 1260 1261 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 1262 1263 if (!spdk_nvme_ns_is_active(ns)) { 1264 printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n", 1265 cdata->mn, cdata->sn, 1266 spdk_nvme_ns_get_id(ns)); 1267 g_warn = true; 1268 return; 1269 } 1270 1271 ns_size = spdk_nvme_ns_get_size(ns); 1272 sector_size = spdk_nvme_ns_get_sector_size(ns); 1273 1274 if (ns_size < g_io_size_bytes || sector_size > g_io_size_bytes) { 1275 printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid " 1276 "ns size 
%" PRIu64 " / block size %u for I/O size %u\n", 1277 cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns), 1278 ns_size, spdk_nvme_ns_get_sector_size(ns), g_io_size_bytes); 1279 g_warn = true; 1280 return; 1281 } 1282 1283 max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 1284 spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); 1285 /* NVMe driver may add additional entries based on 1286 * stripe size and maximum transfer size, we assume 1287 * 1 more entry be used for stripe. 1288 */ 1289 entries = (g_io_size_bytes - 1) / max_xfer_size + 2; 1290 if ((g_queue_depth * entries) > opts.io_queue_size) { 1291 printf("Controller IO queue size %u, less than required.\n", 1292 opts.io_queue_size); 1293 printf("Consider using lower queue depth or smaller IO size, because " 1294 "IO requests may be queued at the NVMe driver.\n"); 1295 } 1296 /* For requests which have children requests, parent request itself 1297 * will also occupy 1 entry. 1298 */ 1299 entries += 1; 1300 1301 entry = calloc(1, sizeof(struct ns_entry)); 1302 if (entry == NULL) { 1303 perror("ns_entry malloc"); 1304 exit(1); 1305 } 1306 1307 entry->type = ENTRY_TYPE_NVME_NS; 1308 entry->fn_table = &nvme_fn_table; 1309 entry->u.nvme.ctrlr = ctrlr; 1310 entry->u.nvme.ns = ns; 1311 entry->num_io_requests = entries * spdk_divide_round_up(g_queue_depth, g_nr_io_queues_per_ns); 1312 1313 entry->size_in_ios = ns_size / g_io_size_bytes; 1314 entry->io_size_blocks = g_io_size_bytes / sector_size; 1315 1316 if (g_is_random) { 1317 entry->seed = rand(); 1318 if (g_zipf_theta > 0) { 1319 entry->zipf = spdk_zipf_create(entry->size_in_ios, g_zipf_theta, 0); 1320 } 1321 } 1322 1323 entry->block_size = spdk_nvme_ns_get_extended_sector_size(ns); 1324 entry->md_size = spdk_nvme_ns_get_md_size(ns); 1325 entry->md_interleave = spdk_nvme_ns_supports_extended_lba(ns); 1326 entry->pi_loc = spdk_nvme_ns_get_data(ns)->dps.md_start; 1327 entry->pi_type = spdk_nvme_ns_get_pi_type(ns); 1328 1329 if (spdk_nvme_ns_get_flags(ns) & SPDK_NVME_NS_DPS_PI_SUPPORTED) { 1330 entry->io_flags = g_metacfg_pract_flag | g_metacfg_prchk_flags; 1331 } 1332 1333 /* If metadata size = 8 bytes, PI is stripped (read) or inserted (write), 1334 * and so reduce metadata size from block size. (If metadata size > 8 bytes, 1335 * PI is passed (read) or replaced (write). So block size is not necessary 1336 * to change.) 1337 */ 1338 if ((entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT) && (entry->md_size == 8)) { 1339 entry->block_size = spdk_nvme_ns_get_sector_size(ns); 1340 } 1341 1342 if (g_io_size_bytes % entry->block_size != 0) { 1343 printf("WARNING: IO size %u (-o) is not a multiple of nsid %u sector size %u." 
1344 " Removing this ns from test\n", g_io_size_bytes, spdk_nvme_ns_get_id(ns), entry->block_size); 1345 g_warn = true; 1346 spdk_zipf_free(&entry->zipf); 1347 free(entry); 1348 return; 1349 } 1350 1351 if (g_max_io_md_size < entry->md_size) { 1352 g_max_io_md_size = entry->md_size; 1353 } 1354 1355 if (g_max_io_size_blocks < entry->io_size_blocks) { 1356 g_max_io_size_blocks = entry->io_size_blocks; 1357 } 1358 1359 build_nvme_ns_name(entry->name, sizeof(entry->name), ctrlr, spdk_nvme_ns_get_id(ns)); 1360 1361 g_num_namespaces++; 1362 TAILQ_INSERT_TAIL(&g_namespaces, entry, link); 1363 } 1364 1365 static void 1366 unregister_namespaces(void) 1367 { 1368 struct ns_entry *entry, *tmp; 1369 1370 TAILQ_FOREACH_SAFE(entry, &g_namespaces, link, tmp) { 1371 TAILQ_REMOVE(&g_namespaces, entry, link); 1372 spdk_zipf_free(&entry->zipf); 1373 if (g_use_uring) { 1374 #ifdef SPDK_CONFIG_URING 1375 close(entry->u.uring.fd); 1376 #endif 1377 } else { 1378 #if HAVE_LIBAIO 1379 close(entry->u.aio.fd); 1380 #endif 1381 } 1382 free(entry); 1383 } 1384 } 1385 1386 static void 1387 enable_latency_tracking_complete(void *cb_arg, const struct spdk_nvme_cpl *cpl) 1388 { 1389 if (spdk_nvme_cpl_is_error(cpl)) { 1390 printf("enable_latency_tracking_complete failed\n"); 1391 } 1392 g_outstanding_commands--; 1393 } 1394 1395 static void 1396 set_latency_tracking_feature(struct spdk_nvme_ctrlr *ctrlr, bool enable) 1397 { 1398 int res; 1399 union spdk_nvme_intel_feat_latency_tracking latency_tracking; 1400 1401 if (enable) { 1402 latency_tracking.bits.enable = 0x01; 1403 } else { 1404 latency_tracking.bits.enable = 0x00; 1405 } 1406 1407 res = spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING, 1408 latency_tracking.raw, 0, NULL, 0, enable_latency_tracking_complete, NULL); 1409 if (res) { 1410 printf("fail to allocate nvme request.\n"); 1411 return; 1412 } 1413 g_outstanding_commands++; 1414 1415 while (g_outstanding_commands) { 1416 spdk_nvme_ctrlr_process_admin_completions(ctrlr); 1417 } 1418 } 1419 1420 static void 1421 register_ctrlr(struct spdk_nvme_ctrlr *ctrlr, struct trid_entry *trid_entry) 1422 { 1423 struct spdk_nvme_ns *ns; 1424 struct ctrlr_entry *entry = malloc(sizeof(struct ctrlr_entry)); 1425 uint32_t nsid; 1426 1427 if (entry == NULL) { 1428 perror("ctrlr_entry malloc"); 1429 exit(1); 1430 } 1431 1432 entry->latency_page = spdk_dma_zmalloc(sizeof(struct spdk_nvme_intel_rw_latency_page), 1433 4096, NULL); 1434 if (entry->latency_page == NULL) { 1435 printf("Allocation error (latency page)\n"); 1436 exit(1); 1437 } 1438 1439 build_nvme_name(entry->name, sizeof(entry->name), ctrlr); 1440 1441 entry->ctrlr = ctrlr; 1442 entry->trtype = trid_entry->trid.trtype; 1443 TAILQ_INSERT_TAIL(&g_controllers, entry, link); 1444 1445 if (g_latency_ssd_tracking_enable && 1446 spdk_nvme_ctrlr_is_feature_supported(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) { 1447 set_latency_tracking_feature(ctrlr, true); 1448 } 1449 1450 if (trid_entry->nsid == 0) { 1451 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 1452 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 1453 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); 1454 if (ns == NULL) { 1455 continue; 1456 } 1457 register_ns(ctrlr, ns); 1458 } 1459 } else { 1460 ns = spdk_nvme_ctrlr_get_ns(ctrlr, trid_entry->nsid); 1461 if (!ns) { 1462 perror("Namespace does not exist."); 1463 exit(1); 1464 } 1465 1466 register_ns(ctrlr, ns); 1467 } 1468 } 1469 1470 static inline void 1471 submit_single_io(struct perf_task *task) 1472 { 1473 
uint64_t rand_value, offset_in_ios; 1474 int rc; 1475 struct ns_worker_ctx *ns_ctx = task->ns_ctx; 1476 struct ns_entry *entry = ns_ctx->entry; 1477 1478 assert(!ns_ctx->is_draining); 1479 1480 if (entry->zipf) { 1481 offset_in_ios = spdk_zipf_generate(entry->zipf); 1482 } else if (g_is_random) { 1483 /* rand_r() returns int, so we need to use two calls to ensure 1484 * we get a large enough value to cover a very large block 1485 * device. 1486 */ 1487 rand_value = (uint64_t)rand_r(&entry->seed) * 1488 ((uint64_t)RAND_MAX + 1) + 1489 rand_r(&entry->seed); 1490 offset_in_ios = rand_value % entry->size_in_ios; 1491 } else { 1492 offset_in_ios = ns_ctx->offset_in_ios++; 1493 if (ns_ctx->offset_in_ios == entry->size_in_ios) { 1494 ns_ctx->offset_in_ios = 0; 1495 } 1496 } 1497 1498 task->submit_tsc = spdk_get_ticks(); 1499 1500 if ((g_rw_percentage == 100) || 1501 (g_rw_percentage != 0 && ((rand_r(&entry->seed) % 100) < g_rw_percentage))) { 1502 task->is_read = true; 1503 } else { 1504 task->is_read = false; 1505 } 1506 1507 rc = entry->fn_table->submit_io(task, ns_ctx, entry, offset_in_ios); 1508 1509 if (spdk_unlikely(rc != 0)) { 1510 if (g_continue_on_error) { 1511 /* We can't just resubmit here or we can get in a loop that 1512 * stack overflows. */ 1513 TAILQ_INSERT_TAIL(&ns_ctx->queued_tasks, task, link); 1514 } else { 1515 RATELIMIT_LOG("starting I/O failed: %d\n", rc); 1516 spdk_dma_free(task->iovs[0].iov_base); 1517 free(task->iovs); 1518 spdk_dma_free(task->md_iov.iov_base); 1519 task->ns_ctx->status = 1; 1520 free(task); 1521 } 1522 } else { 1523 ns_ctx->current_queue_depth++; 1524 ns_ctx->stats.io_submitted++; 1525 } 1526 1527 if (spdk_unlikely(g_number_ios && ns_ctx->stats.io_submitted >= g_number_ios)) { 1528 ns_ctx->is_draining = true; 1529 } 1530 } 1531 1532 static inline void 1533 task_complete(struct perf_task *task) 1534 { 1535 struct ns_worker_ctx *ns_ctx; 1536 uint64_t tsc_diff; 1537 struct ns_entry *entry; 1538 1539 ns_ctx = task->ns_ctx; 1540 entry = ns_ctx->entry; 1541 ns_ctx->current_queue_depth--; 1542 ns_ctx->stats.io_completed++; 1543 tsc_diff = spdk_get_ticks() - task->submit_tsc; 1544 ns_ctx->stats.total_tsc += tsc_diff; 1545 if (spdk_unlikely(ns_ctx->stats.min_tsc > tsc_diff)) { 1546 ns_ctx->stats.min_tsc = tsc_diff; 1547 } 1548 if (spdk_unlikely(ns_ctx->stats.max_tsc < tsc_diff)) { 1549 ns_ctx->stats.max_tsc = tsc_diff; 1550 } 1551 if (spdk_unlikely(g_latency_sw_tracking_level > 0)) { 1552 spdk_histogram_data_tally(ns_ctx->histogram, tsc_diff); 1553 } 1554 1555 if (spdk_unlikely(entry->md_size > 0)) { 1556 /* add application level verification for end-to-end data protection */ 1557 entry->fn_table->verify_io(task, entry); 1558 } 1559 1560 /* 1561 * is_draining indicates when time has expired or io_submitted exceeded 1562 * g_number_ios for the test run and we are just waiting for the previously 1563 * submitted I/O to complete. In this case, do not submit a new I/O to 1564 * replace the one just completed. 
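 * Instead, free the task's data and metadata buffers here.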
1565 */ 1566 if (spdk_unlikely(ns_ctx->is_draining)) { 1567 spdk_dma_free(task->iovs[0].iov_base); 1568 free(task->iovs); 1569 spdk_dma_free(task->md_iov.iov_base); 1570 free(task); 1571 } else { 1572 submit_single_io(task); 1573 } 1574 } 1575 1576 static void 1577 io_complete(void *ctx, const struct spdk_nvme_cpl *cpl) 1578 { 1579 struct perf_task *task = ctx; 1580 1581 if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) { 1582 if (task->is_read) { 1583 RATELIMIT_LOG("Read completed with error (sct=%d, sc=%d)\n", 1584 cpl->status.sct, cpl->status.sc); 1585 } else { 1586 RATELIMIT_LOG("Write completed with error (sct=%d, sc=%d)\n", 1587 cpl->status.sct, cpl->status.sc); 1588 } 1589 if (!g_continue_on_error) { 1590 if (cpl->status.sct == SPDK_NVME_SCT_GENERIC && 1591 cpl->status.sc == SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT) { 1592 /* The namespace was hotplugged. Stop trying to send I/O to it. */ 1593 task->ns_ctx->is_draining = true; 1594 } 1595 1596 task->ns_ctx->status = 1; 1597 } 1598 } 1599 1600 task_complete(task); 1601 } 1602 1603 static struct perf_task * 1604 allocate_task(struct ns_worker_ctx *ns_ctx, int queue_depth) 1605 { 1606 struct perf_task *task; 1607 1608 task = calloc(1, sizeof(*task)); 1609 if (task == NULL) { 1610 fprintf(stderr, "Out of memory allocating tasks\n"); 1611 exit(1); 1612 } 1613 1614 task->ns_ctx = ns_ctx; 1615 ns_ctx->entry->fn_table->setup_payload(task, queue_depth % 8 + 1); 1616 1617 return task; 1618 } 1619 1620 static void 1621 submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth) 1622 { 1623 struct perf_task *task; 1624 1625 while (queue_depth-- > 0) { 1626 task = allocate_task(ns_ctx, queue_depth); 1627 submit_single_io(task); 1628 } 1629 } 1630 1631 static int 1632 init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 1633 { 1634 TAILQ_INIT(&ns_ctx->queued_tasks); 1635 return ns_ctx->entry->fn_table->init_ns_worker_ctx(ns_ctx); 1636 } 1637 1638 static void 1639 cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 1640 { 1641 struct perf_task *task, *ttask; 1642 1643 TAILQ_FOREACH_SAFE(task, &ns_ctx->queued_tasks, link, ttask) { 1644 TAILQ_REMOVE(&ns_ctx->queued_tasks, task, link); 1645 task_complete(task); 1646 } 1647 ns_ctx->entry->fn_table->cleanup_ns_worker_ctx(ns_ctx); 1648 } 1649 1650 static void 1651 print_periodic_performance(bool warmup) 1652 { 1653 uint64_t io_this_second; 1654 double mb_this_second; 1655 struct worker_thread *worker; 1656 struct ns_worker_ctx *ns_ctx; 1657 uint64_t busy_tsc; 1658 uint64_t idle_tsc; 1659 uint64_t core_busy_tsc = 0; 1660 uint64_t core_idle_tsc = 0; 1661 double core_busy_perc = 0; 1662 1663 if (!isatty(STDOUT_FILENO)) { 1664 /* Don't print periodic stats if output is not going 1665 * to a terminal. 
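 * The stats line is redrawn in place with '\r', which only makes sense on a TTY.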
1666 */ 1667 return; 1668 } 1669 io_this_second = 0; 1670 TAILQ_FOREACH(worker, &g_workers, link) { 1671 busy_tsc = 0; 1672 idle_tsc = 0; 1673 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1674 io_this_second += ns_ctx->stats.io_completed - ns_ctx->stats.last_io_completed; 1675 ns_ctx->stats.last_io_completed = ns_ctx->stats.io_completed; 1676 1677 if (g_monitor_perf_cores) { 1678 busy_tsc += ns_ctx->stats.busy_tsc - ns_ctx->stats.last_busy_tsc; 1679 idle_tsc += ns_ctx->stats.idle_tsc - ns_ctx->stats.last_idle_tsc; 1680 ns_ctx->stats.last_busy_tsc = ns_ctx->stats.busy_tsc; 1681 ns_ctx->stats.last_idle_tsc = ns_ctx->stats.idle_tsc; 1682 } 1683 } 1684 if (g_monitor_perf_cores) { 1685 core_busy_tsc += busy_tsc; 1686 core_idle_tsc += idle_tsc; 1687 } 1688 } 1689 mb_this_second = (double)io_this_second * g_io_size_bytes / (1024 * 1024); 1690 1691 printf("%s%9ju IOPS, %8.2f MiB/s", warmup ? "[warmup] " : "", io_this_second, mb_this_second); 1692 if (g_monitor_perf_cores) { 1693 core_busy_perc = (double)core_busy_tsc / (core_idle_tsc + core_busy_tsc) * 100; 1694 printf("%3d Core(s): %6.2f%% Busy", g_num_workers, core_busy_perc); 1695 } 1696 printf("\r"); 1697 fflush(stdout); 1698 } 1699 1700 static void 1701 perf_dump_transport_statistics(struct worker_thread *worker) 1702 { 1703 struct ns_worker_ctx *ns_ctx; 1704 1705 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1706 if (ns_ctx->entry->fn_table->dump_transport_stats) { 1707 ns_ctx->entry->fn_table->dump_transport_stats(worker->lcore, ns_ctx); 1708 } 1709 } 1710 } 1711 1712 static int 1713 work_fn(void *arg) 1714 { 1715 uint64_t tsc_start, tsc_end, tsc_current, tsc_next_print; 1716 struct worker_thread *worker = (struct worker_thread *) arg; 1717 struct ns_worker_ctx *ns_ctx = NULL; 1718 uint32_t unfinished_ns_ctx; 1719 bool warmup = false; 1720 int rc; 1721 int64_t check_rc; 1722 uint64_t check_now; 1723 TAILQ_HEAD(, perf_task) swap; 1724 struct perf_task *task; 1725 1726 /* Allocate queue pairs for each namespace. */ 1727 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1728 if (init_ns_worker_ctx(ns_ctx) != 0) { 1729 printf("ERROR: init_ns_worker_ctx() failed\n"); 1730 /* Wait on barrier to avoid blocking of successful workers */ 1731 pthread_barrier_wait(&g_worker_sync_barrier); 1732 ns_ctx->status = 1; 1733 return 1; 1734 } 1735 } 1736 1737 rc = pthread_barrier_wait(&g_worker_sync_barrier); 1738 if (rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD) { 1739 printf("ERROR: failed to wait on thread sync barrier\n"); 1740 ns_ctx->status = 1; 1741 return 1; 1742 } 1743 1744 tsc_start = spdk_get_ticks(); 1745 tsc_current = tsc_start; 1746 tsc_next_print = tsc_current + g_tsc_rate; 1747 1748 if (g_warmup_time_in_sec) { 1749 warmup = true; 1750 tsc_end = tsc_current + g_warmup_time_in_sec * g_tsc_rate; 1751 } else { 1752 tsc_end = tsc_current + g_time_in_sec * g_tsc_rate; 1753 } 1754 1755 /* Submit initial I/O for each namespace. */ 1756 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1757 submit_io(ns_ctx, g_queue_depth); 1758 } 1759 1760 while (spdk_likely(!g_exit)) { 1761 bool all_draining = true; 1762 1763 /* 1764 * Check for completed I/O for each controller. A new 1765 * I/O will be submitted in the io_complete callback 1766 * to replace each I/O that is completed. 
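 * Time spent in check_io() is also attributed to busy vs. idle TSC for the
 * -m/--cpu-usage display.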
1767 */ 1768 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1769 if (g_continue_on_error && !ns_ctx->is_draining) { 1770 /* Submit any I/O that is queued up */ 1771 TAILQ_INIT(&swap); 1772 TAILQ_SWAP(&swap, &ns_ctx->queued_tasks, perf_task, link); 1773 while (!TAILQ_EMPTY(&swap)) { 1774 task = TAILQ_FIRST(&swap); 1775 TAILQ_REMOVE(&swap, task, link); 1776 if (ns_ctx->is_draining) { 1777 TAILQ_INSERT_TAIL(&ns_ctx->queued_tasks, 1778 task, link); 1779 continue; 1780 } 1781 submit_single_io(task); 1782 } 1783 } 1784 1785 check_now = spdk_get_ticks(); 1786 check_rc = ns_ctx->entry->fn_table->check_io(ns_ctx); 1787 1788 if (check_rc > 0) { 1789 ns_ctx->stats.busy_tsc += check_now - ns_ctx->stats.last_tsc; 1790 } else { 1791 ns_ctx->stats.idle_tsc += check_now - ns_ctx->stats.last_tsc; 1792 } 1793 ns_ctx->stats.last_tsc = check_now; 1794 1795 if (!ns_ctx->is_draining) { 1796 all_draining = false; 1797 } 1798 } 1799 1800 if (spdk_unlikely(all_draining)) { 1801 break; 1802 } 1803 1804 tsc_current = spdk_get_ticks(); 1805 1806 if (worker->lcore == g_main_core && tsc_current > tsc_next_print) { 1807 tsc_next_print += g_tsc_rate; 1808 print_periodic_performance(warmup); 1809 } 1810 1811 if (tsc_current > tsc_end) { 1812 if (warmup) { 1813 /* Update test start and end time, clear statistics */ 1814 tsc_start = spdk_get_ticks(); 1815 tsc_end = tsc_start + g_time_in_sec * g_tsc_rate; 1816 1817 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1818 memset(&ns_ctx->stats, 0, sizeof(ns_ctx->stats)); 1819 ns_ctx->stats.min_tsc = UINT64_MAX; 1820 spdk_histogram_data_reset(ns_ctx->histogram); 1821 } 1822 1823 if (worker->lcore == g_main_core && isatty(STDOUT_FILENO)) { 1824 /* warmup stage prints a longer string to stdout, need to erase it */ 1825 printf("%c[2K", 27); 1826 } 1827 1828 warmup = false; 1829 } else { 1830 break; 1831 } 1832 } 1833 } 1834 1835 /* Capture the actual elapsed time when we break out of the main loop. This will account 1836 * for cases where we exit prematurely due to a signal. We only need to capture it on 1837 * one core, so use the main core. 
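 * g_elapsed_time_in_usec feeds the final IOPS and latency math in print_performance().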
1838 */
1839 if (worker->lcore == g_main_core) {
1840 g_elapsed_time_in_usec = (tsc_current - tsc_start) * SPDK_SEC_TO_USEC / g_tsc_rate;
1841 }
1842
1843 /* Drain the I/O of each ns_ctx in round-robin order for fairness */
1844 do {
1845 unfinished_ns_ctx = 0;
1846 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) {
1847 /* Only entered on the first pass; mark the context as draining */
1848 if (!ns_ctx->is_draining) {
1849 ns_ctx->is_draining = true;
1850 }
1851
1852 if (ns_ctx->current_queue_depth > 0) {
1853 ns_ctx->entry->fn_table->check_io(ns_ctx);
1854 if (ns_ctx->current_queue_depth > 0) {
1855 unfinished_ns_ctx++;
1856 }
1857 }
1858 }
1859 } while (unfinished_ns_ctx > 0);
1860
1861 if (g_dump_transport_stats) {
1862 pthread_mutex_lock(&g_stats_mutex);
1863 perf_dump_transport_statistics(worker);
1864 pthread_mutex_unlock(&g_stats_mutex);
1865 }
1866
1867 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) {
1868 cleanup_ns_worker_ctx(ns_ctx);
1869 }
1870
1871 return 0;
1872 }
1873
1874 static void
1875 usage(char *program_name)
1876 {
1877 printf("%s options", program_name);
1878 #if defined(SPDK_CONFIG_URING) || defined(HAVE_LIBAIO)
1879 printf(" [Kernel device(s)]...");
1880 #endif
1881 printf("\n\n");
1882 printf("==== BASIC OPTIONS ====\n\n");
1883 printf("\t-q, --io-depth <val> io depth\n");
1884 printf("\t-o, --io-size <val> io size in bytes\n");
1885 printf("\t-w, --io-pattern <pattern> io pattern type, must be one of\n");
1886 printf("\t\t(read, write, randread, randwrite, rw, randrw)\n");
1887 printf("\t-M, --rwmixread <0-100> rwmixread (100 for reads, 0 for writes)\n");
1888 printf("\t-t, --time <sec> time in seconds\n");
1889 printf("\t-a, --warmup-time <sec> warmup time in seconds\n");
1890 printf("\t-c, --core-mask <mask> core mask for I/O submission/completion.\n");
1891 printf("\t\t(default: 1)\n");
1892 printf("\t-r, --transport <fmt> Transport ID for local PCIe NVMe or NVMeoF\n");
1893 printf("\t\t Format: 'key:value [key:value] ...'\n");
1894 printf("\t\t Keys:\n");
1895 printf("\t\t trtype Transport type (e.g. PCIe, RDMA)\n");
1896 printf("\t\t adrfam Address family (e.g. IPv4, IPv6)\n");
1897 printf("\t\t traddr Transport address (e.g. 0000:04:00.0 for PCIe or 192.168.100.8 for RDMA)\n");
1898 printf("\t\t trsvcid Transport service identifier (e.g. 4420)\n");
1899 printf("\t\t subnqn Subsystem NQN (default: %s)\n", SPDK_NVMF_DISCOVERY_NQN);
1900 printf("\t\t ns NVMe namespace ID (all active namespaces are used by default)\n");
1901 printf("\t\t hostnqn Host NQN\n");
1902 printf("\t\t Example: -r 'trtype:PCIe traddr:0000:04:00.0' for PCIe or\n");
1903 printf("\t\t -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420' for NVMeoF\n");
1904 printf("\t\t Note: can be specified multiple times to test multiple disks/targets.\n");
1905 printf("\n");
1906
1907 printf("==== ADVANCED OPTIONS ====\n\n");
1908 printf("\t--use-every-core for each namespace, I/Os are submitted from all cores\n");
1909 printf("\t--io-queue-size <val> size of NVMe IO queue. Default: maximum allowed by controller\n");
1910 printf("\t-O, --io-unit-size io unit size in bytes (4-byte aligned) for SPDK driver. default: same as io size\n");
1911 printf("\t-P, --num-qpairs <val> number of io queues per namespace. default: 1\n");
1912 printf("\t-U, --num-unused-qpairs <val> number of unused io queues per controller. default: 0\n");
1913 printf("\t-A, --buffer-alignment IO buffer alignment.
Must be power of 2 and not less than cache line (%u)\n",
1914 SPDK_CACHE_LINE_SIZE);
1915 printf("\t-s, --hugemem-size <MB> DPDK huge memory size in MB.\n");
1916 printf("\t-g, --mem-single-seg use single file descriptor for DPDK memory segments\n");
1917 printf("\t-C, --max-completion-per-poll <val> max completions per poll\n");
1918 printf("\t\t(default: 0 - unlimited)\n");
1919 printf("\t-i, --shmem-grp-id <id> shared memory group ID\n");
1920 printf("\t-d, --number-ios <val> number of I/O to perform per thread on each namespace. Note: this is an additional exit criterion.\n");
1921 printf("\t\t(default: 0 - unlimited)\n");
1922 printf("\t-e, --metadata <fmt> metadata configuration\n");
1923 printf("\t\t Keys:\n");
1924 printf("\t\t PRACT Protection Information Action bit (PRACT=1 or PRACT=0)\n");
1925 printf("\t\t PRCHK Control of Protection Information Checking (PRCHK=GUARD|REFTAG|APPTAG)\n");
1926 printf("\t\t Example: -e 'PRACT=0,PRCHK=GUARD|REFTAG|APPTAG'\n");
1927 printf("\t\t -e 'PRACT=1,PRCHK=GUARD'\n");
1928 printf("\t-F, --zipf <theta> use zipf distribution for random I/O\n");
1929 #ifdef SPDK_CONFIG_URING
1930 printf("\t-R, --enable-uring enable using liburing to drive kernel devices (Default: libaio)\n");
1931 #endif
1932 printf("\t--iova-mode <mode> specify DPDK IOVA mode: va|pa\n");
1933 printf("\t--no-huge run SPDK without hugepages\n");
1934 printf("\n");
1935
1936 printf("==== PCIe OPTIONS ====\n\n");
1937 printf("\t-b, --allowed-pci-addr <addr> allowed local PCIe device address\n");
1938 printf("\t\t Example: -b 0000:d8:00.0 -b 0000:d9:00.0\n");
1939 printf("\t-V, --enable-vmd enable VMD enumeration\n");
1940 printf("\t-D, --disable-sq-cmb disable submission queue in controller memory buffer, default: enabled\n");
1941 printf("\n");
1942
1943 printf("==== TCP OPTIONS ====\n\n");
1944 printf("\t-S, --default-sock-impl <impl> set the default sock impl, e.g. \"posix\"\n");
1945 printf("\t--disable-ktls disable Kernel TLS. Only valid for ssl impl. Default for ssl impl\n");
1946 printf("\t--enable-ktls enable Kernel TLS. Only valid for ssl impl\n");
1947 printf("\t--tls-version <val> TLS version to use. Only valid for ssl impl. Default: 0 (auto-negotiation)\n");
1948 printf("\t--psk-path <val> Path to PSK file (only applies when sock_impl == ssl)\n");
1949 printf("\t--psk-identity <val> Default PSK ID, e.g. psk.spdk.io (only applies when sock_impl == ssl)\n");
1950 printf("\t--zerocopy-threshold <val> data is sent with MSG_ZEROCOPY if size is greater than this val. Default: 0 to disable it\n");
1951 printf("\t--zerocopy-threshold-sock-impl <impl> specify the sock implementation to set zerocopy_threshold\n");
1952 printf("\t-z, --disable-zcopy <impl> disable zero copy send for the given sock implementation. Default for posix impl\n");
1953 printf("\t-Z, --enable-zcopy <impl> enable zero copy send for the given sock implementation\n");
1954 printf("\t-k, --keepalive <ms> keep alive timeout period in millisecond\n");
1955 printf("\t-H, --enable-tcp-hdgst enable header digest for TCP transport, default: disabled\n");
1956 printf("\t-I, --enable-tcp-ddgst enable data digest for TCP transport, default: disabled\n");
1957 printf("\n");
1958
1959 printf("==== RDMA OPTIONS ====\n\n");
1960 printf("\t--transport-tos <val> specify the type of service for RDMA transport. Default: 0 (disabled)\n");
1961 printf("\t--rdma-srq-size <val> The size of a shared rdma receive queue.
Default: 0 (disabled)\n"); 1962 printf("\t-k, --keepalive <ms> keep alive timeout period in millisecond\n"); 1963 printf("\n"); 1964 1965 printf("==== LOGGING ====\n\n"); 1966 printf("\t-L, --enable-sw-latency-tracking enable latency tracking via sw, default: disabled\n"); 1967 printf("\t\t-L for latency summary, -LL for detailed histogram\n"); 1968 printf("\t-l, --enable-ssd-latency-tracking enable latency tracking via ssd (if supported), default: disabled\n"); 1969 printf("\t-N, --no-shst-notification no shutdown notification process for controllers, default: disabled\n"); 1970 printf("\t-Q, --continue-on-error <val> Do not stop on error. Log I/O errors every N times (default: 1)\n"); 1971 spdk_log_usage(stdout, "\t-T"); 1972 printf("\t-m, --cpu-usage display real-time overall cpu usage on used cores\n"); 1973 #ifdef DEBUG 1974 printf("\t-G, --enable-debug enable debug logging\n"); 1975 #else 1976 printf("\t-G, --enable-debug enable debug logging (flag disabled, must reconfigure with --enable-debug)\n"); 1977 #endif 1978 printf("\t--transport-stats dump transport statistics\n"); 1979 printf("\n\n"); 1980 } 1981 1982 static void 1983 check_cutoff(void *ctx, uint64_t start, uint64_t end, uint64_t count, 1984 uint64_t total, uint64_t so_far) 1985 { 1986 double so_far_pct; 1987 double **cutoff = ctx; 1988 1989 if (count == 0) { 1990 return; 1991 } 1992 1993 so_far_pct = (double)so_far / total; 1994 while (so_far_pct >= **cutoff && **cutoff > 0) { 1995 printf("%9.5f%% : %9.3fus\n", **cutoff * 100, (double)end * 1000 * 1000 / g_tsc_rate); 1996 (*cutoff)++; 1997 } 1998 } 1999 2000 static void 2001 print_bucket(void *ctx, uint64_t start, uint64_t end, uint64_t count, 2002 uint64_t total, uint64_t so_far) 2003 { 2004 double so_far_pct; 2005 2006 if (count == 0) { 2007 return; 2008 } 2009 2010 so_far_pct = (double)so_far * 100 / total; 2011 printf("%9.3f - %9.3f: %9.4f%% (%9ju)\n", 2012 (double)start * 1000 * 1000 / g_tsc_rate, 2013 (double)end * 1000 * 1000 / g_tsc_rate, 2014 so_far_pct, count); 2015 } 2016 2017 static void 2018 print_performance(void) 2019 { 2020 uint64_t total_io_completed, total_io_tsc; 2021 double io_per_second, mb_per_second, average_latency, min_latency, max_latency; 2022 double sum_ave_latency, min_latency_so_far, max_latency_so_far; 2023 double total_io_per_second, total_mb_per_second; 2024 int ns_count; 2025 struct worker_thread *worker; 2026 struct ns_worker_ctx *ns_ctx; 2027 uint32_t max_strlen; 2028 2029 total_io_per_second = 0; 2030 total_mb_per_second = 0; 2031 total_io_completed = 0; 2032 total_io_tsc = 0; 2033 min_latency_so_far = (double)UINT64_MAX; 2034 max_latency_so_far = 0; 2035 ns_count = 0; 2036 2037 max_strlen = 0; 2038 TAILQ_FOREACH(worker, &g_workers, link) { 2039 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 2040 max_strlen = spdk_max(strlen(ns_ctx->entry->name), max_strlen); 2041 } 2042 } 2043 2044 printf("========================================================\n"); 2045 printf("%*s\n", max_strlen + 60, "Latency(us)"); 2046 printf("%-*s: %10s %10s %10s %10s %10s\n", 2047 max_strlen + 13, "Device Information", "IOPS", "MiB/s", "Average", "min", "max"); 2048 2049 TAILQ_FOREACH(worker, &g_workers, link) { 2050 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 2051 if (ns_ctx->stats.io_completed != 0) { 2052 io_per_second = (double)ns_ctx->stats.io_completed * 1000 * 1000 / g_elapsed_time_in_usec; 2053 mb_per_second = io_per_second * g_io_size_bytes / (1024 * 1024); 2054 average_latency = ((double)ns_ctx->stats.total_tsc / ns_ctx->stats.io_completed) * 1000 
* 1000 / 2055 g_tsc_rate; 2056 min_latency = (double)ns_ctx->stats.min_tsc * 1000 * 1000 / g_tsc_rate; 2057 if (min_latency < min_latency_so_far) { 2058 min_latency_so_far = min_latency; 2059 } 2060 2061 max_latency = (double)ns_ctx->stats.max_tsc * 1000 * 1000 / g_tsc_rate; 2062 if (max_latency > max_latency_so_far) { 2063 max_latency_so_far = max_latency; 2064 } 2065 2066 printf("%-*.*s from core %2u: %10.2f %10.2f %10.2f %10.2f %10.2f\n", 2067 max_strlen, max_strlen, ns_ctx->entry->name, worker->lcore, 2068 io_per_second, mb_per_second, 2069 average_latency, min_latency, max_latency); 2070 total_io_per_second += io_per_second; 2071 total_mb_per_second += mb_per_second; 2072 total_io_completed += ns_ctx->stats.io_completed; 2073 total_io_tsc += ns_ctx->stats.total_tsc; 2074 ns_count++; 2075 } 2076 } 2077 } 2078 2079 if (ns_count != 0 && total_io_completed) { 2080 sum_ave_latency = ((double)total_io_tsc / total_io_completed) * 1000 * 1000 / g_tsc_rate; 2081 printf("========================================================\n"); 2082 printf("%-*s: %10.2f %10.2f %10.2f %10.2f %10.2f\n", 2083 max_strlen + 13, "Total", total_io_per_second, total_mb_per_second, 2084 sum_ave_latency, min_latency_so_far, max_latency_so_far); 2085 printf("\n"); 2086 } 2087 2088 if (g_latency_sw_tracking_level == 0 || total_io_completed == 0) { 2089 return; 2090 } 2091 2092 TAILQ_FOREACH(worker, &g_workers, link) { 2093 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 2094 const double *cutoff = g_latency_cutoffs; 2095 2096 printf("Summary latency data for %-43.43s from core %u:\n", ns_ctx->entry->name, worker->lcore); 2097 printf("=================================================================================\n"); 2098 2099 spdk_histogram_data_iterate(ns_ctx->histogram, check_cutoff, &cutoff); 2100 2101 printf("\n"); 2102 } 2103 } 2104 2105 if (g_latency_sw_tracking_level == 1) { 2106 return; 2107 } 2108 2109 TAILQ_FOREACH(worker, &g_workers, link) { 2110 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 2111 printf("Latency histogram for %-43.43s from core %u:\n", ns_ctx->entry->name, worker->lcore); 2112 printf("==============================================================================\n"); 2113 printf(" Range in us Cumulative IO count\n"); 2114 2115 spdk_histogram_data_iterate(ns_ctx->histogram, print_bucket, NULL); 2116 printf("\n"); 2117 } 2118 } 2119 2120 } 2121 2122 static void 2123 print_latency_page(struct ctrlr_entry *entry) 2124 { 2125 int i; 2126 2127 printf("\n"); 2128 printf("%s\n", entry->name); 2129 printf("--------------------------------------------------------\n"); 2130 2131 for (i = 0; i < 32; i++) { 2132 if (entry->latency_page->buckets_32us[i]) { 2133 printf("Bucket %dus - %dus: %d\n", i * 32, (i + 1) * 32, entry->latency_page->buckets_32us[i]); 2134 } 2135 } 2136 for (i = 0; i < 31; i++) { 2137 if (entry->latency_page->buckets_1ms[i]) { 2138 printf("Bucket %dms - %dms: %d\n", i + 1, i + 2, entry->latency_page->buckets_1ms[i]); 2139 } 2140 } 2141 for (i = 0; i < 31; i++) { 2142 if (entry->latency_page->buckets_32ms[i]) 2143 printf("Bucket %dms - %dms: %d\n", (i + 1) * 32, (i + 2) * 32, 2144 entry->latency_page->buckets_32ms[i]); 2145 } 2146 } 2147 2148 static void 2149 print_latency_statistics(const char *op_name, enum spdk_nvme_intel_log_page log_page) 2150 { 2151 struct ctrlr_entry *ctrlr; 2152 2153 printf("%s Latency Statistics:\n", op_name); 2154 printf("========================================================\n"); 2155 TAILQ_FOREACH(ctrlr, &g_controllers, link) { 2156 if 
(spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) { 2157 if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr->ctrlr, log_page, SPDK_NVME_GLOBAL_NS_TAG, 2158 ctrlr->latency_page, sizeof(struct spdk_nvme_intel_rw_latency_page), 0, 2159 enable_latency_tracking_complete, 2160 NULL)) { 2161 printf("nvme_ctrlr_cmd_get_log_page() failed\n"); 2162 exit(1); 2163 } 2164 2165 g_outstanding_commands++; 2166 } else { 2167 printf("Controller %s: %s latency statistics not supported\n", ctrlr->name, op_name); 2168 } 2169 } 2170 2171 while (g_outstanding_commands) { 2172 TAILQ_FOREACH(ctrlr, &g_controllers, link) { 2173 spdk_nvme_ctrlr_process_admin_completions(ctrlr->ctrlr); 2174 } 2175 } 2176 2177 TAILQ_FOREACH(ctrlr, &g_controllers, link) { 2178 if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) { 2179 print_latency_page(ctrlr); 2180 } 2181 } 2182 printf("\n"); 2183 } 2184 2185 static void 2186 print_stats(void) 2187 { 2188 print_performance(); 2189 if (g_latency_ssd_tracking_enable) { 2190 if (g_rw_percentage != 0) { 2191 print_latency_statistics("Read", SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY); 2192 } 2193 if (g_rw_percentage != 100) { 2194 print_latency_statistics("Write", SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY); 2195 } 2196 } 2197 } 2198 2199 static void 2200 unregister_trids(void) 2201 { 2202 struct trid_entry *trid_entry, *tmp; 2203 2204 TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, tmp) { 2205 TAILQ_REMOVE(&g_trid_list, trid_entry, tailq); 2206 free(trid_entry); 2207 } 2208 } 2209 2210 static int 2211 add_trid(const char *trid_str) 2212 { 2213 struct trid_entry *trid_entry; 2214 struct spdk_nvme_transport_id *trid; 2215 char *ns; 2216 char *hostnqn; 2217 2218 trid_entry = calloc(1, sizeof(*trid_entry)); 2219 if (trid_entry == NULL) { 2220 return -1; 2221 } 2222 2223 trid = &trid_entry->trid; 2224 trid->trtype = SPDK_NVME_TRANSPORT_PCIE; 2225 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 2226 2227 if (spdk_nvme_transport_id_parse(trid, trid_str) != 0) { 2228 fprintf(stderr, "Invalid transport ID format '%s'\n", trid_str); 2229 free(trid_entry); 2230 return 1; 2231 } 2232 2233 if ((ns = strcasestr(trid_str, "ns:")) || 2234 (ns = strcasestr(trid_str, "ns="))) { 2235 char nsid_str[6]; /* 5 digits maximum in an nsid */ 2236 int len; 2237 int nsid; 2238 2239 ns += 3; 2240 2241 len = strcspn(ns, " \t\n"); 2242 if (len > 5) { 2243 fprintf(stderr, "NVMe namespace IDs must be 5 digits or less\n"); 2244 free(trid_entry); 2245 return 1; 2246 } 2247 2248 memcpy(nsid_str, ns, len); 2249 nsid_str[len] = '\0'; 2250 2251 nsid = spdk_strtol(nsid_str, 10); 2252 if (nsid <= 0 || nsid > 65535) { 2253 fprintf(stderr, "NVMe namespace IDs must be less than 65536 and greater than 0\n"); 2254 free(trid_entry); 2255 return 1; 2256 } 2257 2258 trid_entry->nsid = (uint16_t)nsid; 2259 } 2260 2261 if ((hostnqn = strcasestr(trid_str, "hostnqn:")) || 2262 (hostnqn = strcasestr(trid_str, "hostnqn="))) { 2263 size_t len; 2264 2265 hostnqn += strlen("hostnqn:"); 2266 2267 len = strcspn(hostnqn, " \t\n"); 2268 if (len > (sizeof(trid_entry->hostnqn) - 1)) { 2269 fprintf(stderr, "Host NQN is too long\n"); 2270 free(trid_entry); 2271 return 1; 2272 } 2273 2274 memcpy(trid_entry->hostnqn, hostnqn, len); 2275 trid_entry->hostnqn[len] = '\0'; 2276 } 2277 2278 TAILQ_INSERT_TAIL(&g_trid_list, trid_entry, tailq); 2279 return 0; 2280 } 2281 2282 static int 2283 add_allowed_pci_device(const char *bdf_str, struct spdk_env_opts *env_opts) 2284 { 2285 int rc; 2286 2287 if 
(env_opts->num_pci_addr >= MAX_ALLOWED_PCI_DEVICE_NUM) { 2288 fprintf(stderr, "Currently we only support allowed PCI device num=%d\n", 2289 MAX_ALLOWED_PCI_DEVICE_NUM); 2290 return -1; 2291 } 2292 2293 rc = spdk_pci_addr_parse(&env_opts->pci_allowed[env_opts->num_pci_addr], bdf_str); 2294 if (rc < 0) { 2295 fprintf(stderr, "Failed to parse the given bdf_str=%s\n", bdf_str); 2296 return -1; 2297 } 2298 2299 env_opts->num_pci_addr++; 2300 return 0; 2301 } 2302 2303 static size_t 2304 parse_next_key(const char **str, char *key, char *val, size_t key_buf_size, 2305 size_t val_buf_size) 2306 { 2307 const char *sep; 2308 const char *separator = ", \t\n"; 2309 size_t key_len, val_len; 2310 2311 *str += strspn(*str, separator); 2312 2313 sep = strchr(*str, '='); 2314 if (!sep) { 2315 fprintf(stderr, "Key without '=' separator\n"); 2316 return 0; 2317 } 2318 2319 key_len = sep - *str; 2320 if (key_len >= key_buf_size) { 2321 fprintf(stderr, "Key length %zu is greater than maximum allowed %zu\n", 2322 key_len, key_buf_size - 1); 2323 return 0; 2324 } 2325 2326 memcpy(key, *str, key_len); 2327 key[key_len] = '\0'; 2328 2329 *str += key_len + 1; /* Skip key */ 2330 val_len = strcspn(*str, separator); 2331 if (val_len == 0) { 2332 fprintf(stderr, "Key without value\n"); 2333 return 0; 2334 } 2335 2336 if (val_len >= val_buf_size) { 2337 fprintf(stderr, "Value length %zu is greater than maximum allowed %zu\n", 2338 val_len, val_buf_size - 1); 2339 return 0; 2340 } 2341 2342 memcpy(val, *str, val_len); 2343 val[val_len] = '\0'; 2344 2345 *str += val_len; 2346 2347 return val_len; 2348 } 2349 2350 static int 2351 parse_metadata(const char *metacfg_str) 2352 { 2353 const char *str; 2354 size_t val_len; 2355 char key[32]; 2356 char val[1024]; 2357 2358 if (metacfg_str == NULL) { 2359 return -EINVAL; 2360 } 2361 2362 str = metacfg_str; 2363 2364 while (*str != '\0') { 2365 val_len = parse_next_key(&str, key, val, sizeof(key), sizeof(val)); 2366 if (val_len == 0) { 2367 fprintf(stderr, "Failed to parse metadata\n"); 2368 return -EINVAL; 2369 } 2370 2371 if (strcmp(key, "PRACT") == 0) { 2372 if (*val == '1') { 2373 g_metacfg_pract_flag = SPDK_NVME_IO_FLAGS_PRACT; 2374 } 2375 } else if (strcmp(key, "PRCHK") == 0) { 2376 if (strstr(val, "GUARD") != NULL) { 2377 g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_GUARD; 2378 } 2379 if (strstr(val, "REFTAG") != NULL) { 2380 g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG; 2381 } 2382 if (strstr(val, "APPTAG") != NULL) { 2383 g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_APPTAG; 2384 } 2385 } else { 2386 fprintf(stderr, "Unknown key '%s'\n", key); 2387 } 2388 } 2389 2390 return 0; 2391 } 2392 2393 static void 2394 free_key(struct spdk_key **key) 2395 { 2396 if (*key == NULL) { 2397 return; 2398 } 2399 2400 spdk_keyring_put_key(*key); 2401 spdk_keyring_file_remove_key(spdk_key_get_name(*key)); 2402 *key = NULL; 2403 } 2404 2405 static struct spdk_key * 2406 alloc_key(const char *name, const char *path) 2407 { 2408 struct spdk_key *key; 2409 int rc; 2410 2411 rc = spdk_keyring_file_add_key(name, path); 2412 if (rc != 0) { 2413 return NULL; 2414 } 2415 2416 key = spdk_keyring_get_key(name); 2417 if (key == NULL) { 2418 return NULL; 2419 } 2420 2421 return key; 2422 } 2423 2424 #define PERF_GETOPT_SHORT "a:b:c:d:e:ghi:lmo:q:r:k:s:t:w:z:A:C:DF:GHILM:NO:P:Q:RS:T:U:VZ:" 2425 2426 static const struct option g_perf_cmdline_opts[] = { 2427 #define PERF_WARMUP_TIME 'a' 2428 {"warmup-time", required_argument, NULL, PERF_WARMUP_TIME}, 2429 #define 
PERF_ALLOWED_PCI_ADDR 'b' 2430 {"allowed-pci-addr", required_argument, NULL, PERF_ALLOWED_PCI_ADDR}, 2431 #define PERF_CORE_MASK 'c' 2432 {"core-mask", required_argument, NULL, PERF_CORE_MASK}, 2433 #define PERF_METADATA 'e' 2434 {"metadata", required_argument, NULL, PERF_METADATA}, 2435 #define PERF_MEM_SINGL_SEG 'g' 2436 {"mem-single-seg", no_argument, NULL, PERF_MEM_SINGL_SEG}, 2437 #define PERF_HELP 'h' 2438 {"help", no_argument, NULL, PERF_HELP}, 2439 #define PERF_SHMEM_GROUP_ID 'i' 2440 {"shmem-grp-id", required_argument, NULL, PERF_SHMEM_GROUP_ID}, 2441 #define PERF_ENABLE_SSD_LATENCY_TRACING 'l' 2442 {"enable-ssd-latency-tracking", no_argument, NULL, PERF_ENABLE_SSD_LATENCY_TRACING}, 2443 #define PERF_CPU_USAGE 'm' 2444 {"cpu-usage", no_argument, NULL, PERF_CPU_USAGE}, 2445 #define PERF_IO_SIZE 'o' 2446 {"io-size", required_argument, NULL, PERF_IO_SIZE}, 2447 #define PERF_IO_DEPTH 'q' 2448 {"io-depth", required_argument, NULL, PERF_IO_DEPTH}, 2449 #define PERF_TRANSPORT 'r' 2450 {"transport", required_argument, NULL, PERF_TRANSPORT}, 2451 #define PERF_KEEPALIVE 'k' 2452 {"keepalive", required_argument, NULL, PERF_KEEPALIVE}, 2453 #define PERF_HUGEMEM_SIZE 's' 2454 {"hugemem-size", required_argument, NULL, PERF_HUGEMEM_SIZE}, 2455 #define PERF_TIME 't' 2456 {"time", required_argument, NULL, PERF_TIME}, 2457 #define PERF_NUMBER_IOS 'd' 2458 {"number-ios", required_argument, NULL, PERF_NUMBER_IOS}, 2459 #define PERF_IO_PATTERN 'w' 2460 {"io-pattern", required_argument, NULL, PERF_IO_PATTERN}, 2461 #define PERF_DISABLE_ZCOPY 'z' 2462 {"disable-zcopy", required_argument, NULL, PERF_DISABLE_ZCOPY}, 2463 #define PERF_BUFFER_ALIGNMENT 'A' 2464 {"buffer-alignment", required_argument, NULL, PERF_BUFFER_ALIGNMENT}, 2465 #define PERF_MAX_COMPLETIONS_PER_POLL 'C' 2466 {"max-completion-per-poll", required_argument, NULL, PERF_MAX_COMPLETIONS_PER_POLL}, 2467 #define PERF_DISABLE_SQ_CMB 'D' 2468 {"disable-sq-cmb", no_argument, NULL, PERF_DISABLE_SQ_CMB}, 2469 #define PERF_ZIPF 'F' 2470 {"zipf", required_argument, NULL, PERF_ZIPF}, 2471 #define PERF_ENABLE_DEBUG 'G' 2472 {"enable-debug", no_argument, NULL, PERF_ENABLE_DEBUG}, 2473 #define PERF_ENABLE_TCP_HDGST 'H' 2474 {"enable-tcp-hdgst", no_argument, NULL, PERF_ENABLE_TCP_HDGST}, 2475 #define PERF_ENABLE_TCP_DDGST 'I' 2476 {"enable-tcp-ddgst", no_argument, NULL, PERF_ENABLE_TCP_DDGST}, 2477 #define PERF_ENABLE_SW_LATENCY_TRACING 'L' 2478 {"enable-sw-latency-tracking", no_argument, NULL, PERF_ENABLE_SW_LATENCY_TRACING}, 2479 #define PERF_RW_MIXREAD 'M' 2480 {"rwmixread", required_argument, NULL, PERF_RW_MIXREAD}, 2481 #define PERF_NO_SHST_NOTIFICATION 'N' 2482 {"no-shst-notification", no_argument, NULL, PERF_NO_SHST_NOTIFICATION}, 2483 #define PERF_IO_UNIT_SIZE 'O' 2484 {"io-unit-size", required_argument, NULL, PERF_IO_UNIT_SIZE}, 2485 #define PERF_IO_QUEUES_PER_NS 'P' 2486 {"num-qpairs", required_argument, NULL, PERF_IO_QUEUES_PER_NS}, 2487 #define PERF_CONTINUE_ON_ERROR 'Q' 2488 {"continue-on-error", required_argument, NULL, PERF_CONTINUE_ON_ERROR}, 2489 #define PERF_ENABLE_URING 'R' 2490 {"enable-uring", no_argument, NULL, PERF_ENABLE_URING}, 2491 #define PERF_DEFAULT_SOCK_IMPL 'S' 2492 {"default-sock-impl", required_argument, NULL, PERF_DEFAULT_SOCK_IMPL}, 2493 #define PERF_LOG_FLAG 'T' 2494 {"logflag", required_argument, NULL, PERF_LOG_FLAG}, 2495 #define PERF_NUM_UNUSED_IO_QPAIRS 'U' 2496 {"num-unused-qpairs", required_argument, NULL, PERF_NUM_UNUSED_IO_QPAIRS}, 2497 #define PERF_ENABLE_VMD 'V' 2498 {"enable-vmd", no_argument, NULL, 
PERF_ENABLE_VMD}, 2499 #define PERF_ENABLE_ZCOPY 'Z' 2500 {"enable-zcopy", required_argument, NULL, PERF_ENABLE_ZCOPY}, 2501 #define PERF_TRANSPORT_STATISTICS 257 2502 {"transport-stats", no_argument, NULL, PERF_TRANSPORT_STATISTICS}, 2503 #define PERF_IOVA_MODE 258 2504 {"iova-mode", required_argument, NULL, PERF_IOVA_MODE}, 2505 #define PERF_IO_QUEUE_SIZE 259 2506 {"io-queue-size", required_argument, NULL, PERF_IO_QUEUE_SIZE}, 2507 #define PERF_DISABLE_KTLS 260 2508 {"disable-ktls", no_argument, NULL, PERF_DISABLE_KTLS}, 2509 #define PERF_ENABLE_KTLS 261 2510 {"enable-ktls", no_argument, NULL, PERF_ENABLE_KTLS}, 2511 #define PERF_TLS_VERSION 262 2512 {"tls-version", required_argument, NULL, PERF_TLS_VERSION}, 2513 #define PERF_PSK_PATH 263 2514 {"psk-path", required_argument, NULL, PERF_PSK_PATH}, 2515 #define PERF_PSK_IDENTITY 264 2516 {"psk-identity ", required_argument, NULL, PERF_PSK_IDENTITY}, 2517 #define PERF_ZEROCOPY_THRESHOLD 265 2518 {"zerocopy-threshold", required_argument, NULL, PERF_ZEROCOPY_THRESHOLD}, 2519 #define PERF_SOCK_IMPL 266 2520 {"zerocopy-threshold-sock-impl", required_argument, NULL, PERF_SOCK_IMPL}, 2521 #define PERF_TRANSPORT_TOS 267 2522 {"transport-tos", required_argument, NULL, PERF_TRANSPORT_TOS}, 2523 #define PERF_RDMA_SRQ_SIZE 268 2524 {"rdma-srq-size", required_argument, NULL, PERF_RDMA_SRQ_SIZE}, 2525 #define PERF_USE_EVERY_CORE 269 2526 {"use-every-core", no_argument, NULL, PERF_USE_EVERY_CORE}, 2527 #define PERF_NO_HUGE 270 2528 {"no-huge", no_argument, NULL, PERF_NO_HUGE}, 2529 /* Should be the last element */ 2530 {0, 0, 0, 0} 2531 }; 2532 2533 static int 2534 parse_args(int argc, char **argv, struct spdk_env_opts *env_opts) 2535 { 2536 int op, long_idx; 2537 long int val; 2538 uint64_t val_u64; 2539 int rc; 2540 char *endptr; 2541 bool ssl_used = false; 2542 char *sock_impl = "posix"; 2543 2544 while ((op = getopt_long(argc, argv, PERF_GETOPT_SHORT, g_perf_cmdline_opts, &long_idx)) != -1) { 2545 switch (op) { 2546 case PERF_WARMUP_TIME: 2547 case PERF_SHMEM_GROUP_ID: 2548 case PERF_MAX_COMPLETIONS_PER_POLL: 2549 case PERF_IO_QUEUES_PER_NS: 2550 case PERF_KEEPALIVE: 2551 case PERF_TIME: 2552 case PERF_RW_MIXREAD: 2553 case PERF_NUM_UNUSED_IO_QPAIRS: 2554 case PERF_CONTINUE_ON_ERROR: 2555 case PERF_RDMA_SRQ_SIZE: 2556 val = spdk_strtol(optarg, 10); 2557 if (val < 0) { 2558 fprintf(stderr, "Converting a string to integer failed\n"); 2559 return val; 2560 } 2561 switch (op) { 2562 case PERF_WARMUP_TIME: 2563 g_warmup_time_in_sec = val; 2564 break; 2565 case PERF_SHMEM_GROUP_ID: 2566 env_opts->shm_id = val; 2567 break; 2568 case PERF_MAX_COMPLETIONS_PER_POLL: 2569 g_max_completions = val; 2570 break; 2571 case PERF_IO_QUEUES_PER_NS: 2572 g_nr_io_queues_per_ns = val; 2573 break; 2574 case PERF_KEEPALIVE: 2575 g_keep_alive_timeout_in_ms = val; 2576 break; 2577 case PERF_TIME: 2578 g_time_in_sec = val; 2579 break; 2580 case PERF_RW_MIXREAD: 2581 g_rw_percentage = val; 2582 g_mix_specified = true; 2583 break; 2584 case PERF_CONTINUE_ON_ERROR: 2585 g_quiet_count = val; 2586 g_continue_on_error = true; 2587 break; 2588 case PERF_NUM_UNUSED_IO_QPAIRS: 2589 g_nr_unused_io_queues = val; 2590 break; 2591 case PERF_RDMA_SRQ_SIZE: 2592 g_rdma_srq_size = val; 2593 break; 2594 } 2595 break; 2596 case PERF_IO_SIZE: 2597 case PERF_IO_UNIT_SIZE: 2598 case PERF_ZEROCOPY_THRESHOLD: 2599 case PERF_BUFFER_ALIGNMENT: 2600 case PERF_HUGEMEM_SIZE: 2601 case PERF_NUMBER_IOS: 2602 case PERF_IO_DEPTH: 2603 case PERF_IO_QUEUE_SIZE: 2604 rc = spdk_parse_capacity(optarg, &val_u64, 
NULL);
2605 if (rc != 0) {
2606 fprintf(stderr, "Converting a string to integer failed\n");
2607 return 1;
2608 }
2609 switch (op) {
2610 case PERF_IO_SIZE:
2611 g_io_size_bytes = (uint32_t)val_u64;
2612 break;
2613 case PERF_IO_UNIT_SIZE:
2614 g_io_unit_size = (uint32_t)val_u64;
2615 break;
2616 case PERF_ZEROCOPY_THRESHOLD:
2617 g_sock_zcopy_threshold = (uint32_t)val_u64;
2618 break;
2619 case PERF_IO_DEPTH:
2620 g_queue_depth = (uint32_t)val_u64;
2621 break;
2622 case PERF_IO_QUEUE_SIZE:
2623 g_io_queue_size = (uint32_t)val_u64;
2624 break;
2625 case PERF_BUFFER_ALIGNMENT:
2626 g_io_align = (uint32_t)val_u64;
2627 if (!spdk_u32_is_pow2(g_io_align) || g_io_align < SPDK_CACHE_LINE_SIZE) {
2628 fprintf(stderr, "Wrong alignment %u. Must be power of 2 and not less than cache line (%u)\n",
2629 g_io_align, SPDK_CACHE_LINE_SIZE);
2630 usage(argv[0]);
2631 return 1;
2632 }
2633 g_io_align_specified = true;
2634 break;
2635 case PERF_HUGEMEM_SIZE:
2636 env_opts->mem_size = (int)val_u64;
2637 break;
2638 case PERF_NUMBER_IOS:
2639 g_number_ios = val_u64;
2640 break;
2641 }
2642 break;
2643 case PERF_ZIPF:
2644 errno = 0;
2645 g_zipf_theta = strtod(optarg, &endptr);
2646 if (errno || optarg == endptr || g_zipf_theta < 0) {
2647 fprintf(stderr, "Illegal zipf theta value %s\n", optarg);
2648 return 1;
2649 }
2650 break;
2651 case PERF_ALLOWED_PCI_ADDR:
2652 if (add_allowed_pci_device(optarg, env_opts)) {
2653 usage(argv[0]);
2654 return 1;
2655 }
2656 break;
2657 case PERF_CORE_MASK:
2658 env_opts->core_mask = optarg;
2659 break;
2660 case PERF_METADATA:
2661 if (parse_metadata(optarg)) {
2662 usage(argv[0]);
2663 return 1;
2664 }
2665 break;
2666 case PERF_MEM_SINGL_SEG:
2667 env_opts->hugepage_single_segments = true;
2668 break;
2669 case PERF_ENABLE_SSD_LATENCY_TRACING:
2670 g_latency_ssd_tracking_enable = true;
2671 break;
2672 case PERF_CPU_USAGE:
2673 g_monitor_perf_cores = true;
2674 break;
2675 case PERF_TRANSPORT:
2676 if (add_trid(optarg)) {
2677 usage(argv[0]);
2678 return 1;
2679 }
2680 break;
2681 case PERF_IO_PATTERN:
2682 g_workload_type = optarg;
2683 break;
2684 case PERF_DISABLE_SQ_CMB:
2685 g_disable_sq_cmb = 1;
2686 break;
2687 case PERF_ENABLE_DEBUG:
2688 #ifndef DEBUG
2689 fprintf(stderr, "%s must be configured with --enable-debug for -G flag\n",
2690 argv[0]);
2691 usage(argv[0]);
2692 return 1;
2693 #else
2694 spdk_log_set_flag("nvme");
2695 spdk_log_set_print_level(SPDK_LOG_DEBUG);
2696 break;
2697 #endif
2698 case PERF_ENABLE_TCP_HDGST:
2699 g_header_digest = 1;
2700 break;
2701 case PERF_ENABLE_TCP_DDGST:
2702 g_data_digest = 1;
2703 break;
2704 case PERF_ENABLE_SW_LATENCY_TRACING:
2705 g_latency_sw_tracking_level++;
2706 break;
2707 case PERF_NO_SHST_NOTIFICATION:
2708 g_no_shn_notification = true;
2709 break;
2710 case PERF_ENABLE_URING:
2711 #ifndef SPDK_CONFIG_URING
2712 fprintf(stderr, "%s must be rebuilt with CONFIG_URING=y for -R flag.\n",
2713 argv[0]);
2714 usage(argv[0]);
2715 return 0;
2716 #endif
2717 g_use_uring = true;
2718 break;
2719 case PERF_LOG_FLAG:
2720 rc = spdk_log_set_flag(optarg);
2721 if (rc < 0) {
2722 fprintf(stderr, "unknown flag\n");
2723 usage(argv[0]);
2724 exit(EXIT_FAILURE);
2725 }
2726 #ifdef DEBUG
2727 spdk_log_set_print_level(SPDK_LOG_DEBUG);
2728 #endif
2729 break;
2730 case PERF_ENABLE_VMD:
2731 g_vmd = true;
2732 break;
2733 case PERF_DISABLE_KTLS:
2734 ssl_used = true;
2735 perf_set_sock_opts("ssl", "ktls", 0, NULL);
2736 break;
2737 case PERF_ENABLE_KTLS:
2738 ssl_used = true;
2739 perf_set_sock_opts("ssl", "ktls", 1, NULL);
2740
break; 2741 case PERF_TLS_VERSION: 2742 ssl_used = true; 2743 val = spdk_strtol(optarg, 10); 2744 if (val < 0) { 2745 fprintf(stderr, "Illegal tls version value %s\n", optarg); 2746 return val; 2747 } 2748 perf_set_sock_opts("ssl", "tls_version", val, NULL); 2749 break; 2750 case PERF_PSK_PATH: 2751 ssl_used = true; 2752 free_key(&g_psk); 2753 g_psk = alloc_key("perf-psk", optarg); 2754 if (g_psk == NULL) { 2755 fprintf(stderr, "Unable to set PSK at %s\n", optarg); 2756 return 1; 2757 } 2758 break; 2759 case PERF_PSK_IDENTITY: 2760 ssl_used = true; 2761 perf_set_sock_opts("ssl", "psk_identity", 0, optarg); 2762 break; 2763 case PERF_DISABLE_ZCOPY: 2764 perf_set_sock_opts(optarg, "enable_zerocopy_send_client", 0, NULL); 2765 break; 2766 case PERF_ENABLE_ZCOPY: 2767 perf_set_sock_opts(optarg, "enable_zerocopy_send_client", 1, NULL); 2768 break; 2769 case PERF_USE_EVERY_CORE: 2770 g_use_every_core = true; 2771 break; 2772 case PERF_DEFAULT_SOCK_IMPL: 2773 sock_impl = optarg; 2774 rc = spdk_sock_set_default_impl(optarg); 2775 if (rc) { 2776 fprintf(stderr, "Failed to set sock impl %s, err %d (%s)\n", optarg, errno, strerror(errno)); 2777 return 1; 2778 } 2779 break; 2780 case PERF_TRANSPORT_STATISTICS: 2781 g_dump_transport_stats = true; 2782 break; 2783 case PERF_IOVA_MODE: 2784 env_opts->iova_mode = optarg; 2785 break; 2786 case PERF_SOCK_IMPL: 2787 g_sock_threshold_impl = optarg; 2788 break; 2789 case PERF_TRANSPORT_TOS: 2790 val = spdk_strtol(optarg, 10); 2791 if (val < 0) { 2792 fprintf(stderr, "Invalid TOS value\n"); 2793 return 1; 2794 } 2795 g_transport_tos = val; 2796 break; 2797 case PERF_NO_HUGE: 2798 env_opts->no_huge = true; 2799 break; 2800 case PERF_HELP: 2801 usage(argv[0]); 2802 return HELP_RETURN_CODE; 2803 default: 2804 usage(argv[0]); 2805 return 1; 2806 } 2807 } 2808 2809 if (!g_nr_io_queues_per_ns) { 2810 usage(argv[0]); 2811 return 1; 2812 } 2813 2814 if (!g_queue_depth) { 2815 fprintf(stderr, "missing -q (--io-depth) operand\n"); 2816 usage(argv[0]); 2817 return 1; 2818 } 2819 if (!g_io_size_bytes) { 2820 fprintf(stderr, "missing -o (--io-size) operand\n"); 2821 usage(argv[0]); 2822 return 1; 2823 } 2824 if (!g_io_unit_size || g_io_unit_size % 4) { 2825 fprintf(stderr, "io unit size can not be 0 or non 4-byte aligned\n"); 2826 return 1; 2827 } 2828 if (!g_workload_type) { 2829 fprintf(stderr, "missing -w (--io-pattern) operand\n"); 2830 usage(argv[0]); 2831 return 1; 2832 } 2833 if (!g_time_in_sec) { 2834 fprintf(stderr, "missing -t (--time) operand\n"); 2835 usage(argv[0]); 2836 return 1; 2837 } 2838 if (!g_quiet_count) { 2839 fprintf(stderr, "-Q (--continue-on-error) value must be greater than 0\n"); 2840 usage(argv[0]); 2841 return 1; 2842 } 2843 2844 if (strncmp(g_workload_type, "rand", 4) == 0) { 2845 g_is_random = 1; 2846 g_workload_type = &g_workload_type[4]; 2847 } 2848 2849 if (ssl_used && strncmp(sock_impl, "ssl", 3) != 0) { 2850 fprintf(stderr, "sock impl is not SSL but tried to use one of the SSL only options\n"); 2851 usage(argv[0]); 2852 return 1; 2853 } 2854 2855 2856 if (strcmp(g_workload_type, "read") == 0 || strcmp(g_workload_type, "write") == 0) { 2857 g_rw_percentage = strcmp(g_workload_type, "read") == 0 ? 100 : 0; 2858 if (g_mix_specified) { 2859 fprintf(stderr, "Ignoring -M (--rwmixread) option... 
Please use -M option" 2860 " only when using rw or randrw.\n"); 2861 } 2862 } else if (strcmp(g_workload_type, "rw") == 0) { 2863 if (g_rw_percentage < 0 || g_rw_percentage > 100) { 2864 fprintf(stderr, 2865 "-M (--rwmixread) must be specified to value from 0 to 100 " 2866 "for rw or randrw.\n"); 2867 return 1; 2868 } 2869 } else { 2870 fprintf(stderr, 2871 "-w (--io-pattern) io pattern type must be one of\n" 2872 "(read, write, randread, randwrite, rw, randrw)\n"); 2873 return 1; 2874 } 2875 2876 if (g_sock_zcopy_threshold > 0) { 2877 if (!g_sock_threshold_impl) { 2878 fprintf(stderr, 2879 "--zerocopy-threshold must be set with sock implementation specified(--zerocopy-threshold-sock-impl <impl>)\n"); 2880 return 1; 2881 } 2882 2883 perf_set_sock_opts(g_sock_threshold_impl, "zerocopy_threshold", g_sock_zcopy_threshold, NULL); 2884 } 2885 2886 if (g_number_ios && g_warmup_time_in_sec) { 2887 fprintf(stderr, "-d (--number-ios) with -a (--warmup-time) is not supported\n"); 2888 return 1; 2889 } 2890 2891 if (g_number_ios && g_number_ios < g_queue_depth) { 2892 fprintf(stderr, "-d (--number-ios) less than -q (--io-depth) is not supported\n"); 2893 return 1; 2894 } 2895 2896 if (g_rdma_srq_size != 0) { 2897 struct spdk_nvme_transport_opts opts; 2898 2899 spdk_nvme_transport_get_opts(&opts, sizeof(opts)); 2900 opts.rdma_srq_size = g_rdma_srq_size; 2901 2902 rc = spdk_nvme_transport_set_opts(&opts, sizeof(opts)); 2903 if (rc != 0) { 2904 fprintf(stderr, "Failed to set NVMe transport options.\n"); 2905 return 1; 2906 } 2907 } 2908 2909 if (TAILQ_EMPTY(&g_trid_list)) { 2910 /* If no transport IDs specified, default to enumerating all local PCIe devices */ 2911 add_trid("trtype:PCIe"); 2912 } else { 2913 struct trid_entry *trid_entry, *trid_entry_tmp; 2914 2915 env_opts->no_pci = true; 2916 /* check whether there is local PCIe type */ 2917 TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, trid_entry_tmp) { 2918 if (trid_entry->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 2919 env_opts->no_pci = false; 2920 break; 2921 } 2922 } 2923 } 2924 2925 g_file_optind = optind; 2926 2927 return 0; 2928 } 2929 2930 static int 2931 register_workers(void) 2932 { 2933 uint32_t i; 2934 struct worker_thread *worker; 2935 2936 SPDK_ENV_FOREACH_CORE(i) { 2937 worker = calloc(1, sizeof(*worker)); 2938 if (worker == NULL) { 2939 fprintf(stderr, "Unable to allocate worker\n"); 2940 return -1; 2941 } 2942 2943 TAILQ_INIT(&worker->ns_ctx); 2944 worker->lcore = i; 2945 TAILQ_INSERT_TAIL(&g_workers, worker, link); 2946 g_num_workers++; 2947 } 2948 2949 return 0; 2950 } 2951 2952 static void 2953 unregister_workers(void) 2954 { 2955 struct worker_thread *worker, *tmp_worker; 2956 struct ns_worker_ctx *ns_ctx, *tmp_ns_ctx; 2957 2958 /* Free namespace context and worker thread */ 2959 TAILQ_FOREACH_SAFE(worker, &g_workers, link, tmp_worker) { 2960 TAILQ_REMOVE(&g_workers, worker, link); 2961 2962 TAILQ_FOREACH_SAFE(ns_ctx, &worker->ns_ctx, link, tmp_ns_ctx) { 2963 TAILQ_REMOVE(&worker->ns_ctx, ns_ctx, link); 2964 spdk_histogram_data_free(ns_ctx->histogram); 2965 free(ns_ctx); 2966 } 2967 2968 free(worker); 2969 } 2970 } 2971 2972 static bool 2973 probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 2974 struct spdk_nvme_ctrlr_opts *opts) 2975 { 2976 struct trid_entry *trid_entry = cb_ctx; 2977 2978 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2979 if (g_disable_sq_cmb) { 2980 opts->use_cmb_sqs = false; 2981 } 2982 if (g_no_shn_notification) { 2983 opts->no_shn_notification = true; 2984 } 2985 } 2986 2987 if 
(trid->trtype != trid_entry->trid.trtype && 2988 strcasecmp(trid->trstring, trid_entry->trid.trstring)) { 2989 return false; 2990 } 2991 2992 opts->io_queue_size = g_io_queue_size; 2993 2994 /* Set the header and data_digest */ 2995 opts->header_digest = g_header_digest; 2996 opts->data_digest = g_data_digest; 2997 opts->keep_alive_timeout_ms = g_keep_alive_timeout_in_ms; 2998 opts->tls_psk = g_psk; 2999 memcpy(opts->hostnqn, trid_entry->hostnqn, sizeof(opts->hostnqn)); 3000 3001 opts->transport_tos = g_transport_tos; 3002 if (opts->num_io_queues < g_num_workers * g_nr_io_queues_per_ns) { 3003 opts->num_io_queues = g_num_workers * g_nr_io_queues_per_ns; 3004 } 3005 3006 return true; 3007 } 3008 3009 static void 3010 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 3011 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 3012 { 3013 struct trid_entry *trid_entry = cb_ctx; 3014 struct spdk_pci_addr pci_addr; 3015 struct spdk_pci_device *pci_dev; 3016 struct spdk_pci_id pci_id; 3017 3018 if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) { 3019 printf("Attached to NVMe over Fabrics controller at %s:%s: %s\n", 3020 trid->traddr, trid->trsvcid, 3021 trid->subnqn); 3022 } else { 3023 if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) { 3024 return; 3025 } 3026 3027 pci_dev = spdk_nvme_ctrlr_get_pci_device(ctrlr); 3028 if (!pci_dev) { 3029 return; 3030 } 3031 3032 pci_id = spdk_pci_device_get_id(pci_dev); 3033 3034 printf("Attached to NVMe Controller at %s [%04x:%04x]\n", 3035 trid->traddr, 3036 pci_id.vendor_id, pci_id.device_id); 3037 } 3038 3039 register_ctrlr(ctrlr, trid_entry); 3040 } 3041 3042 static int 3043 register_controllers(void) 3044 { 3045 struct trid_entry *trid_entry; 3046 3047 printf("Initializing NVMe Controllers\n"); 3048 3049 if (g_vmd && spdk_vmd_init()) { 3050 fprintf(stderr, "Failed to initialize VMD." 
3051 " Some NVMe devices can be unavailable.\n"); 3052 } 3053 3054 TAILQ_FOREACH(trid_entry, &g_trid_list, tailq) { 3055 if (spdk_nvme_probe(&trid_entry->trid, trid_entry, probe_cb, attach_cb, NULL) != 0) { 3056 fprintf(stderr, "spdk_nvme_probe() failed for transport address '%s'\n", 3057 trid_entry->trid.traddr); 3058 return -1; 3059 } 3060 } 3061 3062 return 0; 3063 } 3064 3065 static void 3066 unregister_controllers(void) 3067 { 3068 struct ctrlr_entry *entry, *tmp; 3069 struct spdk_nvme_detach_ctx *detach_ctx = NULL; 3070 3071 TAILQ_FOREACH_SAFE(entry, &g_controllers, link, tmp) { 3072 TAILQ_REMOVE(&g_controllers, entry, link); 3073 3074 spdk_dma_free(entry->latency_page); 3075 if (g_latency_ssd_tracking_enable && 3076 spdk_nvme_ctrlr_is_feature_supported(entry->ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) { 3077 set_latency_tracking_feature(entry->ctrlr, false); 3078 } 3079 3080 if (g_nr_unused_io_queues) { 3081 int i; 3082 3083 for (i = 0; i < g_nr_unused_io_queues; i++) { 3084 spdk_nvme_ctrlr_free_io_qpair(entry->unused_qpairs[i]); 3085 } 3086 3087 free(entry->unused_qpairs); 3088 } 3089 3090 spdk_nvme_detach_async(entry->ctrlr, &detach_ctx); 3091 free(entry); 3092 } 3093 3094 if (detach_ctx) { 3095 spdk_nvme_detach_poll(detach_ctx); 3096 } 3097 3098 if (g_vmd) { 3099 spdk_vmd_fini(); 3100 } 3101 } 3102 3103 static int 3104 allocate_ns_worker(struct ns_entry *entry, struct worker_thread *worker) 3105 { 3106 struct ns_worker_ctx *ns_ctx; 3107 3108 ns_ctx = calloc(1, sizeof(struct ns_worker_ctx)); 3109 if (!ns_ctx) { 3110 return -1; 3111 } 3112 3113 printf("Associating %s with lcore %d\n", entry->name, worker->lcore); 3114 ns_ctx->stats.min_tsc = UINT64_MAX; 3115 ns_ctx->entry = entry; 3116 ns_ctx->histogram = spdk_histogram_data_alloc(); 3117 TAILQ_INSERT_TAIL(&worker->ns_ctx, ns_ctx, link); 3118 3119 return 0; 3120 } 3121 3122 static int 3123 associate_workers_with_ns(void) 3124 { 3125 struct ns_entry *entry = TAILQ_FIRST(&g_namespaces); 3126 struct worker_thread *worker = TAILQ_FIRST(&g_workers); 3127 int i, count; 3128 3129 /* Each core contains single worker, and namespaces are associated as follows: 3130 * --use-every-core not specified (default): 3131 * 2) equal workers and namespaces - each worker associated with single namespace 3132 * 3) more workers than namespaces - each namespace is associated with one or more workers 3133 * 4) more namespaces than workers - each worker is associated with one or more namespaces 3134 * --use-every-core option enabled - every worker is associated with all namespaces 3135 */ 3136 if (g_use_every_core) { 3137 TAILQ_FOREACH(worker, &g_workers, link) { 3138 TAILQ_FOREACH(entry, &g_namespaces, link) { 3139 if (allocate_ns_worker(entry, worker) != 0) { 3140 return -1; 3141 } 3142 } 3143 } 3144 return 0; 3145 } 3146 3147 count = g_num_namespaces > g_num_workers ? 
g_num_namespaces : g_num_workers; 3148 3149 for (i = 0; i < count; i++) { 3150 if (entry == NULL) { 3151 break; 3152 } 3153 3154 if (allocate_ns_worker(entry, worker) != 0) { 3155 return -1; 3156 } 3157 3158 worker = TAILQ_NEXT(worker, link); 3159 if (worker == NULL) { 3160 worker = TAILQ_FIRST(&g_workers); 3161 } 3162 3163 entry = TAILQ_NEXT(entry, link); 3164 if (entry == NULL) { 3165 entry = TAILQ_FIRST(&g_namespaces); 3166 } 3167 3168 } 3169 3170 return 0; 3171 } 3172 3173 static void * 3174 nvme_poll_ctrlrs(void *arg) 3175 { 3176 struct ctrlr_entry *entry; 3177 int oldstate; 3178 int rc; 3179 3180 spdk_unaffinitize_thread(); 3181 3182 while (true) { 3183 pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate); 3184 3185 TAILQ_FOREACH(entry, &g_controllers, link) { 3186 if (entry->trtype != SPDK_NVME_TRANSPORT_PCIE) { 3187 rc = spdk_nvme_ctrlr_process_admin_completions(entry->ctrlr); 3188 if (spdk_unlikely(rc < 0 && !g_exit)) { 3189 g_exit = true; 3190 } 3191 } 3192 } 3193 3194 pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate); 3195 3196 /* This is a pthread cancellation point and cannot be removed. */ 3197 sleep(1); 3198 } 3199 3200 return NULL; 3201 } 3202 3203 static void 3204 sig_handler(int signo) 3205 { 3206 g_exit = true; 3207 } 3208 3209 static int 3210 setup_sig_handlers(void) 3211 { 3212 struct sigaction sigact = {}; 3213 int rc; 3214 3215 sigemptyset(&sigact.sa_mask); 3216 sigact.sa_handler = sig_handler; 3217 rc = sigaction(SIGINT, &sigact, NULL); 3218 if (rc < 0) { 3219 fprintf(stderr, "sigaction(SIGINT) failed, errno %d (%s)\n", errno, strerror(errno)); 3220 return -1; 3221 } 3222 3223 rc = sigaction(SIGTERM, &sigact, NULL); 3224 if (rc < 0) { 3225 fprintf(stderr, "sigaction(SIGTERM) failed, errno %d (%s)\n", errno, strerror(errno)); 3226 return -1; 3227 } 3228 3229 return 0; 3230 } 3231 3232 int 3233 main(int argc, char **argv) 3234 { 3235 int rc; 3236 struct worker_thread *worker, *main_worker; 3237 struct ns_worker_ctx *ns_ctx; 3238 struct spdk_env_opts opts; 3239 pthread_t thread_id = 0; 3240 3241 /* Use the runtime PID to set the random seed */ 3242 srand(getpid()); 3243 3244 opts.opts_size = sizeof(opts); 3245 spdk_env_opts_init(&opts); 3246 opts.name = "perf"; 3247 opts.pci_allowed = g_allowed_pci_addr; 3248 rc = parse_args(argc, argv, &opts); 3249 if (rc != 0 || rc == HELP_RETURN_CODE) { 3250 free_key(&g_psk); 3251 if (rc == HELP_RETURN_CODE) { 3252 return 0; 3253 } 3254 3255 return rc; 3256 } 3257 /* Transport statistics are printed from each thread. 
3258 * To avoid mess in terminal, init and use mutex */ 3259 rc = pthread_mutex_init(&g_stats_mutex, NULL); 3260 if (rc != 0) { 3261 fprintf(stderr, "Failed to init mutex\n"); 3262 free_key(&g_psk); 3263 return -1; 3264 } 3265 if (spdk_env_init(&opts) < 0) { 3266 fprintf(stderr, "Unable to initialize SPDK env\n"); 3267 unregister_trids(); 3268 pthread_mutex_destroy(&g_stats_mutex); 3269 free_key(&g_psk); 3270 return -1; 3271 } 3272 3273 rc = spdk_keyring_init(); 3274 if (rc != 0) { 3275 fprintf(stderr, "Unable to initialize keyring: %s\n", spdk_strerror(-rc)); 3276 unregister_trids(); 3277 pthread_mutex_destroy(&g_stats_mutex); 3278 free_key(&g_psk); 3279 spdk_env_fini(); 3280 return -1; 3281 } 3282 3283 rc = setup_sig_handlers(); 3284 if (rc != 0) { 3285 rc = -1; 3286 goto cleanup; 3287 } 3288 3289 g_tsc_rate = spdk_get_ticks_hz(); 3290 3291 if (register_workers() != 0) { 3292 rc = -1; 3293 goto cleanup; 3294 } 3295 3296 #if defined(HAVE_LIBAIO) || defined(SPDK_CONFIG_URING) 3297 if (register_files(argc, argv) != 0) { 3298 rc = -1; 3299 goto cleanup; 3300 } 3301 #endif 3302 3303 if (register_controllers() != 0) { 3304 rc = -1; 3305 goto cleanup; 3306 } 3307 3308 if (g_warn) { 3309 printf("WARNING: Some requested NVMe devices were skipped\n"); 3310 } 3311 3312 if (g_num_namespaces == 0) { 3313 fprintf(stderr, "No valid NVMe controllers or AIO or URING devices found\n"); 3314 goto cleanup; 3315 } 3316 3317 if (g_num_workers > 1 && g_quiet_count > 1) { 3318 fprintf(stderr, "Error message rate-limiting enabled across multiple threads.\n"); 3319 fprintf(stderr, "Error suppression count may not be exact.\n"); 3320 } 3321 3322 rc = pthread_create(&thread_id, NULL, &nvme_poll_ctrlrs, NULL); 3323 if (rc != 0) { 3324 fprintf(stderr, "Unable to spawn a thread to poll admin queues.\n"); 3325 goto cleanup; 3326 } 3327 3328 if (associate_workers_with_ns() != 0) { 3329 rc = -1; 3330 goto cleanup; 3331 } 3332 3333 rc = pthread_barrier_init(&g_worker_sync_barrier, NULL, g_num_workers); 3334 if (rc != 0) { 3335 fprintf(stderr, "Unable to initialize thread sync barrier\n"); 3336 goto cleanup; 3337 } 3338 3339 printf("Initialization complete. Launching workers.\n"); 3340 3341 /* Launch all of the secondary workers */ 3342 g_main_core = spdk_env_get_current_core(); 3343 main_worker = NULL; 3344 TAILQ_FOREACH(worker, &g_workers, link) { 3345 if (worker->lcore != g_main_core) { 3346 spdk_env_thread_launch_pinned(worker->lcore, work_fn, worker); 3347 } else { 3348 assert(main_worker == NULL); 3349 main_worker = worker; 3350 } 3351 } 3352 3353 assert(main_worker != NULL); 3354 work_fn(main_worker); 3355 3356 spdk_env_thread_wait_all(); 3357 3358 print_stats(); 3359 3360 pthread_barrier_destroy(&g_worker_sync_barrier); 3361 3362 cleanup: 3363 fflush(stdout); 3364 3365 if (thread_id && pthread_cancel(thread_id) == 0) { 3366 pthread_join(thread_id, NULL); 3367 } 3368 3369 /* Collect errors from all workers and namespaces */ 3370 TAILQ_FOREACH(worker, &g_workers, link) { 3371 if (rc != 0) { 3372 break; 3373 } 3374 3375 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 3376 if (ns_ctx->status != 0) { 3377 rc = ns_ctx->status; 3378 break; 3379 } 3380 } 3381 } 3382 3383 unregister_trids(); 3384 unregister_namespaces(); 3385 unregister_controllers(); 3386 unregister_workers(); 3387 3388 free_key(&g_psk); 3389 spdk_keyring_cleanup(); 3390 spdk_env_fini(); 3391 3392 pthread_mutex_destroy(&g_stats_mutex); 3393 3394 if (rc != 0) { 3395 fprintf(stderr, "%s: errors occurred\n", argv[0]); 3396 } 3397 3398 return rc; 3399 } 3400
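/*
 * Illustrative usage sketch (editor's note, not part of the original source):
 * a typical local PCIe run built from the BASIC OPTIONS documented in usage()
 * above. The binary name and PCI address are placeholders.
 *
 *   ./perf -q 128 -o 4096 -w randrw -M 50 -t 60 -c 0x3 \
 *          -r 'trtype:PCIe traddr:0000:04:00.0'
 *
 * This submits 4 KiB I/O at queue depth 128 with a 50/50 read/write mix for
 * 60 seconds from cores 0 and 1 (core mask 0x3). Note that -M is required
 * whenever the rw or randrw pattern is selected.
 */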