/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2015 Intel Corporation.
 * All rights reserved.
 *
 * Copyright (c) 2019-2021 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/fd.h"
#include "spdk/nvme.h"
#include "spdk/vmd.h"
#include "spdk/queue.h"
#include "spdk/string.h"
#include "spdk/nvme_intel.h"
#include "spdk/histogram_data.h"
#include "spdk/endian.h"
#include "spdk/dif.h"
#include "spdk/util.h"
#include "spdk/log.h"
#include "spdk/likely.h"
#include "spdk/sock.h"
#include "spdk/zipf.h"
#include "spdk/nvmf.h"

#ifdef SPDK_CONFIG_URING
#include <liburing.h>
#endif

#if HAVE_LIBAIO
#include <libaio.h>
#endif

#define HELP_RETURN_CODE UINT16_MAX

struct ctrlr_entry {
	struct spdk_nvme_ctrlr *ctrlr;
	enum spdk_nvme_transport_type trtype;
	struct spdk_nvme_intel_rw_latency_page *latency_page;

	struct spdk_nvme_qpair **unused_qpairs;

	TAILQ_ENTRY(ctrlr_entry) link;
	char name[1024];
};

enum entry_type {
	ENTRY_TYPE_NVME_NS,
	ENTRY_TYPE_AIO_FILE,
	ENTRY_TYPE_URING_FILE,
};

struct ns_fn_table;

struct ns_entry {
	enum entry_type type;
	const struct ns_fn_table *fn_table;

	union {
		struct {
			struct spdk_nvme_ctrlr *ctrlr;
			struct spdk_nvme_ns *ns;
		} nvme;
#ifdef SPDK_CONFIG_URING
		struct {
			int fd;
		} uring;
#endif
#if HAVE_LIBAIO
		struct {
			int fd;
		} aio;
#endif
	} u;

	TAILQ_ENTRY(ns_entry) link;
	uint32_t io_size_blocks;
	uint32_t num_io_requests;
	uint64_t size_in_ios;
	uint32_t block_size;
	uint32_t md_size;
	bool md_interleave;
	unsigned int seed;
	struct spdk_zipf *zipf;
	bool pi_loc;
	enum spdk_nvme_pi_type pi_type;
	uint32_t io_flags;
	char name[1024];
};

static const double g_latency_cutoffs[] = {
	0.01,
	0.10,
	0.25,
	0.50,
	0.75,
	0.90,
	0.95,
	0.98,
	0.99,
	0.995,
	0.999,
	0.9999,
	0.99999,
	0.999999,
	0.9999999,
	-1,
};

struct ns_worker_stats {
	uint64_t io_submitted;
	uint64_t io_completed;
	uint64_t last_io_completed;
	uint64_t total_tsc;
	uint64_t min_tsc;
	uint64_t max_tsc;
	uint64_t last_tsc;
	uint64_t busy_tsc;
	uint64_t idle_tsc;
	uint64_t last_busy_tsc;
	uint64_t last_idle_tsc;
};

struct ns_worker_ctx {
	struct ns_entry *entry;
	struct ns_worker_stats stats;
	uint64_t current_queue_depth;
	uint64_t offset_in_ios;
	bool is_draining;

	union {
		struct {
			int num_active_qpairs;
			int num_all_qpairs;
			struct spdk_nvme_qpair **qpair;
			struct spdk_nvme_poll_group *group;
			int last_qpair;
		} nvme;

#ifdef SPDK_CONFIG_URING
		struct {
			struct io_uring ring;
			uint64_t io_inflight;
			uint64_t io_pending;
			struct io_uring_cqe **cqes;
		} uring;
#endif
#if HAVE_LIBAIO
		struct {
			struct io_event *events;
			io_context_t ctx;
		} aio;
#endif
	} u;

	TAILQ_ENTRY(ns_worker_ctx) link;

	TAILQ_HEAD(, perf_task) queued_tasks;

	struct spdk_histogram_data *histogram;
	int status;
};

struct perf_task {
	struct ns_worker_ctx *ns_ctx;
	struct iovec *iovs; /* array of iovecs to transfer. */
	int iovcnt; /* Number of iovecs in iovs array. */
	int iovpos; /* Current iovec position. */
	uint32_t iov_offset; /* Offset in current iovec. */
	struct iovec md_iov;
	uint64_t submit_tsc;
	bool is_read;
	struct spdk_dif_ctx dif_ctx;
#if HAVE_LIBAIO
	struct iocb iocb;
#endif
	TAILQ_ENTRY(perf_task) link;
};

struct worker_thread {
	TAILQ_HEAD(, ns_worker_ctx) ns_ctx;
	TAILQ_ENTRY(worker_thread) link;
	unsigned lcore;
};

struct ns_fn_table {
	void (*setup_payload)(struct perf_task *task, uint8_t pattern);

	int (*submit_io)(struct perf_task *task, struct ns_worker_ctx *ns_ctx,
			 struct ns_entry *entry, uint64_t offset_in_ios);

	int64_t (*check_io)(struct ns_worker_ctx *ns_ctx);

	void (*verify_io)(struct perf_task *task, struct ns_entry *entry);

	int (*init_ns_worker_ctx)(struct ns_worker_ctx *ns_ctx);

	void (*cleanup_ns_worker_ctx)(struct ns_worker_ctx *ns_ctx);
	void (*dump_transport_stats)(uint32_t lcore, struct ns_worker_ctx *ns_ctx);
};

static uint32_t g_io_unit_size = (UINT32_MAX & (~0x03));

static int g_outstanding_commands;

static bool g_latency_ssd_tracking_enable;
static int g_latency_sw_tracking_level;

static bool g_vmd;
static const char *g_workload_type;
static TAILQ_HEAD(, ctrlr_entry) g_controllers = TAILQ_HEAD_INITIALIZER(g_controllers);
static TAILQ_HEAD(, ns_entry) g_namespaces = TAILQ_HEAD_INITIALIZER(g_namespaces);
static uint32_t g_num_namespaces;
static TAILQ_HEAD(, worker_thread) g_workers = TAILQ_HEAD_INITIALIZER(g_workers);
static uint32_t g_num_workers = 0;
static bool g_use_every_core = false;
static uint32_t g_main_core;
static pthread_barrier_t g_worker_sync_barrier;

static uint64_t g_tsc_rate;

static bool g_monitor_perf_cores = false;

static uint32_t g_io_align = 0x200;
static bool g_io_align_specified;
static uint32_t g_io_size_bytes;
static uint32_t g_max_io_md_size;
static uint32_t g_max_io_size_blocks;
static uint32_t g_metacfg_pract_flag;
static uint32_t g_metacfg_prchk_flags;
static int g_rw_percentage = -1;
static int g_is_random;
static uint32_t g_queue_depth;
static int g_nr_io_queues_per_ns = 1;
static int g_nr_unused_io_queues;
static int g_time_in_sec;
static uint64_t g_number_ios;
static uint64_t g_elapsed_time_in_usec;
static int g_warmup_time_in_sec;
static uint32_t g_max_completions;
static uint32_t g_disable_sq_cmb;
static bool g_use_uring;
static bool g_warn;
static bool g_header_digest;
static bool g_data_digest;
static bool g_no_shn_notification;
static bool g_mix_specified;
/* The flag is used to exit the program when keep alive fails on the transport */
static bool g_exit;
/* Default to 10 seconds for the keep alive value. This value is arbitrary. */
static uint32_t g_keep_alive_timeout_in_ms = 10000;
static bool g_continue_on_error = false;
static uint32_t g_quiet_count = 1;
static double g_zipf_theta;
/* Set default io_queue_size to UINT16_MAX, NVMe driver will then reduce this
 * to MQES to maximize the io_queue_size as much as possible.
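 * (MQES is the Maximum Queue Entries Supported field in the controller's CAP
 * register; the largest queue the driver can actually create is MQES + 1 entries.)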
 */
static uint32_t g_io_queue_size = UINT16_MAX;

static uint32_t g_sock_zcopy_threshold;
static char *g_sock_threshold_impl;

static uint8_t g_transport_tos = 0;

static uint32_t g_rdma_srq_size;
uint8_t *g_psk = NULL;

/* When user specifies -Q, some error messages are rate limited. When rate
 * limited, we only print the error message every g_quiet_count times the
 * error occurs.
 *
 * Note: the __count is not thread safe, meaning the rate limiting will not
 * be exact when running perf with multiple threads with lots of errors.
 * Thread-local __count would mean rate-limiting per thread which doesn't
 * seem as useful.
 */
#define RATELIMIT_LOG(...) \
	{ \
		static uint64_t __count = 0; \
		if ((__count % g_quiet_count) == 0) { \
			if (__count > 0 && g_quiet_count > 1) { \
				fprintf(stderr, "Message suppressed %" PRIu32 " times: ", \
					g_quiet_count - 1); \
			} \
			fprintf(stderr, __VA_ARGS__); \
		} \
		__count++; \
	}

static bool g_dump_transport_stats;
static pthread_mutex_t g_stats_mutex;

#define MAX_ALLOWED_PCI_DEVICE_NUM 128
static struct spdk_pci_addr g_allowed_pci_addr[MAX_ALLOWED_PCI_DEVICE_NUM];

struct trid_entry {
	struct spdk_nvme_transport_id trid;
	uint16_t nsid;
	char hostnqn[SPDK_NVMF_NQN_MAX_LEN + 1];
	TAILQ_ENTRY(trid_entry) tailq;
};

static TAILQ_HEAD(, trid_entry) g_trid_list = TAILQ_HEAD_INITIALIZER(g_trid_list);

static int g_file_optind; /* Index of first filename in argv */

static inline void task_complete(struct perf_task *task);

static void
perf_set_sock_opts(const char *impl_name, const char *field, uint32_t val, const char *valstr)
{
	struct spdk_sock_impl_opts sock_opts = {};
	size_t opts_size = sizeof(sock_opts);
	int rc;

	rc = spdk_sock_impl_get_opts(impl_name, &sock_opts, &opts_size);
	if (rc != 0) {
		if (errno == EINVAL) {
			fprintf(stderr, "Unknown sock impl %s\n", impl_name);
		} else {
			fprintf(stderr, "Failed to get opts for sock impl %s: error %d (%s)\n", impl_name, errno,
				strerror(errno));
		}
		return;
	}

	if (opts_size != sizeof(sock_opts)) {
		fprintf(stderr, "Warning: sock_opts size mismatch. 
Expected %zu, received %zu\n", 333 sizeof(sock_opts), opts_size); 334 opts_size = sizeof(sock_opts); 335 } 336 337 if (!field) { 338 fprintf(stderr, "Warning: no socket opts field specified\n"); 339 return; 340 } else if (strcmp(field, "enable_zerocopy_send_client") == 0) { 341 sock_opts.enable_zerocopy_send_client = val; 342 } else if (strcmp(field, "tls_version") == 0) { 343 sock_opts.tls_version = val; 344 } else if (strcmp(field, "ktls") == 0) { 345 sock_opts.enable_ktls = val; 346 } else if (strcmp(field, "psk_path") == 0) { 347 if (!valstr) { 348 fprintf(stderr, "No socket opts value specified\n"); 349 return; 350 } 351 g_psk = calloc(1, SPDK_TLS_PSK_MAX_LEN + 1); 352 if (g_psk == NULL) { 353 fprintf(stderr, "Failed to allocate memory for psk\n"); 354 return; 355 } 356 FILE *psk_file = fopen(valstr, "r"); 357 if (psk_file == NULL) { 358 fprintf(stderr, "Could not open PSK file\n"); 359 return; 360 } 361 if (fscanf(psk_file, "%" SPDK_STRINGIFY(SPDK_TLS_PSK_MAX_LEN) "s", g_psk) != 1) { 362 fprintf(stderr, "Could not retrieve PSK from file\n"); 363 fclose(psk_file); 364 return; 365 } 366 if (fclose(psk_file)) { 367 fprintf(stderr, "Failed to close PSK file\n"); 368 return; 369 } 370 } else if (strcmp(field, "zerocopy_threshold") == 0) { 371 sock_opts.zerocopy_threshold = val; 372 } else { 373 fprintf(stderr, "Warning: invalid or unprocessed socket opts field: %s\n", field); 374 return; 375 } 376 377 if (spdk_sock_impl_set_opts(impl_name, &sock_opts, opts_size)) { 378 fprintf(stderr, "Failed to set %s: %d for sock impl %s : error %d (%s)\n", field, val, impl_name, 379 errno, strerror(errno)); 380 } 381 } 382 383 static void 384 nvme_perf_reset_sgl(void *ref, uint32_t sgl_offset) 385 { 386 struct iovec *iov; 387 struct perf_task *task = (struct perf_task *)ref; 388 389 task->iov_offset = sgl_offset; 390 for (task->iovpos = 0; task->iovpos < task->iovcnt; task->iovpos++) { 391 iov = &task->iovs[task->iovpos]; 392 if (task->iov_offset < iov->iov_len) { 393 break; 394 } 395 396 task->iov_offset -= iov->iov_len; 397 } 398 } 399 400 static int 401 nvme_perf_next_sge(void *ref, void **address, uint32_t *length) 402 { 403 struct iovec *iov; 404 struct perf_task *task = (struct perf_task *)ref; 405 406 assert(task->iovpos < task->iovcnt); 407 408 iov = &task->iovs[task->iovpos]; 409 assert(task->iov_offset <= iov->iov_len); 410 411 *address = iov->iov_base + task->iov_offset; 412 *length = iov->iov_len - task->iov_offset; 413 task->iovpos++; 414 task->iov_offset = 0; 415 416 return 0; 417 } 418 419 static int 420 nvme_perf_allocate_iovs(struct perf_task *task, void *buf, uint32_t length) 421 { 422 int iovpos = 0; 423 struct iovec *iov; 424 uint32_t offset = 0; 425 426 task->iovcnt = SPDK_CEIL_DIV(length, (uint64_t)g_io_unit_size); 427 task->iovs = calloc(task->iovcnt, sizeof(struct iovec)); 428 if (!task->iovs) { 429 return -1; 430 } 431 432 while (length > 0) { 433 iov = &task->iovs[iovpos]; 434 iov->iov_len = spdk_min(length, g_io_unit_size); 435 iov->iov_base = buf + offset; 436 length -= iov->iov_len; 437 offset += iov->iov_len; 438 iovpos++; 439 } 440 441 return 0; 442 } 443 444 #ifdef SPDK_CONFIG_URING 445 446 static void 447 uring_setup_payload(struct perf_task *task, uint8_t pattern) 448 { 449 struct iovec *iov; 450 451 task->iovs = calloc(1, sizeof(struct iovec)); 452 if (!task->iovs) { 453 fprintf(stderr, "perf task failed to allocate iovs\n"); 454 exit(1); 455 } 456 task->iovcnt = 1; 457 458 iov = &task->iovs[0]; 459 iov->iov_base = spdk_dma_zmalloc(g_io_size_bytes, g_io_align, NULL); 
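	/* Kernel devices are opened with O_DIRECT (see register_file()), so this buffer
	 * must honor g_io_align, which is raised to the device block size if needed. */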
460 iov->iov_len = g_io_size_bytes; 461 if (iov->iov_base == NULL) { 462 fprintf(stderr, "spdk_dma_zmalloc() for task->iovs[0].iov_base failed\n"); 463 free(task->iovs); 464 exit(1); 465 } 466 memset(iov->iov_base, pattern, iov->iov_len); 467 } 468 469 static int 470 uring_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx, 471 struct ns_entry *entry, uint64_t offset_in_ios) 472 { 473 struct io_uring_sqe *sqe; 474 475 sqe = io_uring_get_sqe(&ns_ctx->u.uring.ring); 476 if (!sqe) { 477 fprintf(stderr, "Cannot get sqe\n"); 478 return -1; 479 } 480 481 if (task->is_read) { 482 io_uring_prep_readv(sqe, entry->u.uring.fd, task->iovs, 1, offset_in_ios * task->iovs[0].iov_len); 483 } else { 484 io_uring_prep_writev(sqe, entry->u.uring.fd, task->iovs, 1, offset_in_ios * task->iovs[0].iov_len); 485 } 486 487 io_uring_sqe_set_data(sqe, task); 488 ns_ctx->u.uring.io_pending++; 489 490 return 0; 491 } 492 493 static int64_t 494 uring_check_io(struct ns_worker_ctx *ns_ctx) 495 { 496 int i, to_complete, to_submit, count = 0, ret = 0; 497 struct perf_task *task; 498 499 to_submit = ns_ctx->u.uring.io_pending; 500 501 if (to_submit > 0) { 502 /* If there are I/O to submit, use io_uring_submit here. 503 * It will automatically call spdk_io_uring_enter appropriately. */ 504 ret = io_uring_submit(&ns_ctx->u.uring.ring); 505 if (ret < 0) { 506 ns_ctx->status = 1; 507 return -1; 508 } 509 ns_ctx->u.uring.io_pending = 0; 510 ns_ctx->u.uring.io_inflight += to_submit; 511 } 512 513 to_complete = ns_ctx->u.uring.io_inflight; 514 if (to_complete > 0) { 515 count = io_uring_peek_batch_cqe(&ns_ctx->u.uring.ring, ns_ctx->u.uring.cqes, to_complete); 516 ns_ctx->u.uring.io_inflight -= count; 517 for (i = 0; i < count; i++) { 518 int res; 519 520 assert(ns_ctx->u.uring.cqes[i] != NULL); 521 task = (struct perf_task *)ns_ctx->u.uring.cqes[i]->user_data; 522 res = ns_ctx->u.uring.cqes[i]->res; 523 if (res != (int)task->iovs[0].iov_len) { 524 fprintf(stderr, "cqe->status=%d, iov_len=%d\n", res, 525 (int)task->iovs[0].iov_len); 526 ns_ctx->status = 1; 527 if (res == -EIO) { 528 /* The block device has been removed. 529 * Stop trying to send I/O to it. 
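 * (io_uring reports the failure as a negative errno value in cqe->res.)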
530 */ 531 ns_ctx->is_draining = true; 532 } 533 } 534 io_uring_cqe_seen(&ns_ctx->u.uring.ring, ns_ctx->u.uring.cqes[i]); 535 task_complete(task); 536 } 537 } 538 return count; 539 } 540 541 static void 542 uring_verify_io(struct perf_task *task, struct ns_entry *entry) 543 { 544 } 545 546 static int 547 uring_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 548 { 549 if (io_uring_queue_init(g_queue_depth, &ns_ctx->u.uring.ring, 0) < 0) { 550 SPDK_ERRLOG("uring I/O context setup failure\n"); 551 return -1; 552 } 553 554 ns_ctx->u.uring.cqes = calloc(g_queue_depth, sizeof(struct io_uring_cqe *)); 555 if (!ns_ctx->u.uring.cqes) { 556 io_uring_queue_exit(&ns_ctx->u.uring.ring); 557 return -1; 558 } 559 560 return 0; 561 } 562 563 static void 564 uring_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 565 { 566 io_uring_queue_exit(&ns_ctx->u.uring.ring); 567 free(ns_ctx->u.uring.cqes); 568 } 569 570 static const struct ns_fn_table uring_fn_table = { 571 .setup_payload = uring_setup_payload, 572 .submit_io = uring_submit_io, 573 .check_io = uring_check_io, 574 .verify_io = uring_verify_io, 575 .init_ns_worker_ctx = uring_init_ns_worker_ctx, 576 .cleanup_ns_worker_ctx = uring_cleanup_ns_worker_ctx, 577 }; 578 579 #endif 580 581 #ifdef HAVE_LIBAIO 582 static void 583 aio_setup_payload(struct perf_task *task, uint8_t pattern) 584 { 585 struct iovec *iov; 586 587 task->iovs = calloc(1, sizeof(struct iovec)); 588 if (!task->iovs) { 589 fprintf(stderr, "perf task failed to allocate iovs\n"); 590 exit(1); 591 } 592 task->iovcnt = 1; 593 594 iov = &task->iovs[0]; 595 iov->iov_base = spdk_dma_zmalloc(g_io_size_bytes, g_io_align, NULL); 596 iov->iov_len = g_io_size_bytes; 597 if (iov->iov_base == NULL) { 598 fprintf(stderr, "spdk_dma_zmalloc() for task->iovs[0].iov_base failed\n"); 599 free(task->iovs); 600 exit(1); 601 } 602 memset(iov->iov_base, pattern, iov->iov_len); 603 } 604 605 static int 606 aio_submit(io_context_t aio_ctx, struct iocb *iocb, int fd, enum io_iocb_cmd cmd, 607 struct iovec *iov, uint64_t offset, void *cb_ctx) 608 { 609 iocb->aio_fildes = fd; 610 iocb->aio_reqprio = 0; 611 iocb->aio_lio_opcode = cmd; 612 iocb->u.c.buf = iov->iov_base; 613 iocb->u.c.nbytes = iov->iov_len; 614 iocb->u.c.offset = offset * iov->iov_len; 615 iocb->data = cb_ctx; 616 617 if (io_submit(aio_ctx, 1, &iocb) < 0) { 618 printf("io_submit"); 619 return -1; 620 } 621 622 return 0; 623 } 624 625 static int 626 aio_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx, 627 struct ns_entry *entry, uint64_t offset_in_ios) 628 { 629 if (task->is_read) { 630 return aio_submit(ns_ctx->u.aio.ctx, &task->iocb, entry->u.aio.fd, IO_CMD_PREAD, 631 task->iovs, offset_in_ios, task); 632 } else { 633 return aio_submit(ns_ctx->u.aio.ctx, &task->iocb, entry->u.aio.fd, IO_CMD_PWRITE, 634 task->iovs, offset_in_ios, task); 635 } 636 } 637 638 static int64_t 639 aio_check_io(struct ns_worker_ctx *ns_ctx) 640 { 641 int count, i; 642 struct timespec timeout; 643 struct perf_task *task; 644 645 timeout.tv_sec = 0; 646 timeout.tv_nsec = 0; 647 648 count = io_getevents(ns_ctx->u.aio.ctx, 1, g_queue_depth, ns_ctx->u.aio.events, &timeout); 649 if (count < 0) { 650 fprintf(stderr, "io_getevents error\n"); 651 ns_ctx->status = 1; 652 return -1; 653 } 654 655 for (i = 0; i < count; i++) { 656 unsigned long res; 657 658 task = (struct perf_task *)ns_ctx->u.aio.events[i].data; 659 res = ns_ctx->u.aio.events[i].res; 660 if (res != (uint64_t)task->iovs[0].iov_len) { 661 fprintf(stderr, "event->res=%ld, iov_len=%lu\n", (long)res, 662 
(uint64_t)task->iovs[0].iov_len); 663 ns_ctx->status = 1; 664 if ((long)res == -EIO) { 665 /* The block device has been removed. Stop trying to send I/O to it. */ 666 ns_ctx->is_draining = true; 667 } 668 } 669 task_complete(ns_ctx->u.aio.events[i].data); 670 } 671 return count; 672 } 673 674 static void 675 aio_verify_io(struct perf_task *task, struct ns_entry *entry) 676 { 677 } 678 679 static int 680 aio_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 681 { 682 ns_ctx->u.aio.events = calloc(g_queue_depth, sizeof(struct io_event)); 683 if (!ns_ctx->u.aio.events) { 684 return -1; 685 } 686 ns_ctx->u.aio.ctx = 0; 687 if (io_setup(g_queue_depth, &ns_ctx->u.aio.ctx) < 0) { 688 free(ns_ctx->u.aio.events); 689 perror("io_setup"); 690 return -1; 691 } 692 return 0; 693 } 694 695 static void 696 aio_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 697 { 698 io_destroy(ns_ctx->u.aio.ctx); 699 free(ns_ctx->u.aio.events); 700 } 701 702 static const struct ns_fn_table aio_fn_table = { 703 .setup_payload = aio_setup_payload, 704 .submit_io = aio_submit_io, 705 .check_io = aio_check_io, 706 .verify_io = aio_verify_io, 707 .init_ns_worker_ctx = aio_init_ns_worker_ctx, 708 .cleanup_ns_worker_ctx = aio_cleanup_ns_worker_ctx, 709 }; 710 711 #endif /* HAVE_LIBAIO */ 712 713 #if defined(HAVE_LIBAIO) || defined(SPDK_CONFIG_URING) 714 715 static int 716 register_file(const char *path) 717 { 718 struct ns_entry *entry; 719 720 int flags, fd; 721 uint64_t size; 722 uint32_t blklen; 723 724 if (g_rw_percentage == 100) { 725 flags = O_RDONLY; 726 } else if (g_rw_percentage == 0) { 727 flags = O_WRONLY; 728 } else { 729 flags = O_RDWR; 730 } 731 732 flags |= O_DIRECT; 733 734 fd = open(path, flags); 735 if (fd < 0) { 736 fprintf(stderr, "Could not open device %s: %s\n", path, strerror(errno)); 737 return -1; 738 } 739 740 size = spdk_fd_get_size(fd); 741 if (size == 0) { 742 fprintf(stderr, "Could not determine size of device %s\n", path); 743 close(fd); 744 return -1; 745 } 746 747 blklen = spdk_fd_get_blocklen(fd); 748 if (blklen == 0) { 749 fprintf(stderr, "Could not determine block size of device %s\n", path); 750 close(fd); 751 return -1; 752 } 753 754 /* 755 * TODO: This should really calculate the LCM of the current g_io_align and blklen. 756 * For now, it's fairly safe to just assume all block sizes are powers of 2. 757 */ 758 if (g_io_align < blklen) { 759 if (g_io_align_specified) { 760 fprintf(stderr, "Wrong IO alignment (%u). 
aio requires block-sized alignment (%u)\n", g_io_align, 761 blklen); 762 close(fd); 763 return -1; 764 } 765 766 g_io_align = blklen; 767 } 768 769 entry = calloc(1, sizeof(struct ns_entry)); 770 if (entry == NULL) { 771 close(fd); 772 perror("ns_entry malloc"); 773 return -1; 774 } 775 776 if (g_use_uring) { 777 #ifdef SPDK_CONFIG_URING 778 entry->type = ENTRY_TYPE_URING_FILE; 779 entry->fn_table = &uring_fn_table; 780 entry->u.uring.fd = fd; 781 #endif 782 } else { 783 #if HAVE_LIBAIO 784 entry->type = ENTRY_TYPE_AIO_FILE; 785 entry->fn_table = &aio_fn_table; 786 entry->u.aio.fd = fd; 787 #endif 788 } 789 entry->size_in_ios = size / g_io_size_bytes; 790 entry->io_size_blocks = g_io_size_bytes / blklen; 791 792 if (g_is_random) { 793 entry->seed = rand(); 794 if (g_zipf_theta > 0) { 795 entry->zipf = spdk_zipf_create(entry->size_in_ios, g_zipf_theta, 0); 796 } 797 } 798 799 snprintf(entry->name, sizeof(entry->name), "%s", path); 800 801 g_num_namespaces++; 802 TAILQ_INSERT_TAIL(&g_namespaces, entry, link); 803 804 return 0; 805 } 806 807 static int 808 register_files(int argc, char **argv) 809 { 810 int i; 811 812 /* Treat everything after the options as files for AIO/URING */ 813 for (i = g_file_optind; i < argc; i++) { 814 if (register_file(argv[i]) != 0) { 815 return 1; 816 } 817 } 818 819 return 0; 820 } 821 #endif 822 823 static void io_complete(void *ctx, const struct spdk_nvme_cpl *cpl); 824 825 static void 826 nvme_setup_payload(struct perf_task *task, uint8_t pattern) 827 { 828 uint32_t max_io_size_bytes, max_io_md_size; 829 void *buf; 830 int rc; 831 832 /* maximum extended lba format size from all active namespace, 833 * it's same with g_io_size_bytes for namespace without metadata. 834 */ 835 max_io_size_bytes = g_io_size_bytes + g_max_io_md_size * g_max_io_size_blocks; 836 buf = spdk_dma_zmalloc(max_io_size_bytes, g_io_align, NULL); 837 if (buf == NULL) { 838 fprintf(stderr, "task->buf spdk_dma_zmalloc failed\n"); 839 exit(1); 840 } 841 memset(buf, pattern, max_io_size_bytes); 842 843 rc = nvme_perf_allocate_iovs(task, buf, max_io_size_bytes); 844 if (rc < 0) { 845 fprintf(stderr, "perf task failed to allocate iovs\n"); 846 spdk_dma_free(buf); 847 exit(1); 848 } 849 850 max_io_md_size = g_max_io_md_size * g_max_io_size_blocks; 851 if (max_io_md_size != 0) { 852 task->md_iov.iov_base = spdk_dma_zmalloc(max_io_md_size, g_io_align, NULL); 853 task->md_iov.iov_len = max_io_md_size; 854 if (task->md_iov.iov_base == NULL) { 855 fprintf(stderr, "task->md_buf spdk_dma_zmalloc failed\n"); 856 spdk_dma_free(task->iovs[0].iov_base); 857 free(task->iovs); 858 exit(1); 859 } 860 } 861 } 862 863 static int 864 nvme_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx, 865 struct ns_entry *entry, uint64_t offset_in_ios) 866 { 867 uint64_t lba; 868 int rc; 869 int qp_num; 870 struct spdk_dif_ctx_init_ext_opts dif_opts; 871 872 enum dif_mode { 873 DIF_MODE_NONE = 0, 874 DIF_MODE_DIF = 1, 875 DIF_MODE_DIX = 2, 876 } mode = DIF_MODE_NONE; 877 878 lba = offset_in_ios * entry->io_size_blocks; 879 880 if (entry->md_size != 0 && !(entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT)) { 881 if (entry->md_interleave) { 882 mode = DIF_MODE_DIF; 883 } else { 884 mode = DIF_MODE_DIX; 885 } 886 } 887 888 qp_num = ns_ctx->u.nvme.last_qpair; 889 ns_ctx->u.nvme.last_qpair++; 890 if (ns_ctx->u.nvme.last_qpair == ns_ctx->u.nvme.num_active_qpairs) { 891 ns_ctx->u.nvme.last_qpair = 0; 892 } 893 894 if (mode != DIF_MODE_NONE) { 895 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 896 
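		/* Only the 16-bit protection information format is exercised by this tool;
		 * namespaces formatted with a different PI format would need a different
		 * dif_pi_format value here. */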
dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 897 rc = spdk_dif_ctx_init(&task->dif_ctx, entry->block_size, entry->md_size, 898 entry->md_interleave, entry->pi_loc, 899 (enum spdk_dif_type)entry->pi_type, entry->io_flags, 900 lba, 0xFFFF, (uint16_t)entry->io_size_blocks, 0, 0, &dif_opts); 901 if (rc != 0) { 902 fprintf(stderr, "Initialization of DIF context failed\n"); 903 exit(1); 904 } 905 } 906 907 if (task->is_read) { 908 if (task->iovcnt == 1) { 909 return spdk_nvme_ns_cmd_read_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair[qp_num], 910 task->iovs[0].iov_base, task->md_iov.iov_base, 911 lba, 912 entry->io_size_blocks, io_complete, 913 task, entry->io_flags, 914 task->dif_ctx.apptag_mask, task->dif_ctx.app_tag); 915 } else { 916 return spdk_nvme_ns_cmd_readv_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair[qp_num], 917 lba, entry->io_size_blocks, 918 io_complete, task, entry->io_flags, 919 nvme_perf_reset_sgl, nvme_perf_next_sge, 920 task->md_iov.iov_base, 921 task->dif_ctx.apptag_mask, task->dif_ctx.app_tag); 922 } 923 } else { 924 switch (mode) { 925 case DIF_MODE_DIF: 926 rc = spdk_dif_generate(task->iovs, task->iovcnt, entry->io_size_blocks, &task->dif_ctx); 927 if (rc != 0) { 928 fprintf(stderr, "Generation of DIF failed\n"); 929 return rc; 930 } 931 break; 932 case DIF_MODE_DIX: 933 rc = spdk_dix_generate(task->iovs, task->iovcnt, &task->md_iov, entry->io_size_blocks, 934 &task->dif_ctx); 935 if (rc != 0) { 936 fprintf(stderr, "Generation of DIX failed\n"); 937 return rc; 938 } 939 break; 940 default: 941 break; 942 } 943 944 if (task->iovcnt == 1) { 945 return spdk_nvme_ns_cmd_write_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair[qp_num], 946 task->iovs[0].iov_base, task->md_iov.iov_base, 947 lba, 948 entry->io_size_blocks, io_complete, 949 task, entry->io_flags, 950 task->dif_ctx.apptag_mask, task->dif_ctx.app_tag); 951 } else { 952 return spdk_nvme_ns_cmd_writev_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair[qp_num], 953 lba, entry->io_size_blocks, 954 io_complete, task, entry->io_flags, 955 nvme_perf_reset_sgl, nvme_perf_next_sge, 956 task->md_iov.iov_base, 957 task->dif_ctx.apptag_mask, task->dif_ctx.app_tag); 958 } 959 } 960 } 961 962 static void 963 perf_disconnect_cb(struct spdk_nvme_qpair *qpair, void *ctx) 964 { 965 struct ns_worker_ctx *ns_ctx = ctx; 966 967 ns_ctx->is_draining = true; 968 ns_ctx->status = 1; 969 } 970 971 static int64_t 972 nvme_check_io(struct ns_worker_ctx *ns_ctx) 973 { 974 int64_t rc; 975 976 rc = spdk_nvme_poll_group_process_completions(ns_ctx->u.nvme.group, g_max_completions, 977 perf_disconnect_cb); 978 if (rc < 0) { 979 fprintf(stderr, "NVMe io qpair process completion error\n"); 980 ns_ctx->status = 1; 981 return -1; 982 } 983 return rc; 984 } 985 986 static void 987 nvme_verify_io(struct perf_task *task, struct ns_entry *entry) 988 { 989 struct spdk_dif_error err_blk = {}; 990 int rc; 991 992 if (!task->is_read || (entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT)) { 993 return; 994 } 995 996 if (entry->md_interleave) { 997 rc = spdk_dif_verify(task->iovs, task->iovcnt, entry->io_size_blocks, &task->dif_ctx, 998 &err_blk); 999 if (rc != 0) { 1000 fprintf(stderr, "DIF error detected. type=%d, offset=%" PRIu32 "\n", 1001 err_blk.err_type, err_blk.err_offset); 1002 } 1003 } else { 1004 rc = spdk_dix_verify(task->iovs, task->iovcnt, &task->md_iov, entry->io_size_blocks, 1005 &task->dif_ctx, &err_blk); 1006 if (rc != 0) { 1007 fprintf(stderr, "DIX error detected. 
type=%d, offset=%" PRIu32 "\n", 1008 err_blk.err_type, err_blk.err_offset); 1009 } 1010 } 1011 } 1012 1013 /* 1014 * TODO: If a controller has multiple namespaces, they could all use the same queue. 1015 * For now, give each namespace/thread combination its own queue. 1016 */ 1017 static int 1018 nvme_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 1019 { 1020 const struct spdk_nvme_ctrlr_opts *ctrlr_opts; 1021 struct spdk_nvme_io_qpair_opts opts; 1022 struct ns_entry *entry = ns_ctx->entry; 1023 struct spdk_nvme_poll_group *group; 1024 struct spdk_nvme_qpair *qpair; 1025 uint64_t poll_timeout_tsc; 1026 int i, rc; 1027 1028 ns_ctx->u.nvme.num_active_qpairs = g_nr_io_queues_per_ns; 1029 ns_ctx->u.nvme.num_all_qpairs = g_nr_io_queues_per_ns + g_nr_unused_io_queues; 1030 ns_ctx->u.nvme.qpair = calloc(ns_ctx->u.nvme.num_all_qpairs, sizeof(struct spdk_nvme_qpair *)); 1031 if (!ns_ctx->u.nvme.qpair) { 1032 return -1; 1033 } 1034 1035 spdk_nvme_ctrlr_get_default_io_qpair_opts(entry->u.nvme.ctrlr, &opts, sizeof(opts)); 1036 if (opts.io_queue_requests < entry->num_io_requests) { 1037 opts.io_queue_requests = entry->num_io_requests; 1038 } 1039 opts.delay_cmd_submit = true; 1040 opts.create_only = true; 1041 1042 ctrlr_opts = spdk_nvme_ctrlr_get_opts(entry->u.nvme.ctrlr); 1043 opts.async_mode = !(spdk_nvme_ctrlr_get_transport_id(entry->u.nvme.ctrlr)->trtype == 1044 SPDK_NVME_TRANSPORT_PCIE 1045 && ns_ctx->u.nvme.num_all_qpairs > ctrlr_opts->admin_queue_size); 1046 1047 ns_ctx->u.nvme.group = spdk_nvme_poll_group_create(ns_ctx, NULL); 1048 if (ns_ctx->u.nvme.group == NULL) { 1049 goto poll_group_failed; 1050 } 1051 1052 group = ns_ctx->u.nvme.group; 1053 for (i = 0; i < ns_ctx->u.nvme.num_all_qpairs; i++) { 1054 ns_ctx->u.nvme.qpair[i] = spdk_nvme_ctrlr_alloc_io_qpair(entry->u.nvme.ctrlr, &opts, 1055 sizeof(opts)); 1056 qpair = ns_ctx->u.nvme.qpair[i]; 1057 if (!qpair) { 1058 printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n"); 1059 goto qpair_failed; 1060 } 1061 1062 if (spdk_nvme_poll_group_add(group, qpair)) { 1063 printf("ERROR: unable to add I/O qpair to poll group.\n"); 1064 spdk_nvme_ctrlr_free_io_qpair(qpair); 1065 goto qpair_failed; 1066 } 1067 1068 if (spdk_nvme_ctrlr_connect_io_qpair(entry->u.nvme.ctrlr, qpair)) { 1069 printf("ERROR: unable to connect I/O qpair.\n"); 1070 spdk_nvme_ctrlr_free_io_qpair(qpair); 1071 goto qpair_failed; 1072 } 1073 } 1074 1075 /* Busy poll here until all qpairs are connected - this ensures once we start 1076 * I/O we aren't still waiting for some qpairs to connect. Limit the poll to 1077 * 10 seconds though. 1078 */ 1079 poll_timeout_tsc = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 1080 rc = -EAGAIN; 1081 while (spdk_get_ticks() < poll_timeout_tsc && rc == -EAGAIN) { 1082 spdk_nvme_poll_group_process_completions(group, 0, perf_disconnect_cb); 1083 rc = spdk_nvme_poll_group_all_connected(group); 1084 if (rc == 0) { 1085 return 0; 1086 } 1087 } 1088 1089 /* If we reach here, it means we either timed out, or some connection failed. 
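 * Fall through to the error labels below so every qpair allocated so far is freed
 * and the poll group is destroyed.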
*/ 1090 assert(spdk_get_ticks() > poll_timeout_tsc || rc == -EIO); 1091 1092 qpair_failed: 1093 for (; i > 0; --i) { 1094 spdk_nvme_ctrlr_free_io_qpair(ns_ctx->u.nvme.qpair[i - 1]); 1095 } 1096 1097 spdk_nvme_poll_group_destroy(ns_ctx->u.nvme.group); 1098 poll_group_failed: 1099 free(ns_ctx->u.nvme.qpair); 1100 return -1; 1101 } 1102 1103 static void 1104 nvme_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 1105 { 1106 int i; 1107 1108 for (i = 0; i < ns_ctx->u.nvme.num_all_qpairs; i++) { 1109 spdk_nvme_ctrlr_free_io_qpair(ns_ctx->u.nvme.qpair[i]); 1110 } 1111 1112 spdk_nvme_poll_group_destroy(ns_ctx->u.nvme.group); 1113 free(ns_ctx->u.nvme.qpair); 1114 } 1115 1116 static void 1117 nvme_dump_rdma_statistics(struct spdk_nvme_transport_poll_group_stat *stat) 1118 { 1119 struct spdk_nvme_rdma_device_stat *device_stats; 1120 uint32_t i; 1121 1122 printf("RDMA transport:\n"); 1123 for (i = 0; i < stat->rdma.num_devices; i++) { 1124 device_stats = &stat->rdma.device_stats[i]; 1125 printf("\tdev name: %s\n", device_stats->name); 1126 printf("\tpolls: %"PRIu64"\n", device_stats->polls); 1127 printf("\tidle_polls: %"PRIu64"\n", device_stats->idle_polls); 1128 printf("\tcompletions: %"PRIu64"\n", device_stats->completions); 1129 printf("\tqueued_requests: %"PRIu64"\n", device_stats->queued_requests); 1130 printf("\ttotal_send_wrs: %"PRIu64"\n", device_stats->total_send_wrs); 1131 printf("\tsend_doorbell_updates: %"PRIu64"\n", device_stats->send_doorbell_updates); 1132 printf("\ttotal_recv_wrs: %"PRIu64"\n", device_stats->total_recv_wrs); 1133 printf("\trecv_doorbell_updates: %"PRIu64"\n", device_stats->recv_doorbell_updates); 1134 printf("\t---------------------------------\n"); 1135 } 1136 } 1137 1138 static void 1139 nvme_dump_pcie_statistics(struct spdk_nvme_transport_poll_group_stat *stat) 1140 { 1141 struct spdk_nvme_pcie_stat *pcie_stat; 1142 1143 pcie_stat = &stat->pcie; 1144 1145 printf("PCIE transport:\n"); 1146 printf("\tpolls: %"PRIu64"\n", pcie_stat->polls); 1147 printf("\tidle_polls: %"PRIu64"\n", pcie_stat->idle_polls); 1148 printf("\tcompletions: %"PRIu64"\n", pcie_stat->completions); 1149 printf("\tcq_mmio_doorbell_updates: %"PRIu64"\n", pcie_stat->cq_mmio_doorbell_updates); 1150 printf("\tcq_shadow_doorbell_updates: %"PRIu64"\n", pcie_stat->cq_shadow_doorbell_updates); 1151 printf("\tsubmitted_requests: %"PRIu64"\n", pcie_stat->submitted_requests); 1152 printf("\tsq_mmio_doorbell_updates: %"PRIu64"\n", pcie_stat->sq_mmio_doorbell_updates); 1153 printf("\tsq_shadow_doorbell_updates: %"PRIu64"\n", pcie_stat->sq_shadow_doorbell_updates); 1154 printf("\tqueued_requests: %"PRIu64"\n", pcie_stat->queued_requests); 1155 } 1156 1157 static void 1158 nvme_dump_tcp_statistics(struct spdk_nvme_transport_poll_group_stat *stat) 1159 { 1160 struct spdk_nvme_tcp_stat *tcp_stat; 1161 1162 tcp_stat = &stat->tcp; 1163 1164 printf("TCP transport:\n"); 1165 printf("\tpolls: %"PRIu64"\n", tcp_stat->polls); 1166 printf("\tidle_polls: %"PRIu64"\n", tcp_stat->idle_polls); 1167 printf("\tsock_completions: %"PRIu64"\n", tcp_stat->socket_completions); 1168 printf("\tnvme_completions: %"PRIu64"\n", tcp_stat->nvme_completions); 1169 printf("\tsubmitted_requests: %"PRIu64"\n", tcp_stat->submitted_requests); 1170 printf("\tqueued_requests: %"PRIu64"\n", tcp_stat->queued_requests); 1171 } 1172 1173 static void 1174 nvme_dump_transport_stats(uint32_t lcore, struct ns_worker_ctx *ns_ctx) 1175 { 1176 struct spdk_nvme_poll_group *group; 1177 struct spdk_nvme_poll_group_stat *stat = NULL; 1178 uint32_t i; 1179 int 
rc; 1180 1181 group = ns_ctx->u.nvme.group; 1182 if (group == NULL) { 1183 return; 1184 } 1185 1186 rc = spdk_nvme_poll_group_get_stats(group, &stat); 1187 if (rc) { 1188 fprintf(stderr, "Can't get transport stats, error %d\n", rc); 1189 return; 1190 } 1191 1192 printf("\n====================\n"); 1193 printf("lcore %u, ns %s statistics:\n", lcore, ns_ctx->entry->name); 1194 1195 for (i = 0; i < stat->num_transports; i++) { 1196 switch (stat->transport_stat[i]->trtype) { 1197 case SPDK_NVME_TRANSPORT_RDMA: 1198 nvme_dump_rdma_statistics(stat->transport_stat[i]); 1199 break; 1200 case SPDK_NVME_TRANSPORT_PCIE: 1201 nvme_dump_pcie_statistics(stat->transport_stat[i]); 1202 break; 1203 case SPDK_NVME_TRANSPORT_TCP: 1204 nvme_dump_tcp_statistics(stat->transport_stat[i]); 1205 break; 1206 default: 1207 fprintf(stderr, "Unknown transport statistics %d %s\n", stat->transport_stat[i]->trtype, 1208 spdk_nvme_transport_id_trtype_str(stat->transport_stat[i]->trtype)); 1209 } 1210 } 1211 1212 spdk_nvme_poll_group_free_stats(group, stat); 1213 } 1214 1215 static const struct ns_fn_table nvme_fn_table = { 1216 .setup_payload = nvme_setup_payload, 1217 .submit_io = nvme_submit_io, 1218 .check_io = nvme_check_io, 1219 .verify_io = nvme_verify_io, 1220 .init_ns_worker_ctx = nvme_init_ns_worker_ctx, 1221 .cleanup_ns_worker_ctx = nvme_cleanup_ns_worker_ctx, 1222 .dump_transport_stats = nvme_dump_transport_stats 1223 }; 1224 1225 static int 1226 build_nvme_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr) 1227 { 1228 const struct spdk_nvme_transport_id *trid; 1229 int res = 0; 1230 1231 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 1232 1233 switch (trid->trtype) { 1234 case SPDK_NVME_TRANSPORT_PCIE: 1235 res = snprintf(name, length, "PCIE (%s)", trid->traddr); 1236 break; 1237 case SPDK_NVME_TRANSPORT_RDMA: 1238 res = snprintf(name, length, "RDMA (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); 1239 break; 1240 case SPDK_NVME_TRANSPORT_TCP: 1241 res = snprintf(name, length, "TCP (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); 1242 break; 1243 case SPDK_NVME_TRANSPORT_VFIOUSER: 1244 res = snprintf(name, length, "VFIOUSER (%s)", trid->traddr); 1245 break; 1246 case SPDK_NVME_TRANSPORT_CUSTOM: 1247 res = snprintf(name, length, "CUSTOM (%s)", trid->traddr); 1248 break; 1249 1250 default: 1251 fprintf(stderr, "Unknown transport type %d\n", trid->trtype); 1252 break; 1253 } 1254 return res; 1255 } 1256 1257 static void 1258 build_nvme_ns_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) 1259 { 1260 int res = 0; 1261 1262 res = build_nvme_name(name, length, ctrlr); 1263 if (res > 0) { 1264 snprintf(name + res, length - res, " NSID %u", nsid); 1265 } 1266 1267 } 1268 1269 static void 1270 register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns) 1271 { 1272 struct ns_entry *entry; 1273 const struct spdk_nvme_ctrlr_data *cdata; 1274 uint32_t max_xfer_size, entries, sector_size; 1275 uint64_t ns_size; 1276 struct spdk_nvme_io_qpair_opts opts; 1277 1278 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 1279 1280 if (!spdk_nvme_ns_is_active(ns)) { 1281 printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n", 1282 cdata->mn, cdata->sn, 1283 spdk_nvme_ns_get_id(ns)); 1284 g_warn = true; 1285 return; 1286 } 1287 1288 ns_size = spdk_nvme_ns_get_size(ns); 1289 sector_size = spdk_nvme_ns_get_sector_size(ns); 1290 1291 if (ns_size < g_io_size_bytes || sector_size > g_io_size_bytes) { 1292 printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid " 1293 "ns size 
%" PRIu64 " / block size %u for I/O size %u\n", 1294 cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns), 1295 ns_size, spdk_nvme_ns_get_sector_size(ns), g_io_size_bytes); 1296 g_warn = true; 1297 return; 1298 } 1299 1300 max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 1301 spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); 1302 /* NVMe driver may add additional entries based on 1303 * stripe size and maximum transfer size, we assume 1304 * 1 more entry be used for stripe. 1305 */ 1306 entries = (g_io_size_bytes - 1) / max_xfer_size + 2; 1307 if ((g_queue_depth * entries) > opts.io_queue_size) { 1308 printf("Controller IO queue size %u, less than required.\n", 1309 opts.io_queue_size); 1310 printf("Consider using lower queue depth or smaller IO size, because " 1311 "IO requests may be queued at the NVMe driver.\n"); 1312 } 1313 /* For requests which have children requests, parent request itself 1314 * will also occupy 1 entry. 1315 */ 1316 entries += 1; 1317 1318 entry = calloc(1, sizeof(struct ns_entry)); 1319 if (entry == NULL) { 1320 perror("ns_entry malloc"); 1321 exit(1); 1322 } 1323 1324 entry->type = ENTRY_TYPE_NVME_NS; 1325 entry->fn_table = &nvme_fn_table; 1326 entry->u.nvme.ctrlr = ctrlr; 1327 entry->u.nvme.ns = ns; 1328 entry->num_io_requests = entries * spdk_divide_round_up(g_queue_depth, g_nr_io_queues_per_ns); 1329 1330 entry->size_in_ios = ns_size / g_io_size_bytes; 1331 entry->io_size_blocks = g_io_size_bytes / sector_size; 1332 1333 if (g_is_random) { 1334 entry->seed = rand(); 1335 if (g_zipf_theta > 0) { 1336 entry->zipf = spdk_zipf_create(entry->size_in_ios, g_zipf_theta, 0); 1337 } 1338 } 1339 1340 entry->block_size = spdk_nvme_ns_get_extended_sector_size(ns); 1341 entry->md_size = spdk_nvme_ns_get_md_size(ns); 1342 entry->md_interleave = spdk_nvme_ns_supports_extended_lba(ns); 1343 entry->pi_loc = spdk_nvme_ns_get_data(ns)->dps.md_start; 1344 entry->pi_type = spdk_nvme_ns_get_pi_type(ns); 1345 1346 if (spdk_nvme_ns_get_flags(ns) & SPDK_NVME_NS_DPS_PI_SUPPORTED) { 1347 entry->io_flags = g_metacfg_pract_flag | g_metacfg_prchk_flags; 1348 } 1349 1350 /* If metadata size = 8 bytes, PI is stripped (read) or inserted (write), 1351 * and so reduce metadata size from block size. (If metadata size > 8 bytes, 1352 * PI is passed (read) or replaced (write). So block size is not necessary 1353 * to change.) 1354 */ 1355 if ((entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT) && (entry->md_size == 8)) { 1356 entry->block_size = spdk_nvme_ns_get_sector_size(ns); 1357 } 1358 1359 if (g_io_size_bytes % entry->block_size != 0) { 1360 printf("WARNING: IO size %u (-o) is not a multiple of nsid %u sector size %u." 
1361 " Removing this ns from test\n", g_io_size_bytes, spdk_nvme_ns_get_id(ns), entry->block_size); 1362 g_warn = true; 1363 spdk_zipf_free(&entry->zipf); 1364 free(entry); 1365 return; 1366 } 1367 1368 if (g_max_io_md_size < entry->md_size) { 1369 g_max_io_md_size = entry->md_size; 1370 } 1371 1372 if (g_max_io_size_blocks < entry->io_size_blocks) { 1373 g_max_io_size_blocks = entry->io_size_blocks; 1374 } 1375 1376 build_nvme_ns_name(entry->name, sizeof(entry->name), ctrlr, spdk_nvme_ns_get_id(ns)); 1377 1378 g_num_namespaces++; 1379 TAILQ_INSERT_TAIL(&g_namespaces, entry, link); 1380 } 1381 1382 static void 1383 unregister_namespaces(void) 1384 { 1385 struct ns_entry *entry, *tmp; 1386 1387 TAILQ_FOREACH_SAFE(entry, &g_namespaces, link, tmp) { 1388 TAILQ_REMOVE(&g_namespaces, entry, link); 1389 spdk_zipf_free(&entry->zipf); 1390 if (g_use_uring) { 1391 #ifdef SPDK_CONFIG_URING 1392 close(entry->u.uring.fd); 1393 #endif 1394 } else { 1395 #if HAVE_LIBAIO 1396 close(entry->u.aio.fd); 1397 #endif 1398 } 1399 free(entry); 1400 } 1401 } 1402 1403 static void 1404 enable_latency_tracking_complete(void *cb_arg, const struct spdk_nvme_cpl *cpl) 1405 { 1406 if (spdk_nvme_cpl_is_error(cpl)) { 1407 printf("enable_latency_tracking_complete failed\n"); 1408 } 1409 g_outstanding_commands--; 1410 } 1411 1412 static void 1413 set_latency_tracking_feature(struct spdk_nvme_ctrlr *ctrlr, bool enable) 1414 { 1415 int res; 1416 union spdk_nvme_intel_feat_latency_tracking latency_tracking; 1417 1418 if (enable) { 1419 latency_tracking.bits.enable = 0x01; 1420 } else { 1421 latency_tracking.bits.enable = 0x00; 1422 } 1423 1424 res = spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING, 1425 latency_tracking.raw, 0, NULL, 0, enable_latency_tracking_complete, NULL); 1426 if (res) { 1427 printf("fail to allocate nvme request.\n"); 1428 return; 1429 } 1430 g_outstanding_commands++; 1431 1432 while (g_outstanding_commands) { 1433 spdk_nvme_ctrlr_process_admin_completions(ctrlr); 1434 } 1435 } 1436 1437 static void 1438 register_ctrlr(struct spdk_nvme_ctrlr *ctrlr, struct trid_entry *trid_entry) 1439 { 1440 struct spdk_nvme_ns *ns; 1441 struct ctrlr_entry *entry = malloc(sizeof(struct ctrlr_entry)); 1442 uint32_t nsid; 1443 1444 if (entry == NULL) { 1445 perror("ctrlr_entry malloc"); 1446 exit(1); 1447 } 1448 1449 entry->latency_page = spdk_dma_zmalloc(sizeof(struct spdk_nvme_intel_rw_latency_page), 1450 4096, NULL); 1451 if (entry->latency_page == NULL) { 1452 printf("Allocation error (latency page)\n"); 1453 exit(1); 1454 } 1455 1456 build_nvme_name(entry->name, sizeof(entry->name), ctrlr); 1457 1458 entry->ctrlr = ctrlr; 1459 entry->trtype = trid_entry->trid.trtype; 1460 TAILQ_INSERT_TAIL(&g_controllers, entry, link); 1461 1462 if (g_latency_ssd_tracking_enable && 1463 spdk_nvme_ctrlr_is_feature_supported(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) { 1464 set_latency_tracking_feature(ctrlr, true); 1465 } 1466 1467 if (trid_entry->nsid == 0) { 1468 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 1469 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 1470 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); 1471 if (ns == NULL) { 1472 continue; 1473 } 1474 register_ns(ctrlr, ns); 1475 } 1476 } else { 1477 ns = spdk_nvme_ctrlr_get_ns(ctrlr, trid_entry->nsid); 1478 if (!ns) { 1479 perror("Namespace does not exist."); 1480 exit(1); 1481 } 1482 1483 register_ns(ctrlr, ns); 1484 } 1485 } 1486 1487 static inline void 1488 submit_single_io(struct perf_task *task) 1489 { 1490 
uint64_t rand_value, offset_in_ios; 1491 int rc; 1492 struct ns_worker_ctx *ns_ctx = task->ns_ctx; 1493 struct ns_entry *entry = ns_ctx->entry; 1494 1495 assert(!ns_ctx->is_draining); 1496 1497 if (entry->zipf) { 1498 offset_in_ios = spdk_zipf_generate(entry->zipf); 1499 } else if (g_is_random) { 1500 /* rand_r() returns int, so we need to use two calls to ensure 1501 * we get a large enough value to cover a very large block 1502 * device. 1503 */ 1504 rand_value = (uint64_t)rand_r(&entry->seed) * 1505 ((uint64_t)RAND_MAX + 1) + 1506 rand_r(&entry->seed); 1507 offset_in_ios = rand_value % entry->size_in_ios; 1508 } else { 1509 offset_in_ios = ns_ctx->offset_in_ios++; 1510 if (ns_ctx->offset_in_ios == entry->size_in_ios) { 1511 ns_ctx->offset_in_ios = 0; 1512 } 1513 } 1514 1515 task->submit_tsc = spdk_get_ticks(); 1516 1517 if ((g_rw_percentage == 100) || 1518 (g_rw_percentage != 0 && ((rand_r(&entry->seed) % 100) < g_rw_percentage))) { 1519 task->is_read = true; 1520 } else { 1521 task->is_read = false; 1522 } 1523 1524 rc = entry->fn_table->submit_io(task, ns_ctx, entry, offset_in_ios); 1525 1526 if (spdk_unlikely(rc != 0)) { 1527 if (g_continue_on_error) { 1528 /* We can't just resubmit here or we can get in a loop that 1529 * stack overflows. */ 1530 TAILQ_INSERT_TAIL(&ns_ctx->queued_tasks, task, link); 1531 } else { 1532 RATELIMIT_LOG("starting I/O failed: %d\n", rc); 1533 spdk_dma_free(task->iovs[0].iov_base); 1534 free(task->iovs); 1535 spdk_dma_free(task->md_iov.iov_base); 1536 task->ns_ctx->status = 1; 1537 free(task); 1538 } 1539 } else { 1540 ns_ctx->current_queue_depth++; 1541 ns_ctx->stats.io_submitted++; 1542 } 1543 1544 if (spdk_unlikely(g_number_ios && ns_ctx->stats.io_submitted >= g_number_ios)) { 1545 ns_ctx->is_draining = true; 1546 } 1547 } 1548 1549 static inline void 1550 task_complete(struct perf_task *task) 1551 { 1552 struct ns_worker_ctx *ns_ctx; 1553 uint64_t tsc_diff; 1554 struct ns_entry *entry; 1555 1556 ns_ctx = task->ns_ctx; 1557 entry = ns_ctx->entry; 1558 ns_ctx->current_queue_depth--; 1559 ns_ctx->stats.io_completed++; 1560 tsc_diff = spdk_get_ticks() - task->submit_tsc; 1561 ns_ctx->stats.total_tsc += tsc_diff; 1562 if (spdk_unlikely(ns_ctx->stats.min_tsc > tsc_diff)) { 1563 ns_ctx->stats.min_tsc = tsc_diff; 1564 } 1565 if (spdk_unlikely(ns_ctx->stats.max_tsc < tsc_diff)) { 1566 ns_ctx->stats.max_tsc = tsc_diff; 1567 } 1568 if (spdk_unlikely(g_latency_sw_tracking_level > 0)) { 1569 spdk_histogram_data_tally(ns_ctx->histogram, tsc_diff); 1570 } 1571 1572 if (spdk_unlikely(entry->md_size > 0)) { 1573 /* add application level verification for end-to-end data protection */ 1574 entry->fn_table->verify_io(task, entry); 1575 } 1576 1577 /* 1578 * is_draining indicates when time has expired or io_submitted exceeded 1579 * g_number_ios for the test run and we are just waiting for the previously 1580 * submitted I/O to complete. In this case, do not submit a new I/O to 1581 * replace the one just completed. 
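 * Instead, the task's data and metadata buffers are released and the task is freed.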
1582 */ 1583 if (spdk_unlikely(ns_ctx->is_draining)) { 1584 spdk_dma_free(task->iovs[0].iov_base); 1585 free(task->iovs); 1586 spdk_dma_free(task->md_iov.iov_base); 1587 free(task); 1588 } else { 1589 submit_single_io(task); 1590 } 1591 } 1592 1593 static void 1594 io_complete(void *ctx, const struct spdk_nvme_cpl *cpl) 1595 { 1596 struct perf_task *task = ctx; 1597 1598 if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) { 1599 if (task->is_read) { 1600 RATELIMIT_LOG("Read completed with error (sct=%d, sc=%d)\n", 1601 cpl->status.sct, cpl->status.sc); 1602 } else { 1603 RATELIMIT_LOG("Write completed with error (sct=%d, sc=%d)\n", 1604 cpl->status.sct, cpl->status.sc); 1605 } 1606 if (!g_continue_on_error) { 1607 if (cpl->status.sct == SPDK_NVME_SCT_GENERIC && 1608 cpl->status.sc == SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT) { 1609 /* The namespace was hotplugged. Stop trying to send I/O to it. */ 1610 task->ns_ctx->is_draining = true; 1611 } 1612 1613 task->ns_ctx->status = 1; 1614 } 1615 } 1616 1617 task_complete(task); 1618 } 1619 1620 static struct perf_task * 1621 allocate_task(struct ns_worker_ctx *ns_ctx, int queue_depth) 1622 { 1623 struct perf_task *task; 1624 1625 task = calloc(1, sizeof(*task)); 1626 if (task == NULL) { 1627 fprintf(stderr, "Out of memory allocating tasks\n"); 1628 exit(1); 1629 } 1630 1631 ns_ctx->entry->fn_table->setup_payload(task, queue_depth % 8 + 1); 1632 1633 task->ns_ctx = ns_ctx; 1634 1635 return task; 1636 } 1637 1638 static void 1639 submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth) 1640 { 1641 struct perf_task *task; 1642 1643 while (queue_depth-- > 0) { 1644 task = allocate_task(ns_ctx, queue_depth); 1645 submit_single_io(task); 1646 } 1647 } 1648 1649 static int 1650 init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 1651 { 1652 TAILQ_INIT(&ns_ctx->queued_tasks); 1653 return ns_ctx->entry->fn_table->init_ns_worker_ctx(ns_ctx); 1654 } 1655 1656 static void 1657 cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) 1658 { 1659 struct perf_task *task, *ttask; 1660 1661 TAILQ_FOREACH_SAFE(task, &ns_ctx->queued_tasks, link, ttask) { 1662 TAILQ_REMOVE(&ns_ctx->queued_tasks, task, link); 1663 task_complete(task); 1664 } 1665 ns_ctx->entry->fn_table->cleanup_ns_worker_ctx(ns_ctx); 1666 } 1667 1668 static void 1669 print_periodic_performance(bool warmup) 1670 { 1671 uint64_t io_this_second; 1672 double mb_this_second; 1673 struct worker_thread *worker; 1674 struct ns_worker_ctx *ns_ctx; 1675 uint64_t busy_tsc; 1676 uint64_t idle_tsc; 1677 uint64_t core_busy_tsc = 0; 1678 uint64_t core_idle_tsc = 0; 1679 double core_busy_perc = 0; 1680 1681 if (!isatty(STDOUT_FILENO)) { 1682 /* Don't print periodic stats if output is not going 1683 * to a terminal. 
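 * (The stats line is redrawn in place with a trailing carriage return, which
 * only makes sense on an interactive terminal.)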
1684 */ 1685 return; 1686 } 1687 io_this_second = 0; 1688 TAILQ_FOREACH(worker, &g_workers, link) { 1689 busy_tsc = 0; 1690 idle_tsc = 0; 1691 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1692 io_this_second += ns_ctx->stats.io_completed - ns_ctx->stats.last_io_completed; 1693 ns_ctx->stats.last_io_completed = ns_ctx->stats.io_completed; 1694 1695 if (g_monitor_perf_cores) { 1696 busy_tsc += ns_ctx->stats.busy_tsc - ns_ctx->stats.last_busy_tsc; 1697 idle_tsc += ns_ctx->stats.idle_tsc - ns_ctx->stats.last_idle_tsc; 1698 ns_ctx->stats.last_busy_tsc = ns_ctx->stats.busy_tsc; 1699 ns_ctx->stats.last_idle_tsc = ns_ctx->stats.idle_tsc; 1700 } 1701 } 1702 if (g_monitor_perf_cores) { 1703 core_busy_tsc += busy_tsc; 1704 core_idle_tsc += idle_tsc; 1705 } 1706 } 1707 mb_this_second = (double)io_this_second * g_io_size_bytes / (1024 * 1024); 1708 1709 printf("%s%9ju IOPS, %8.2f MiB/s", warmup ? "[warmup] " : "", io_this_second, mb_this_second); 1710 if (g_monitor_perf_cores) { 1711 core_busy_perc = (double)core_busy_tsc / (core_idle_tsc + core_busy_tsc) * 100; 1712 printf("%3d Core(s): %6.2f%% Busy", g_num_workers, core_busy_perc); 1713 } 1714 printf("\r"); 1715 fflush(stdout); 1716 } 1717 1718 static void 1719 perf_dump_transport_statistics(struct worker_thread *worker) 1720 { 1721 struct ns_worker_ctx *ns_ctx; 1722 1723 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1724 if (ns_ctx->entry->fn_table->dump_transport_stats) { 1725 ns_ctx->entry->fn_table->dump_transport_stats(worker->lcore, ns_ctx); 1726 } 1727 } 1728 } 1729 1730 static int 1731 work_fn(void *arg) 1732 { 1733 uint64_t tsc_start, tsc_end, tsc_current, tsc_next_print; 1734 struct worker_thread *worker = (struct worker_thread *) arg; 1735 struct ns_worker_ctx *ns_ctx = NULL; 1736 uint32_t unfinished_ns_ctx; 1737 bool warmup = false; 1738 int rc; 1739 int64_t check_rc; 1740 uint64_t check_now; 1741 TAILQ_HEAD(, perf_task) swap; 1742 struct perf_task *task; 1743 1744 /* Allocate queue pairs for each namespace. */ 1745 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1746 if (init_ns_worker_ctx(ns_ctx) != 0) { 1747 printf("ERROR: init_ns_worker_ctx() failed\n"); 1748 /* Wait on barrier to avoid blocking of successful workers */ 1749 pthread_barrier_wait(&g_worker_sync_barrier); 1750 ns_ctx->status = 1; 1751 return 1; 1752 } 1753 } 1754 1755 rc = pthread_barrier_wait(&g_worker_sync_barrier); 1756 if (rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD) { 1757 printf("ERROR: failed to wait on thread sync barrier\n"); 1758 ns_ctx->status = 1; 1759 return 1; 1760 } 1761 1762 tsc_start = spdk_get_ticks(); 1763 tsc_current = tsc_start; 1764 tsc_next_print = tsc_current + g_tsc_rate; 1765 1766 if (g_warmup_time_in_sec) { 1767 warmup = true; 1768 tsc_end = tsc_current + g_warmup_time_in_sec * g_tsc_rate; 1769 } else { 1770 tsc_end = tsc_current + g_time_in_sec * g_tsc_rate; 1771 } 1772 1773 /* Submit initial I/O for each namespace. */ 1774 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1775 submit_io(ns_ctx, g_queue_depth); 1776 } 1777 1778 while (spdk_likely(!g_exit)) { 1779 bool all_draining = true; 1780 1781 /* 1782 * Check for completed I/O for each controller. A new 1783 * I/O will be submitted in the io_complete callback 1784 * to replace each I/O that is completed. 
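 * (For kernel AIO/uring targets the replacement I/O is submitted from
 * task_complete(), which the corresponding check_io() handler calls.)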
1785 */ 1786 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1787 if (g_continue_on_error && !ns_ctx->is_draining) { 1788 /* Submit any I/O that is queued up */ 1789 TAILQ_INIT(&swap); 1790 TAILQ_SWAP(&swap, &ns_ctx->queued_tasks, perf_task, link); 1791 while (!TAILQ_EMPTY(&swap)) { 1792 task = TAILQ_FIRST(&swap); 1793 TAILQ_REMOVE(&swap, task, link); 1794 if (ns_ctx->is_draining) { 1795 TAILQ_INSERT_TAIL(&ns_ctx->queued_tasks, 1796 task, link); 1797 continue; 1798 } 1799 submit_single_io(task); 1800 } 1801 } 1802 1803 check_now = spdk_get_ticks(); 1804 check_rc = ns_ctx->entry->fn_table->check_io(ns_ctx); 1805 1806 if (check_rc > 0) { 1807 ns_ctx->stats.busy_tsc += check_now - ns_ctx->stats.last_tsc; 1808 } else { 1809 ns_ctx->stats.idle_tsc += check_now - ns_ctx->stats.last_tsc; 1810 } 1811 ns_ctx->stats.last_tsc = check_now; 1812 1813 if (!ns_ctx->is_draining) { 1814 all_draining = false; 1815 } 1816 } 1817 1818 if (spdk_unlikely(all_draining)) { 1819 break; 1820 } 1821 1822 tsc_current = spdk_get_ticks(); 1823 1824 if (worker->lcore == g_main_core && tsc_current > tsc_next_print) { 1825 tsc_next_print += g_tsc_rate; 1826 print_periodic_performance(warmup); 1827 } 1828 1829 if (tsc_current > tsc_end) { 1830 if (warmup) { 1831 /* Update test start and end time, clear statistics */ 1832 tsc_start = spdk_get_ticks(); 1833 tsc_end = tsc_start + g_time_in_sec * g_tsc_rate; 1834 1835 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1836 memset(&ns_ctx->stats, 0, sizeof(ns_ctx->stats)); 1837 ns_ctx->stats.min_tsc = UINT64_MAX; 1838 spdk_histogram_data_reset(ns_ctx->histogram); 1839 } 1840 1841 if (worker->lcore == g_main_core && isatty(STDOUT_FILENO)) { 1842 /* warmup stage prints a longer string to stdout, need to erase it */ 1843 printf("%c[2K", 27); 1844 } 1845 1846 warmup = false; 1847 } else { 1848 break; 1849 } 1850 } 1851 } 1852 1853 /* Capture the actual elapsed time when we break out of the main loop. This will account 1854 * for cases where we exit prematurely due to a signal. We only need to capture it on 1855 * one core, so use the main core. 
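 * (g_elapsed_time_in_usec later feeds the IOPS and bandwidth calculations in
 * print_performance().)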
1856 */ 1857 if (worker->lcore == g_main_core) { 1858 g_elapsed_time_in_usec = (tsc_current - tsc_start) * SPDK_SEC_TO_USEC / g_tsc_rate; 1859 } 1860 1861 /* drain the io of each ns_ctx in round robin to make the fairness */ 1862 do { 1863 unfinished_ns_ctx = 0; 1864 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1865 /* first time will enter into this if case */ 1866 if (!ns_ctx->is_draining) { 1867 ns_ctx->is_draining = true; 1868 } 1869 1870 if (ns_ctx->current_queue_depth > 0) { 1871 ns_ctx->entry->fn_table->check_io(ns_ctx); 1872 if (ns_ctx->current_queue_depth > 0) { 1873 unfinished_ns_ctx++; 1874 } 1875 } 1876 } 1877 } while (unfinished_ns_ctx > 0); 1878 1879 if (g_dump_transport_stats) { 1880 pthread_mutex_lock(&g_stats_mutex); 1881 perf_dump_transport_statistics(worker); 1882 pthread_mutex_unlock(&g_stats_mutex); 1883 } 1884 1885 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 1886 cleanup_ns_worker_ctx(ns_ctx); 1887 } 1888 1889 return 0; 1890 } 1891 1892 static void 1893 usage(char *program_name) 1894 { 1895 printf("%s options", program_name); 1896 #if defined(SPDK_CONFIG_URING) || defined(HAVE_LIBAIO) 1897 printf(" [Kernel device(s)]..."); 1898 #endif 1899 printf("\n\n"); 1900 printf("==== BASIC OPTIONS ====\n\n"); 1901 printf("\t-q, --io-depth <val> io depth\n"); 1902 printf("\t-o, --io-size <val> io size in bytes\n"); 1903 printf("\t-w, --io-pattern <pattern> io pattern type, must be one of\n"); 1904 printf("\t\t(read, write, randread, randwrite, rw, randrw)\n"); 1905 printf("\t-M, --rwmixread <0-100> rwmixread (100 for reads, 0 for writes)\n"); 1906 printf("\t-t, --time <sec> time in seconds\n"); 1907 printf("\t-a, --warmup-time <sec> warmup time in seconds\n"); 1908 printf("\t-c, --core-mask <mask> core mask for I/O submission/completion.\n"); 1909 printf("\t\t(default: 1)\n"); 1910 printf("\t-r, --transport <fmt> Transport ID for local PCIe NVMe or NVMeoF\n"); 1911 printf("\t\t Format: 'key:value [key:value] ...'\n"); 1912 printf("\t\t Keys:\n"); 1913 printf("\t\t trtype Transport type (e.g. PCIe, RDMA)\n"); 1914 printf("\t\t adrfam Address family (e.g. IPv4, IPv6)\n"); 1915 printf("\t\t traddr Transport address (e.g. 0000:04:00.0 for PCIe or 192.168.100.8 for RDMA)\n"); 1916 printf("\t\t trsvcid Transport service identifier (e.g. 4420)\n"); 1917 printf("\t\t subnqn Subsystem NQN (default: %s)\n", SPDK_NVMF_DISCOVERY_NQN); 1918 printf("\t\t ns NVMe namespace ID (all active namespaces are used by default)\n"); 1919 printf("\t\t hostnqn Host NQN\n"); 1920 printf("\t\t Example: -r 'trtype:PCIe traddr:0000:04:00.0' for PCIe or\n"); 1921 printf("\t\t -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420' for NVMeoF\n"); 1922 printf("\t\t Note: can be specified multiple times to test multiple disks/targets.\n"); 1923 printf("\n"); 1924 1925 printf("==== ADVANCED OPTIONS ====\n\n"); 1926 printf("\t--use-every-core for each namespace, I/Os are submitted from all cores\n"); 1927 printf("\t--io-queue-size <val> size of NVMe IO queue. Default: maximum allowed by controller\n"); 1928 printf("\t-O, --io-unit-size io unit size in bytes (4-byte aligned) for SPDK driver. default: same as io size\n"); 1929 printf("\t-P, --num-qpairs <val> number of io queues per namespace. default: 1\n"); 1930 printf("\t-U, --num-unused-qpairs <val> number of unused io queues per controller. default: 0\n"); 1931 printf("\t-A, --buffer-alignment IO buffer alignment. 
Must be power of 2 and not less than cache line (%u)\n",
1932 SPDK_CACHE_LINE_SIZE);
1933 printf("\t-s, --hugemem-size <MB> DPDK huge memory size in MB.\n");
1934 printf("\t-g, --mem-single-seg use single file descriptor for DPDK memory segments\n");
1935 printf("\t-C, --max-completion-per-poll <val> max completions per poll\n");
1936 printf("\t\t(default: 0 - unlimited)\n");
1937 printf("\t-i, --shmem-grp-id <id> shared memory group ID\n");
1938 printf("\t-d, --number-ios <val> number of I/O to perform per thread on each namespace. Note: this is an additional exit criterion.\n");
1939 printf("\t\t(default: 0 - unlimited)\n");
1940 printf("\t-e, --metadata <fmt> metadata configuration\n");
1941 printf("\t\t Keys:\n");
1942 printf("\t\t PRACT Protection Information Action bit (PRACT=1 or PRACT=0)\n");
1943 printf("\t\t PRCHK Control of Protection Information Checking (PRCHK=GUARD|REFTAG|APPTAG)\n");
1944 printf("\t\t Example: -e 'PRACT=0,PRCHK=GUARD|REFTAG|APPTAG'\n");
1945 printf("\t\t -e 'PRACT=1,PRCHK=GUARD'\n");
1946 printf("\t-F, --zipf <theta> use zipf distribution for random I/O\n");
1947 #ifdef SPDK_CONFIG_URING
1948 printf("\t-R, --enable-uring enable using liburing to drive kernel devices (Default: libaio)\n");
1949 #endif
1950 printf("\t--iova-mode <mode> specify DPDK IOVA mode: va|pa\n");
1951 printf("\t--no-huge run SPDK without hugepages\n");
1952 printf("\n");
1953
1954 printf("==== PCIe OPTIONS ====\n\n");
1955 printf("\t-b, --allowed-pci-addr <addr> allowed local PCIe device address\n");
1956 printf("\t\t Example: -b 0000:d8:00.0 -b 0000:d9:00.0\n");
1957 printf("\t-V, --enable-vmd enable VMD enumeration\n");
1958 printf("\t-D, --disable-sq-cmb disable submission queue in controller memory buffer, default: enabled\n");
1959 printf("\n");
1960
1961 printf("==== TCP OPTIONS ====\n\n");
1962 printf("\t-S, --default-sock-impl <impl> set the default sock impl, e.g. \"posix\"\n");
1963 printf("\t--disable-ktls disable Kernel TLS. Only valid for ssl impl (this is the default for ssl impl)\n");
1964 printf("\t--enable-ktls enable Kernel TLS. Only valid for ssl impl\n");
1965 printf("\t--tls-version <val> TLS version to use. Only valid for ssl impl. Default: 0 (auto-negotiation)\n");
1966 printf("\t--psk-path <val> Path to PSK file (only applies when sock_impl == ssl)\n");
1967 printf("\t--psk-identity <val> Default PSK ID, e.g. psk.spdk.io (only applies when sock_impl == ssl)\n");
1968 printf("\t--zerocopy-threshold <val> data is sent with MSG_ZEROCOPY if size is greater than this val. Default: 0 to disable it\n");
1969 printf("\t--zerocopy-threshold-sock-impl <impl> specify the sock implementation to set zerocopy_threshold\n");
1970 printf("\t-z, --disable-zcopy <impl> disable zero copy send for the given sock implementation (default for posix impl)\n");
1971 printf("\t-Z, --enable-zcopy <impl> enable zero copy send for the given sock implementation\n");
1972 printf("\t-k, --keepalive <ms> keep alive timeout period in milliseconds\n");
1973 printf("\t-H, --enable-tcp-hdgst enable header digest for TCP transport, default: disabled\n");
1974 printf("\t-I, --enable-tcp-ddgst enable data digest for TCP transport, default: disabled\n");
1975 printf("\n");
1976
1977 printf("==== RDMA OPTIONS ====\n\n");
1978 printf("\t--transport-tos <val> specify the type of service for RDMA transport. Default: 0 (disabled)\n");
1979 printf("\t--rdma-srq-size <val> The size of a shared rdma receive queue.
Default: 0 (disabled)\n"); 1980 printf("\t-k, --keepalive <ms> keep alive timeout period in millisecond\n"); 1981 printf("\n"); 1982 1983 printf("==== LOGGING ====\n\n"); 1984 printf("\t-L, --enable-sw-latency-tracking enable latency tracking via sw, default: disabled\n"); 1985 printf("\t\t-L for latency summary, -LL for detailed histogram\n"); 1986 printf("\t-l, --enable-ssd-latency-tracking enable latency tracking via ssd (if supported), default: disabled\n"); 1987 printf("\t-N, --no-shst-notification no shutdown notification process for controllers, default: disabled\n"); 1988 printf("\t-Q, --continue-on-error <val> Do not stop on error. Log I/O errors every N times (default: 1)\n"); 1989 spdk_log_usage(stdout, "\t-T"); 1990 printf("\t-m, --cpu-usage display real-time overall cpu usage on used cores\n"); 1991 #ifdef DEBUG 1992 printf("\t-G, --enable-debug enable debug logging\n"); 1993 #else 1994 printf("\t-G, --enable-debug enable debug logging (flag disabled, must reconfigure with --enable-debug)\n"); 1995 #endif 1996 printf("\t--transport-stats dump transport statistics\n"); 1997 printf("\n\n"); 1998 } 1999 2000 static void 2001 check_cutoff(void *ctx, uint64_t start, uint64_t end, uint64_t count, 2002 uint64_t total, uint64_t so_far) 2003 { 2004 double so_far_pct; 2005 double **cutoff = ctx; 2006 2007 if (count == 0) { 2008 return; 2009 } 2010 2011 so_far_pct = (double)so_far / total; 2012 while (so_far_pct >= **cutoff && **cutoff > 0) { 2013 printf("%9.5f%% : %9.3fus\n", **cutoff * 100, (double)end * 1000 * 1000 / g_tsc_rate); 2014 (*cutoff)++; 2015 } 2016 } 2017 2018 static void 2019 print_bucket(void *ctx, uint64_t start, uint64_t end, uint64_t count, 2020 uint64_t total, uint64_t so_far) 2021 { 2022 double so_far_pct; 2023 2024 if (count == 0) { 2025 return; 2026 } 2027 2028 so_far_pct = (double)so_far * 100 / total; 2029 printf("%9.3f - %9.3f: %9.4f%% (%9ju)\n", 2030 (double)start * 1000 * 1000 / g_tsc_rate, 2031 (double)end * 1000 * 1000 / g_tsc_rate, 2032 so_far_pct, count); 2033 } 2034 2035 static void 2036 print_performance(void) 2037 { 2038 uint64_t total_io_completed, total_io_tsc; 2039 double io_per_second, mb_per_second, average_latency, min_latency, max_latency; 2040 double sum_ave_latency, min_latency_so_far, max_latency_so_far; 2041 double total_io_per_second, total_mb_per_second; 2042 int ns_count; 2043 struct worker_thread *worker; 2044 struct ns_worker_ctx *ns_ctx; 2045 uint32_t max_strlen; 2046 2047 total_io_per_second = 0; 2048 total_mb_per_second = 0; 2049 total_io_completed = 0; 2050 total_io_tsc = 0; 2051 min_latency_so_far = (double)UINT64_MAX; 2052 max_latency_so_far = 0; 2053 ns_count = 0; 2054 2055 max_strlen = 0; 2056 TAILQ_FOREACH(worker, &g_workers, link) { 2057 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 2058 max_strlen = spdk_max(strlen(ns_ctx->entry->name), max_strlen); 2059 } 2060 } 2061 2062 printf("========================================================\n"); 2063 printf("%*s\n", max_strlen + 60, "Latency(us)"); 2064 printf("%-*s: %10s %10s %10s %10s %10s\n", 2065 max_strlen + 13, "Device Information", "IOPS", "MiB/s", "Average", "min", "max"); 2066 2067 TAILQ_FOREACH(worker, &g_workers, link) { 2068 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 2069 if (ns_ctx->stats.io_completed != 0) { 2070 io_per_second = (double)ns_ctx->stats.io_completed * 1000 * 1000 / g_elapsed_time_in_usec; 2071 mb_per_second = io_per_second * g_io_size_bytes / (1024 * 1024); 2072 average_latency = ((double)ns_ctx->stats.total_tsc / ns_ctx->stats.io_completed) * 1000 
* 1000 / 2073 g_tsc_rate; 2074 min_latency = (double)ns_ctx->stats.min_tsc * 1000 * 1000 / g_tsc_rate; 2075 if (min_latency < min_latency_so_far) { 2076 min_latency_so_far = min_latency; 2077 } 2078 2079 max_latency = (double)ns_ctx->stats.max_tsc * 1000 * 1000 / g_tsc_rate; 2080 if (max_latency > max_latency_so_far) { 2081 max_latency_so_far = max_latency; 2082 } 2083 2084 printf("%-*.*s from core %2u: %10.2f %10.2f %10.2f %10.2f %10.2f\n", 2085 max_strlen, max_strlen, ns_ctx->entry->name, worker->lcore, 2086 io_per_second, mb_per_second, 2087 average_latency, min_latency, max_latency); 2088 total_io_per_second += io_per_second; 2089 total_mb_per_second += mb_per_second; 2090 total_io_completed += ns_ctx->stats.io_completed; 2091 total_io_tsc += ns_ctx->stats.total_tsc; 2092 ns_count++; 2093 } 2094 } 2095 } 2096 2097 if (ns_count != 0 && total_io_completed) { 2098 sum_ave_latency = ((double)total_io_tsc / total_io_completed) * 1000 * 1000 / g_tsc_rate; 2099 printf("========================================================\n"); 2100 printf("%-*s: %10.2f %10.2f %10.2f %10.2f %10.2f\n", 2101 max_strlen + 13, "Total", total_io_per_second, total_mb_per_second, 2102 sum_ave_latency, min_latency_so_far, max_latency_so_far); 2103 printf("\n"); 2104 } 2105 2106 if (g_latency_sw_tracking_level == 0 || total_io_completed == 0) { 2107 return; 2108 } 2109 2110 TAILQ_FOREACH(worker, &g_workers, link) { 2111 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 2112 const double *cutoff = g_latency_cutoffs; 2113 2114 printf("Summary latency data for %-43.43s from core %u:\n", ns_ctx->entry->name, worker->lcore); 2115 printf("=================================================================================\n"); 2116 2117 spdk_histogram_data_iterate(ns_ctx->histogram, check_cutoff, &cutoff); 2118 2119 printf("\n"); 2120 } 2121 } 2122 2123 if (g_latency_sw_tracking_level == 1) { 2124 return; 2125 } 2126 2127 TAILQ_FOREACH(worker, &g_workers, link) { 2128 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 2129 printf("Latency histogram for %-43.43s from core %u:\n", ns_ctx->entry->name, worker->lcore); 2130 printf("==============================================================================\n"); 2131 printf(" Range in us Cumulative IO count\n"); 2132 2133 spdk_histogram_data_iterate(ns_ctx->histogram, print_bucket, NULL); 2134 printf("\n"); 2135 } 2136 } 2137 2138 } 2139 2140 static void 2141 print_latency_page(struct ctrlr_entry *entry) 2142 { 2143 int i; 2144 2145 printf("\n"); 2146 printf("%s\n", entry->name); 2147 printf("--------------------------------------------------------\n"); 2148 2149 for (i = 0; i < 32; i++) { 2150 if (entry->latency_page->buckets_32us[i]) { 2151 printf("Bucket %dus - %dus: %d\n", i * 32, (i + 1) * 32, entry->latency_page->buckets_32us[i]); 2152 } 2153 } 2154 for (i = 0; i < 31; i++) { 2155 if (entry->latency_page->buckets_1ms[i]) { 2156 printf("Bucket %dms - %dms: %d\n", i + 1, i + 2, entry->latency_page->buckets_1ms[i]); 2157 } 2158 } 2159 for (i = 0; i < 31; i++) { 2160 if (entry->latency_page->buckets_32ms[i]) 2161 printf("Bucket %dms - %dms: %d\n", (i + 1) * 32, (i + 2) * 32, 2162 entry->latency_page->buckets_32ms[i]); 2163 } 2164 } 2165 2166 static void 2167 print_latency_statistics(const char *op_name, enum spdk_nvme_intel_log_page log_page) 2168 { 2169 struct ctrlr_entry *ctrlr; 2170 2171 printf("%s Latency Statistics:\n", op_name); 2172 printf("========================================================\n"); 2173 TAILQ_FOREACH(ctrlr, &g_controllers, link) { 2174 if 
(spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) { 2175 if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr->ctrlr, log_page, SPDK_NVME_GLOBAL_NS_TAG, 2176 ctrlr->latency_page, sizeof(struct spdk_nvme_intel_rw_latency_page), 0, 2177 enable_latency_tracking_complete, 2178 NULL)) { 2179 printf("nvme_ctrlr_cmd_get_log_page() failed\n"); 2180 exit(1); 2181 } 2182 2183 g_outstanding_commands++; 2184 } else { 2185 printf("Controller %s: %s latency statistics not supported\n", ctrlr->name, op_name); 2186 } 2187 } 2188 2189 while (g_outstanding_commands) { 2190 TAILQ_FOREACH(ctrlr, &g_controllers, link) { 2191 spdk_nvme_ctrlr_process_admin_completions(ctrlr->ctrlr); 2192 } 2193 } 2194 2195 TAILQ_FOREACH(ctrlr, &g_controllers, link) { 2196 if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) { 2197 print_latency_page(ctrlr); 2198 } 2199 } 2200 printf("\n"); 2201 } 2202 2203 static void 2204 print_stats(void) 2205 { 2206 print_performance(); 2207 if (g_latency_ssd_tracking_enable) { 2208 if (g_rw_percentage != 0) { 2209 print_latency_statistics("Read", SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY); 2210 } 2211 if (g_rw_percentage != 100) { 2212 print_latency_statistics("Write", SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY); 2213 } 2214 } 2215 } 2216 2217 static void 2218 unregister_trids(void) 2219 { 2220 struct trid_entry *trid_entry, *tmp; 2221 2222 TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, tmp) { 2223 TAILQ_REMOVE(&g_trid_list, trid_entry, tailq); 2224 free(trid_entry); 2225 } 2226 } 2227 2228 static int 2229 add_trid(const char *trid_str) 2230 { 2231 struct trid_entry *trid_entry; 2232 struct spdk_nvme_transport_id *trid; 2233 char *ns; 2234 char *hostnqn; 2235 2236 trid_entry = calloc(1, sizeof(*trid_entry)); 2237 if (trid_entry == NULL) { 2238 return -1; 2239 } 2240 2241 trid = &trid_entry->trid; 2242 trid->trtype = SPDK_NVME_TRANSPORT_PCIE; 2243 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 2244 2245 if (spdk_nvme_transport_id_parse(trid, trid_str) != 0) { 2246 fprintf(stderr, "Invalid transport ID format '%s'\n", trid_str); 2247 free(trid_entry); 2248 return 1; 2249 } 2250 2251 if ((ns = strcasestr(trid_str, "ns:")) || 2252 (ns = strcasestr(trid_str, "ns="))) { 2253 char nsid_str[6]; /* 5 digits maximum in an nsid */ 2254 int len; 2255 int nsid; 2256 2257 ns += 3; 2258 2259 len = strcspn(ns, " \t\n"); 2260 if (len > 5) { 2261 fprintf(stderr, "NVMe namespace IDs must be 5 digits or less\n"); 2262 free(trid_entry); 2263 return 1; 2264 } 2265 2266 memcpy(nsid_str, ns, len); 2267 nsid_str[len] = '\0'; 2268 2269 nsid = spdk_strtol(nsid_str, 10); 2270 if (nsid <= 0 || nsid > 65535) { 2271 fprintf(stderr, "NVMe namespace IDs must be less than 65536 and greater than 0\n"); 2272 free(trid_entry); 2273 return 1; 2274 } 2275 2276 trid_entry->nsid = (uint16_t)nsid; 2277 } 2278 2279 if ((hostnqn = strcasestr(trid_str, "hostnqn:")) || 2280 (hostnqn = strcasestr(trid_str, "hostnqn="))) { 2281 size_t len; 2282 2283 hostnqn += strlen("hostnqn:"); 2284 2285 len = strcspn(hostnqn, " \t\n"); 2286 if (len > (sizeof(trid_entry->hostnqn) - 1)) { 2287 fprintf(stderr, "Host NQN is too long\n"); 2288 free(trid_entry); 2289 return 1; 2290 } 2291 2292 memcpy(trid_entry->hostnqn, hostnqn, len); 2293 trid_entry->hostnqn[len] = '\0'; 2294 } 2295 2296 TAILQ_INSERT_TAIL(&g_trid_list, trid_entry, tailq); 2297 return 0; 2298 } 2299 2300 static int 2301 add_allowed_pci_device(const char *bdf_str, struct spdk_env_opts *env_opts) 2302 { 2303 int rc; 2304 2305 if 
(env_opts->num_pci_addr >= MAX_ALLOWED_PCI_DEVICE_NUM) { 2306 fprintf(stderr, "Currently we only support allowed PCI device num=%d\n", 2307 MAX_ALLOWED_PCI_DEVICE_NUM); 2308 return -1; 2309 } 2310 2311 rc = spdk_pci_addr_parse(&env_opts->pci_allowed[env_opts->num_pci_addr], bdf_str); 2312 if (rc < 0) { 2313 fprintf(stderr, "Failed to parse the given bdf_str=%s\n", bdf_str); 2314 return -1; 2315 } 2316 2317 env_opts->num_pci_addr++; 2318 return 0; 2319 } 2320 2321 static size_t 2322 parse_next_key(const char **str, char *key, char *val, size_t key_buf_size, 2323 size_t val_buf_size) 2324 { 2325 const char *sep; 2326 const char *separator = ", \t\n"; 2327 size_t key_len, val_len; 2328 2329 *str += strspn(*str, separator); 2330 2331 sep = strchr(*str, '='); 2332 if (!sep) { 2333 fprintf(stderr, "Key without '=' separator\n"); 2334 return 0; 2335 } 2336 2337 key_len = sep - *str; 2338 if (key_len >= key_buf_size) { 2339 fprintf(stderr, "Key length %zu is greater than maximum allowed %zu\n", 2340 key_len, key_buf_size - 1); 2341 return 0; 2342 } 2343 2344 memcpy(key, *str, key_len); 2345 key[key_len] = '\0'; 2346 2347 *str += key_len + 1; /* Skip key */ 2348 val_len = strcspn(*str, separator); 2349 if (val_len == 0) { 2350 fprintf(stderr, "Key without value\n"); 2351 return 0; 2352 } 2353 2354 if (val_len >= val_buf_size) { 2355 fprintf(stderr, "Value length %zu is greater than maximum allowed %zu\n", 2356 val_len, val_buf_size - 1); 2357 return 0; 2358 } 2359 2360 memcpy(val, *str, val_len); 2361 val[val_len] = '\0'; 2362 2363 *str += val_len; 2364 2365 return val_len; 2366 } 2367 2368 static int 2369 parse_metadata(const char *metacfg_str) 2370 { 2371 const char *str; 2372 size_t val_len; 2373 char key[32]; 2374 char val[1024]; 2375 2376 if (metacfg_str == NULL) { 2377 return -EINVAL; 2378 } 2379 2380 str = metacfg_str; 2381 2382 while (*str != '\0') { 2383 val_len = parse_next_key(&str, key, val, sizeof(key), sizeof(val)); 2384 if (val_len == 0) { 2385 fprintf(stderr, "Failed to parse metadata\n"); 2386 return -EINVAL; 2387 } 2388 2389 if (strcmp(key, "PRACT") == 0) { 2390 if (*val == '1') { 2391 g_metacfg_pract_flag = SPDK_NVME_IO_FLAGS_PRACT; 2392 } 2393 } else if (strcmp(key, "PRCHK") == 0) { 2394 if (strstr(val, "GUARD") != NULL) { 2395 g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_GUARD; 2396 } 2397 if (strstr(val, "REFTAG") != NULL) { 2398 g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG; 2399 } 2400 if (strstr(val, "APPTAG") != NULL) { 2401 g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_APPTAG; 2402 } 2403 } else { 2404 fprintf(stderr, "Unknown key '%s'\n", key); 2405 } 2406 } 2407 2408 return 0; 2409 } 2410 2411 #define PERF_GETOPT_SHORT "a:b:c:d:e:ghi:lmo:q:r:k:s:t:w:z:A:C:DF:GHILM:NO:P:Q:RS:T:U:VZ:" 2412 2413 static const struct option g_perf_cmdline_opts[] = { 2414 #define PERF_WARMUP_TIME 'a' 2415 {"warmup-time", required_argument, NULL, PERF_WARMUP_TIME}, 2416 #define PERF_ALLOWED_PCI_ADDR 'b' 2417 {"allowed-pci-addr", required_argument, NULL, PERF_ALLOWED_PCI_ADDR}, 2418 #define PERF_CORE_MASK 'c' 2419 {"core-mask", required_argument, NULL, PERF_CORE_MASK}, 2420 #define PERF_METADATA 'e' 2421 {"metadata", required_argument, NULL, PERF_METADATA}, 2422 #define PERF_MEM_SINGL_SEG 'g' 2423 {"mem-single-seg", no_argument, NULL, PERF_MEM_SINGL_SEG}, 2424 #define PERF_HELP 'h' 2425 {"help", no_argument, NULL, PERF_HELP}, 2426 #define PERF_SHMEM_GROUP_ID 'i' 2427 {"shmem-grp-id", required_argument, NULL, PERF_SHMEM_GROUP_ID}, 2428 #define PERF_ENABLE_SSD_LATENCY_TRACING 
'l' 2429 {"enable-ssd-latency-tracking", no_argument, NULL, PERF_ENABLE_SSD_LATENCY_TRACING}, 2430 #define PERF_CPU_USAGE 'm' 2431 {"cpu-usage", no_argument, NULL, PERF_CPU_USAGE}, 2432 #define PERF_IO_SIZE 'o' 2433 {"io-size", required_argument, NULL, PERF_IO_SIZE}, 2434 #define PERF_IO_DEPTH 'q' 2435 {"io-depth", required_argument, NULL, PERF_IO_DEPTH}, 2436 #define PERF_TRANSPORT 'r' 2437 {"transport", required_argument, NULL, PERF_TRANSPORT}, 2438 #define PERF_KEEPALIVE 'k' 2439 {"keepalive", required_argument, NULL, PERF_KEEPALIVE}, 2440 #define PERF_HUGEMEM_SIZE 's' 2441 {"hugemem-size", required_argument, NULL, PERF_HUGEMEM_SIZE}, 2442 #define PERF_TIME 't' 2443 {"time", required_argument, NULL, PERF_TIME}, 2444 #define PERF_NUMBER_IOS 'd' 2445 {"number-ios", required_argument, NULL, PERF_NUMBER_IOS}, 2446 #define PERF_IO_PATTERN 'w' 2447 {"io-pattern", required_argument, NULL, PERF_IO_PATTERN}, 2448 #define PERF_DISABLE_ZCOPY 'z' 2449 {"disable-zcopy", required_argument, NULL, PERF_DISABLE_ZCOPY}, 2450 #define PERF_BUFFER_ALIGNMENT 'A' 2451 {"buffer-alignment", required_argument, NULL, PERF_BUFFER_ALIGNMENT}, 2452 #define PERF_MAX_COMPLETIONS_PER_POLL 'C' 2453 {"max-completion-per-poll", required_argument, NULL, PERF_MAX_COMPLETIONS_PER_POLL}, 2454 #define PERF_DISABLE_SQ_CMB 'D' 2455 {"disable-sq-cmb", no_argument, NULL, PERF_DISABLE_SQ_CMB}, 2456 #define PERF_ZIPF 'F' 2457 {"zipf", required_argument, NULL, PERF_ZIPF}, 2458 #define PERF_ENABLE_DEBUG 'G' 2459 {"enable-debug", no_argument, NULL, PERF_ENABLE_DEBUG}, 2460 #define PERF_ENABLE_TCP_HDGST 'H' 2461 {"enable-tcp-hdgst", no_argument, NULL, PERF_ENABLE_TCP_HDGST}, 2462 #define PERF_ENABLE_TCP_DDGST 'I' 2463 {"enable-tcp-ddgst", no_argument, NULL, PERF_ENABLE_TCP_DDGST}, 2464 #define PERF_ENABLE_SW_LATENCY_TRACING 'L' 2465 {"enable-sw-latency-tracking", no_argument, NULL, PERF_ENABLE_SW_LATENCY_TRACING}, 2466 #define PERF_RW_MIXREAD 'M' 2467 {"rwmixread", required_argument, NULL, PERF_RW_MIXREAD}, 2468 #define PERF_NO_SHST_NOTIFICATION 'N' 2469 {"no-shst-notification", no_argument, NULL, PERF_NO_SHST_NOTIFICATION}, 2470 #define PERF_IO_UNIT_SIZE 'O' 2471 {"io-unit-size", required_argument, NULL, PERF_IO_UNIT_SIZE}, 2472 #define PERF_IO_QUEUES_PER_NS 'P' 2473 {"num-qpairs", required_argument, NULL, PERF_IO_QUEUES_PER_NS}, 2474 #define PERF_CONTINUE_ON_ERROR 'Q' 2475 {"continue-on-error", required_argument, NULL, PERF_CONTINUE_ON_ERROR}, 2476 #define PERF_ENABLE_URING 'R' 2477 {"enable-uring", no_argument, NULL, PERF_ENABLE_URING}, 2478 #define PERF_DEFAULT_SOCK_IMPL 'S' 2479 {"default-sock-impl", required_argument, NULL, PERF_DEFAULT_SOCK_IMPL}, 2480 #define PERF_LOG_FLAG 'T' 2481 {"logflag", required_argument, NULL, PERF_LOG_FLAG}, 2482 #define PERF_NUM_UNUSED_IO_QPAIRS 'U' 2483 {"num-unused-qpairs", required_argument, NULL, PERF_NUM_UNUSED_IO_QPAIRS}, 2484 #define PERF_ENABLE_VMD 'V' 2485 {"enable-vmd", no_argument, NULL, PERF_ENABLE_VMD}, 2486 #define PERF_ENABLE_ZCOPY 'Z' 2487 {"enable-zcopy", required_argument, NULL, PERF_ENABLE_ZCOPY}, 2488 #define PERF_TRANSPORT_STATISTICS 257 2489 {"transport-stats", no_argument, NULL, PERF_TRANSPORT_STATISTICS}, 2490 #define PERF_IOVA_MODE 258 2491 {"iova-mode", required_argument, NULL, PERF_IOVA_MODE}, 2492 #define PERF_IO_QUEUE_SIZE 259 2493 {"io-queue-size", required_argument, NULL, PERF_IO_QUEUE_SIZE}, 2494 #define PERF_DISABLE_KTLS 260 2495 {"disable-ktls", no_argument, NULL, PERF_DISABLE_KTLS}, 2496 #define PERF_ENABLE_KTLS 261 2497 {"enable-ktls", no_argument, NULL, 
PERF_ENABLE_KTLS}, 2498 #define PERF_TLS_VERSION 262 2499 {"tls-version", required_argument, NULL, PERF_TLS_VERSION}, 2500 #define PERF_PSK_PATH 263 2501 {"psk-path", required_argument, NULL, PERF_PSK_PATH}, 2502 #define PERF_PSK_IDENTITY 264 2503 {"psk-identity ", required_argument, NULL, PERF_PSK_IDENTITY}, 2504 #define PERF_ZEROCOPY_THRESHOLD 265 2505 {"zerocopy-threshold", required_argument, NULL, PERF_ZEROCOPY_THRESHOLD}, 2506 #define PERF_SOCK_IMPL 266 2507 {"zerocopy-threshold-sock-impl", required_argument, NULL, PERF_SOCK_IMPL}, 2508 #define PERF_TRANSPORT_TOS 267 2509 {"transport-tos", required_argument, NULL, PERF_TRANSPORT_TOS}, 2510 #define PERF_RDMA_SRQ_SIZE 268 2511 {"rdma-srq-size", required_argument, NULL, PERF_RDMA_SRQ_SIZE}, 2512 #define PERF_USE_EVERY_CORE 269 2513 {"use-every-core", no_argument, NULL, PERF_USE_EVERY_CORE}, 2514 #define PERF_NO_HUGE 270 2515 {"no-huge", no_argument, NULL, PERF_NO_HUGE}, 2516 /* Should be the last element */ 2517 {0, 0, 0, 0} 2518 }; 2519 2520 static int 2521 parse_args(int argc, char **argv, struct spdk_env_opts *env_opts) 2522 { 2523 int op, long_idx; 2524 long int val; 2525 uint64_t val_u64; 2526 int rc; 2527 char *endptr; 2528 bool ssl_used = false; 2529 char *sock_impl = "posix"; 2530 2531 while ((op = getopt_long(argc, argv, PERF_GETOPT_SHORT, g_perf_cmdline_opts, &long_idx)) != -1) { 2532 switch (op) { 2533 case PERF_WARMUP_TIME: 2534 case PERF_SHMEM_GROUP_ID: 2535 case PERF_MAX_COMPLETIONS_PER_POLL: 2536 case PERF_IO_QUEUES_PER_NS: 2537 case PERF_KEEPALIVE: 2538 case PERF_TIME: 2539 case PERF_RW_MIXREAD: 2540 case PERF_NUM_UNUSED_IO_QPAIRS: 2541 case PERF_CONTINUE_ON_ERROR: 2542 case PERF_RDMA_SRQ_SIZE: 2543 val = spdk_strtol(optarg, 10); 2544 if (val < 0) { 2545 fprintf(stderr, "Converting a string to integer failed\n"); 2546 return val; 2547 } 2548 switch (op) { 2549 case PERF_WARMUP_TIME: 2550 g_warmup_time_in_sec = val; 2551 break; 2552 case PERF_SHMEM_GROUP_ID: 2553 env_opts->shm_id = val; 2554 break; 2555 case PERF_MAX_COMPLETIONS_PER_POLL: 2556 g_max_completions = val; 2557 break; 2558 case PERF_IO_QUEUES_PER_NS: 2559 g_nr_io_queues_per_ns = val; 2560 break; 2561 case PERF_KEEPALIVE: 2562 g_keep_alive_timeout_in_ms = val; 2563 break; 2564 case PERF_TIME: 2565 g_time_in_sec = val; 2566 break; 2567 case PERF_RW_MIXREAD: 2568 g_rw_percentage = val; 2569 g_mix_specified = true; 2570 break; 2571 case PERF_CONTINUE_ON_ERROR: 2572 g_quiet_count = val; 2573 g_continue_on_error = true; 2574 break; 2575 case PERF_NUM_UNUSED_IO_QPAIRS: 2576 g_nr_unused_io_queues = val; 2577 break; 2578 case PERF_RDMA_SRQ_SIZE: 2579 g_rdma_srq_size = val; 2580 break; 2581 } 2582 break; 2583 case PERF_IO_SIZE: 2584 case PERF_IO_UNIT_SIZE: 2585 case PERF_ZEROCOPY_THRESHOLD: 2586 case PERF_BUFFER_ALIGNMENT: 2587 case PERF_HUGEMEM_SIZE: 2588 case PERF_NUMBER_IOS: 2589 case PERF_IO_DEPTH: 2590 case PERF_IO_QUEUE_SIZE: 2591 rc = spdk_parse_capacity(optarg, &val_u64, NULL); 2592 if (rc != 0) { 2593 fprintf(stderr, "Converting a string to integer failed\n"); 2594 return 1; 2595 } 2596 switch (op) { 2597 case PERF_IO_SIZE: 2598 g_io_size_bytes = (uint32_t)val_u64; 2599 break; 2600 case PERF_IO_UNIT_SIZE: 2601 g_io_unit_size = (uint32_t)val_u64; 2602 break; 2603 case PERF_ZEROCOPY_THRESHOLD: 2604 g_sock_zcopy_threshold = (uint32_t)val_u64; 2605 break; 2606 case PERF_IO_DEPTH: 2607 g_queue_depth = (uint32_t)val_u64; 2608 break; 2609 case PERF_IO_QUEUE_SIZE: 2610 g_io_queue_size = (uint32_t)val_u64; 2611 break; 2612 case PERF_BUFFER_ALIGNMENT: 2613 g_io_align = 
(uint32_t)val_u64;
2614 if (!spdk_u32_is_pow2(g_io_align) || g_io_align < SPDK_CACHE_LINE_SIZE) {
2615 fprintf(stderr, "Wrong alignment %u. Must be power of 2 and not less than cache line (%u)\n",
2616 g_io_align, SPDK_CACHE_LINE_SIZE);
2617 usage(argv[0]);
2618 return 1;
2619 }
2620 g_io_align_specified = true;
2621 break;
2622 case PERF_HUGEMEM_SIZE:
2623 env_opts->mem_size = (int)val_u64;
2624 break;
2625 case PERF_NUMBER_IOS:
2626 g_number_ios = val_u64;
2627 break;
2628 }
2629 break;
2630 case PERF_ZIPF:
2631 errno = 0;
2632 g_zipf_theta = strtod(optarg, &endptr);
2633 if (errno || optarg == endptr || g_zipf_theta < 0) {
2634 fprintf(stderr, "Illegal zipf theta value %s\n", optarg);
2635 return 1;
2636 }
2637 break;
2638 case PERF_ALLOWED_PCI_ADDR:
2639 if (add_allowed_pci_device(optarg, env_opts)) {
2640 usage(argv[0]);
2641 return 1;
2642 }
2643 break;
2644 case PERF_CORE_MASK:
2645 env_opts->core_mask = optarg;
2646 break;
2647 case PERF_METADATA:
2648 if (parse_metadata(optarg)) {
2649 usage(argv[0]);
2650 return 1;
2651 }
2652 break;
2653 case PERF_MEM_SINGL_SEG:
2654 env_opts->hugepage_single_segments = true;
2655 break;
2656 case PERF_ENABLE_SSD_LATENCY_TRACING:
2657 g_latency_ssd_tracking_enable = true;
2658 break;
2659 case PERF_CPU_USAGE:
2660 g_monitor_perf_cores = true;
2661 break;
2662 case PERF_TRANSPORT:
2663 if (add_trid(optarg)) {
2664 usage(argv[0]);
2665 return 1;
2666 }
2667 break;
2668 case PERF_IO_PATTERN:
2669 g_workload_type = optarg;
2670 break;
2671 case PERF_DISABLE_SQ_CMB:
2672 g_disable_sq_cmb = 1;
2673 break;
2674 case PERF_ENABLE_DEBUG:
2675 #ifndef DEBUG
2676 fprintf(stderr, "%s must be configured with --enable-debug for -G flag\n",
2677 argv[0]);
2678 usage(argv[0]);
2679 return 1;
2680 #else
2681 spdk_log_set_flag("nvme");
2682 spdk_log_set_print_level(SPDK_LOG_DEBUG);
2683 break;
2684 #endif
2685 case PERF_ENABLE_TCP_HDGST:
2686 g_header_digest = 1;
2687 break;
2688 case PERF_ENABLE_TCP_DDGST:
2689 g_data_digest = 1;
2690 break;
2691 case PERF_ENABLE_SW_LATENCY_TRACING:
2692 g_latency_sw_tracking_level++;
2693 break;
2694 case PERF_NO_SHST_NOTIFICATION:
2695 g_no_shn_notification = true;
2696 break;
2697 case PERF_ENABLE_URING:
2698 #ifndef SPDK_CONFIG_URING
2699 fprintf(stderr, "%s must be rebuilt with CONFIG_URING=y for -R flag.\n",
2700 argv[0]);
2701 usage(argv[0]);
2702 return 1;
2703 #endif
2704 g_use_uring = true;
2705 break;
2706 case PERF_LOG_FLAG:
2707 rc = spdk_log_set_flag(optarg);
2708 if (rc < 0) {
2709 fprintf(stderr, "unknown flag\n");
2710 usage(argv[0]);
2711 exit(EXIT_FAILURE);
2712 }
2713 #ifdef DEBUG
2714 spdk_log_set_print_level(SPDK_LOG_DEBUG);
2715 #endif
2716 break;
2717 case PERF_ENABLE_VMD:
2718 g_vmd = true;
2719 break;
2720 case PERF_DISABLE_KTLS:
2721 ssl_used = true;
2722 perf_set_sock_opts("ssl", "ktls", 0, NULL);
2723 break;
2724 case PERF_ENABLE_KTLS:
2725 ssl_used = true;
2726 perf_set_sock_opts("ssl", "ktls", 1, NULL);
2727 break;
2728 case PERF_TLS_VERSION:
2729 ssl_used = true;
2730 val = spdk_strtol(optarg, 10);
2731 if (val < 0) {
2732 fprintf(stderr, "Illegal tls version value %s\n", optarg);
2733 return val;
2734 }
2735 perf_set_sock_opts("ssl", "tls_version", val, NULL);
2736 break;
2737 case PERF_PSK_PATH:
2738 ssl_used = true;
2739 perf_set_sock_opts("ssl", "psk_path", 0, optarg);
2740 break;
2741 case PERF_PSK_IDENTITY:
2742 ssl_used = true;
2743 perf_set_sock_opts("ssl", "psk_identity", 0, optarg);
2744 break;
2745 case PERF_DISABLE_ZCOPY:
2746 perf_set_sock_opts(optarg, "enable_zerocopy_send_client", 0,
NULL); 2747 break; 2748 case PERF_ENABLE_ZCOPY: 2749 perf_set_sock_opts(optarg, "enable_zerocopy_send_client", 1, NULL); 2750 break; 2751 case PERF_USE_EVERY_CORE: 2752 g_use_every_core = true; 2753 break; 2754 case PERF_DEFAULT_SOCK_IMPL: 2755 sock_impl = optarg; 2756 rc = spdk_sock_set_default_impl(optarg); 2757 if (rc) { 2758 fprintf(stderr, "Failed to set sock impl %s, err %d (%s)\n", optarg, errno, strerror(errno)); 2759 return 1; 2760 } 2761 break; 2762 case PERF_TRANSPORT_STATISTICS: 2763 g_dump_transport_stats = true; 2764 break; 2765 case PERF_IOVA_MODE: 2766 env_opts->iova_mode = optarg; 2767 break; 2768 case PERF_SOCK_IMPL: 2769 g_sock_threshold_impl = optarg; 2770 break; 2771 case PERF_TRANSPORT_TOS: 2772 val = spdk_strtol(optarg, 10); 2773 if (val < 0) { 2774 fprintf(stderr, "Invalid TOS value\n"); 2775 return 1; 2776 } 2777 g_transport_tos = val; 2778 break; 2779 case PERF_NO_HUGE: 2780 env_opts->no_huge = true; 2781 break; 2782 case PERF_HELP: 2783 usage(argv[0]); 2784 return HELP_RETURN_CODE; 2785 default: 2786 usage(argv[0]); 2787 return 1; 2788 } 2789 } 2790 2791 if (!g_nr_io_queues_per_ns) { 2792 usage(argv[0]); 2793 return 1; 2794 } 2795 2796 if (!g_queue_depth) { 2797 fprintf(stderr, "missing -q (--io-depth) operand\n"); 2798 usage(argv[0]); 2799 return 1; 2800 } 2801 if (!g_io_size_bytes) { 2802 fprintf(stderr, "missing -o (--io-size) operand\n"); 2803 usage(argv[0]); 2804 return 1; 2805 } 2806 if (!g_io_unit_size || g_io_unit_size % 4) { 2807 fprintf(stderr, "io unit size can not be 0 or non 4-byte aligned\n"); 2808 return 1; 2809 } 2810 if (!g_workload_type) { 2811 fprintf(stderr, "missing -w (--io-pattern) operand\n"); 2812 usage(argv[0]); 2813 return 1; 2814 } 2815 if (!g_time_in_sec) { 2816 fprintf(stderr, "missing -t (--time) operand\n"); 2817 usage(argv[0]); 2818 return 1; 2819 } 2820 if (!g_quiet_count) { 2821 fprintf(stderr, "-Q (--continue-on-error) value must be greater than 0\n"); 2822 usage(argv[0]); 2823 return 1; 2824 } 2825 2826 if (strncmp(g_workload_type, "rand", 4) == 0) { 2827 g_is_random = 1; 2828 g_workload_type = &g_workload_type[4]; 2829 } 2830 2831 if (ssl_used && strncmp(sock_impl, "ssl", 3) != 0) { 2832 fprintf(stderr, "sock impl is not SSL but tried to use one of the SSL only options\n"); 2833 usage(argv[0]); 2834 return 1; 2835 } 2836 2837 2838 if (strcmp(g_workload_type, "read") == 0 || strcmp(g_workload_type, "write") == 0) { 2839 g_rw_percentage = strcmp(g_workload_type, "read") == 0 ? 100 : 0; 2840 if (g_mix_specified) { 2841 fprintf(stderr, "Ignoring -M (--rwmixread) option... 
Please use -M option" 2842 " only when using rw or randrw.\n"); 2843 } 2844 } else if (strcmp(g_workload_type, "rw") == 0) { 2845 if (g_rw_percentage < 0 || g_rw_percentage > 100) { 2846 fprintf(stderr, 2847 "-M (--rwmixread) must be specified to value from 0 to 100 " 2848 "for rw or randrw.\n"); 2849 return 1; 2850 } 2851 } else { 2852 fprintf(stderr, 2853 "-w (--io-pattern) io pattern type must be one of\n" 2854 "(read, write, randread, randwrite, rw, randrw)\n"); 2855 return 1; 2856 } 2857 2858 if (g_sock_zcopy_threshold > 0) { 2859 if (!g_sock_threshold_impl) { 2860 fprintf(stderr, 2861 "--zerocopy-threshold must be set with sock implementation specified(--zerocopy-threshold-sock-impl <impl>)\n"); 2862 return 1; 2863 } 2864 2865 perf_set_sock_opts(g_sock_threshold_impl, "zerocopy_threshold", g_sock_zcopy_threshold, NULL); 2866 } 2867 2868 if (g_number_ios && g_warmup_time_in_sec) { 2869 fprintf(stderr, "-d (--number-ios) with -a (--warmup-time) is not supported\n"); 2870 return 1; 2871 } 2872 2873 if (g_number_ios && g_number_ios < g_queue_depth) { 2874 fprintf(stderr, "-d (--number-ios) less than -q (--io-depth) is not supported\n"); 2875 return 1; 2876 } 2877 2878 if (g_rdma_srq_size != 0) { 2879 struct spdk_nvme_transport_opts opts; 2880 2881 spdk_nvme_transport_get_opts(&opts, sizeof(opts)); 2882 opts.rdma_srq_size = g_rdma_srq_size; 2883 2884 rc = spdk_nvme_transport_set_opts(&opts, sizeof(opts)); 2885 if (rc != 0) { 2886 fprintf(stderr, "Failed to set NVMe transport options.\n"); 2887 return 1; 2888 } 2889 } 2890 2891 if (TAILQ_EMPTY(&g_trid_list)) { 2892 /* If no transport IDs specified, default to enumerating all local PCIe devices */ 2893 add_trid("trtype:PCIe"); 2894 } else { 2895 struct trid_entry *trid_entry, *trid_entry_tmp; 2896 2897 env_opts->no_pci = true; 2898 /* check whether there is local PCIe type */ 2899 TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, trid_entry_tmp) { 2900 if (trid_entry->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 2901 env_opts->no_pci = false; 2902 break; 2903 } 2904 } 2905 } 2906 2907 g_file_optind = optind; 2908 2909 return 0; 2910 } 2911 2912 static int 2913 register_workers(void) 2914 { 2915 uint32_t i; 2916 struct worker_thread *worker; 2917 2918 SPDK_ENV_FOREACH_CORE(i) { 2919 worker = calloc(1, sizeof(*worker)); 2920 if (worker == NULL) { 2921 fprintf(stderr, "Unable to allocate worker\n"); 2922 return -1; 2923 } 2924 2925 TAILQ_INIT(&worker->ns_ctx); 2926 worker->lcore = i; 2927 TAILQ_INSERT_TAIL(&g_workers, worker, link); 2928 g_num_workers++; 2929 } 2930 2931 return 0; 2932 } 2933 2934 static void 2935 unregister_workers(void) 2936 { 2937 struct worker_thread *worker, *tmp_worker; 2938 struct ns_worker_ctx *ns_ctx, *tmp_ns_ctx; 2939 2940 /* Free namespace context and worker thread */ 2941 TAILQ_FOREACH_SAFE(worker, &g_workers, link, tmp_worker) { 2942 TAILQ_REMOVE(&g_workers, worker, link); 2943 2944 TAILQ_FOREACH_SAFE(ns_ctx, &worker->ns_ctx, link, tmp_ns_ctx) { 2945 TAILQ_REMOVE(&worker->ns_ctx, ns_ctx, link); 2946 spdk_histogram_data_free(ns_ctx->histogram); 2947 free(ns_ctx); 2948 } 2949 2950 free(worker); 2951 } 2952 } 2953 2954 static bool 2955 probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 2956 struct spdk_nvme_ctrlr_opts *opts) 2957 { 2958 struct trid_entry *trid_entry = cb_ctx; 2959 2960 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2961 if (g_disable_sq_cmb) { 2962 opts->use_cmb_sqs = false; 2963 } 2964 if (g_no_shn_notification) { 2965 opts->no_shn_notification = true; 2966 } 2967 } 2968 2969 if 
(trid->trtype != trid_entry->trid.trtype && 2970 strcasecmp(trid->trstring, trid_entry->trid.trstring)) { 2971 return false; 2972 } 2973 2974 opts->io_queue_size = g_io_queue_size; 2975 2976 /* Set the header and data_digest */ 2977 opts->header_digest = g_header_digest; 2978 opts->data_digest = g_data_digest; 2979 opts->keep_alive_timeout_ms = g_keep_alive_timeout_in_ms; 2980 memcpy(opts->hostnqn, trid_entry->hostnqn, sizeof(opts->hostnqn)); 2981 2982 opts->transport_tos = g_transport_tos; 2983 if (opts->num_io_queues < g_num_workers * g_nr_io_queues_per_ns) { 2984 opts->num_io_queues = g_num_workers * g_nr_io_queues_per_ns; 2985 } 2986 2987 if (g_psk != NULL) { 2988 memcpy(opts->psk, g_psk, strlen(g_psk)); 2989 } 2990 2991 return true; 2992 } 2993 2994 static void 2995 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 2996 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 2997 { 2998 struct trid_entry *trid_entry = cb_ctx; 2999 struct spdk_pci_addr pci_addr; 3000 struct spdk_pci_device *pci_dev; 3001 struct spdk_pci_id pci_id; 3002 3003 if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) { 3004 printf("Attached to NVMe over Fabrics controller at %s:%s: %s\n", 3005 trid->traddr, trid->trsvcid, 3006 trid->subnqn); 3007 } else { 3008 if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) { 3009 return; 3010 } 3011 3012 pci_dev = spdk_nvme_ctrlr_get_pci_device(ctrlr); 3013 if (!pci_dev) { 3014 return; 3015 } 3016 3017 pci_id = spdk_pci_device_get_id(pci_dev); 3018 3019 printf("Attached to NVMe Controller at %s [%04x:%04x]\n", 3020 trid->traddr, 3021 pci_id.vendor_id, pci_id.device_id); 3022 } 3023 3024 register_ctrlr(ctrlr, trid_entry); 3025 } 3026 3027 static int 3028 register_controllers(void) 3029 { 3030 struct trid_entry *trid_entry; 3031 3032 printf("Initializing NVMe Controllers\n"); 3033 3034 if (g_vmd && spdk_vmd_init()) { 3035 fprintf(stderr, "Failed to initialize VMD." 
3036 " Some NVMe devices can be unavailable.\n"); 3037 } 3038 3039 TAILQ_FOREACH(trid_entry, &g_trid_list, tailq) { 3040 if (spdk_nvme_probe(&trid_entry->trid, trid_entry, probe_cb, attach_cb, NULL) != 0) { 3041 fprintf(stderr, "spdk_nvme_probe() failed for transport address '%s'\n", 3042 trid_entry->trid.traddr); 3043 return -1; 3044 } 3045 } 3046 3047 return 0; 3048 } 3049 3050 static void 3051 unregister_controllers(void) 3052 { 3053 struct ctrlr_entry *entry, *tmp; 3054 struct spdk_nvme_detach_ctx *detach_ctx = NULL; 3055 3056 TAILQ_FOREACH_SAFE(entry, &g_controllers, link, tmp) { 3057 TAILQ_REMOVE(&g_controllers, entry, link); 3058 3059 spdk_dma_free(entry->latency_page); 3060 if (g_latency_ssd_tracking_enable && 3061 spdk_nvme_ctrlr_is_feature_supported(entry->ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) { 3062 set_latency_tracking_feature(entry->ctrlr, false); 3063 } 3064 3065 if (g_nr_unused_io_queues) { 3066 int i; 3067 3068 for (i = 0; i < g_nr_unused_io_queues; i++) { 3069 spdk_nvme_ctrlr_free_io_qpair(entry->unused_qpairs[i]); 3070 } 3071 3072 free(entry->unused_qpairs); 3073 } 3074 3075 spdk_nvme_detach_async(entry->ctrlr, &detach_ctx); 3076 free(entry); 3077 } 3078 3079 if (detach_ctx) { 3080 spdk_nvme_detach_poll(detach_ctx); 3081 } 3082 3083 if (g_vmd) { 3084 spdk_vmd_fini(); 3085 } 3086 } 3087 3088 static int 3089 allocate_ns_worker(struct ns_entry *entry, struct worker_thread *worker) 3090 { 3091 struct ns_worker_ctx *ns_ctx; 3092 3093 ns_ctx = calloc(1, sizeof(struct ns_worker_ctx)); 3094 if (!ns_ctx) { 3095 return -1; 3096 } 3097 3098 printf("Associating %s with lcore %d\n", entry->name, worker->lcore); 3099 ns_ctx->stats.min_tsc = UINT64_MAX; 3100 ns_ctx->entry = entry; 3101 ns_ctx->histogram = spdk_histogram_data_alloc(); 3102 TAILQ_INSERT_TAIL(&worker->ns_ctx, ns_ctx, link); 3103 3104 return 0; 3105 } 3106 3107 static int 3108 associate_workers_with_ns(void) 3109 { 3110 struct ns_entry *entry = TAILQ_FIRST(&g_namespaces); 3111 struct worker_thread *worker = TAILQ_FIRST(&g_workers); 3112 int i, count; 3113 3114 /* Each core contains single worker, and namespaces are associated as follows: 3115 * --use-every-core not specified (default): 3116 * 2) equal workers and namespaces - each worker associated with single namespace 3117 * 3) more workers than namespaces - each namespace is associated with one or more workers 3118 * 4) more namespaces than workers - each worker is associated with one or more namespaces 3119 * --use-every-core option enabled - every worker is associated with all namespaces 3120 */ 3121 if (g_use_every_core) { 3122 TAILQ_FOREACH(worker, &g_workers, link) { 3123 TAILQ_FOREACH(entry, &g_namespaces, link) { 3124 if (allocate_ns_worker(entry, worker) != 0) { 3125 return -1; 3126 } 3127 } 3128 } 3129 return 0; 3130 } 3131 3132 count = g_num_namespaces > g_num_workers ? 
g_num_namespaces : g_num_workers; 3133 3134 for (i = 0; i < count; i++) { 3135 if (entry == NULL) { 3136 break; 3137 } 3138 3139 if (allocate_ns_worker(entry, worker) != 0) { 3140 return -1; 3141 } 3142 3143 worker = TAILQ_NEXT(worker, link); 3144 if (worker == NULL) { 3145 worker = TAILQ_FIRST(&g_workers); 3146 } 3147 3148 entry = TAILQ_NEXT(entry, link); 3149 if (entry == NULL) { 3150 entry = TAILQ_FIRST(&g_namespaces); 3151 } 3152 3153 } 3154 3155 return 0; 3156 } 3157 3158 static void * 3159 nvme_poll_ctrlrs(void *arg) 3160 { 3161 struct ctrlr_entry *entry; 3162 int oldstate; 3163 int rc; 3164 3165 spdk_unaffinitize_thread(); 3166 3167 while (true) { 3168 pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate); 3169 3170 TAILQ_FOREACH(entry, &g_controllers, link) { 3171 if (entry->trtype != SPDK_NVME_TRANSPORT_PCIE) { 3172 rc = spdk_nvme_ctrlr_process_admin_completions(entry->ctrlr); 3173 if (spdk_unlikely(rc < 0 && !g_exit)) { 3174 g_exit = true; 3175 } 3176 } 3177 } 3178 3179 pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate); 3180 3181 /* This is a pthread cancellation point and cannot be removed. */ 3182 sleep(1); 3183 } 3184 3185 return NULL; 3186 } 3187 3188 static void 3189 sig_handler(int signo) 3190 { 3191 g_exit = true; 3192 } 3193 3194 static int 3195 setup_sig_handlers(void) 3196 { 3197 struct sigaction sigact = {}; 3198 int rc; 3199 3200 sigemptyset(&sigact.sa_mask); 3201 sigact.sa_handler = sig_handler; 3202 rc = sigaction(SIGINT, &sigact, NULL); 3203 if (rc < 0) { 3204 fprintf(stderr, "sigaction(SIGINT) failed, errno %d (%s)\n", errno, strerror(errno)); 3205 return -1; 3206 } 3207 3208 rc = sigaction(SIGTERM, &sigact, NULL); 3209 if (rc < 0) { 3210 fprintf(stderr, "sigaction(SIGTERM) failed, errno %d (%s)\n", errno, strerror(errno)); 3211 return -1; 3212 } 3213 3214 return 0; 3215 } 3216 3217 int 3218 main(int argc, char **argv) 3219 { 3220 int rc; 3221 struct worker_thread *worker, *main_worker; 3222 struct ns_worker_ctx *ns_ctx; 3223 struct spdk_env_opts opts; 3224 pthread_t thread_id = 0; 3225 3226 /* Use the runtime PID to set the random seed */ 3227 srand(getpid()); 3228 3229 opts.opts_size = sizeof(opts); 3230 spdk_env_opts_init(&opts); 3231 opts.name = "perf"; 3232 opts.pci_allowed = g_allowed_pci_addr; 3233 rc = parse_args(argc, argv, &opts); 3234 if (rc != 0 || rc == HELP_RETURN_CODE) { 3235 free(g_psk); 3236 if (rc == HELP_RETURN_CODE) { 3237 return 0; 3238 } 3239 3240 return rc; 3241 } 3242 /* Transport statistics are printed from each thread. 
3243 * To avoid mess in terminal, init and use mutex */ 3244 rc = pthread_mutex_init(&g_stats_mutex, NULL); 3245 if (rc != 0) { 3246 fprintf(stderr, "Failed to init mutex\n"); 3247 free(g_psk); 3248 return -1; 3249 } 3250 if (spdk_env_init(&opts) < 0) { 3251 fprintf(stderr, "Unable to initialize SPDK env\n"); 3252 unregister_trids(); 3253 pthread_mutex_destroy(&g_stats_mutex); 3254 free(g_psk); 3255 return -1; 3256 } 3257 3258 rc = setup_sig_handlers(); 3259 if (rc != 0) { 3260 rc = -1; 3261 goto cleanup; 3262 } 3263 3264 g_tsc_rate = spdk_get_ticks_hz(); 3265 3266 if (register_workers() != 0) { 3267 rc = -1; 3268 goto cleanup; 3269 } 3270 3271 #if defined(HAVE_LIBAIO) || defined(SPDK_CONFIG_URING) 3272 if (register_files(argc, argv) != 0) { 3273 rc = -1; 3274 goto cleanup; 3275 } 3276 #endif 3277 3278 if (register_controllers() != 0) { 3279 rc = -1; 3280 goto cleanup; 3281 } 3282 3283 if (g_warn) { 3284 printf("WARNING: Some requested NVMe devices were skipped\n"); 3285 } 3286 3287 if (g_num_namespaces == 0) { 3288 fprintf(stderr, "No valid NVMe controllers or AIO or URING devices found\n"); 3289 goto cleanup; 3290 } 3291 3292 if (g_num_workers > 1 && g_quiet_count > 1) { 3293 fprintf(stderr, "Error message rate-limiting enabled across multiple threads.\n"); 3294 fprintf(stderr, "Error suppression count may not be exact.\n"); 3295 } 3296 3297 rc = pthread_create(&thread_id, NULL, &nvme_poll_ctrlrs, NULL); 3298 if (rc != 0) { 3299 fprintf(stderr, "Unable to spawn a thread to poll admin queues.\n"); 3300 goto cleanup; 3301 } 3302 3303 if (associate_workers_with_ns() != 0) { 3304 rc = -1; 3305 goto cleanup; 3306 } 3307 3308 rc = pthread_barrier_init(&g_worker_sync_barrier, NULL, g_num_workers); 3309 if (rc != 0) { 3310 fprintf(stderr, "Unable to initialize thread sync barrier\n"); 3311 goto cleanup; 3312 } 3313 3314 printf("Initialization complete. Launching workers.\n"); 3315 3316 /* Launch all of the secondary workers */ 3317 g_main_core = spdk_env_get_current_core(); 3318 main_worker = NULL; 3319 TAILQ_FOREACH(worker, &g_workers, link) { 3320 if (worker->lcore != g_main_core) { 3321 spdk_env_thread_launch_pinned(worker->lcore, work_fn, worker); 3322 } else { 3323 assert(main_worker == NULL); 3324 main_worker = worker; 3325 } 3326 } 3327 3328 assert(main_worker != NULL); 3329 work_fn(main_worker); 3330 3331 spdk_env_thread_wait_all(); 3332 3333 print_stats(); 3334 3335 pthread_barrier_destroy(&g_worker_sync_barrier); 3336 3337 cleanup: 3338 fflush(stdout); 3339 3340 if (thread_id && pthread_cancel(thread_id) == 0) { 3341 pthread_join(thread_id, NULL); 3342 } 3343 3344 /* Collect errors from all workers and namespaces */ 3345 TAILQ_FOREACH(worker, &g_workers, link) { 3346 if (rc != 0) { 3347 break; 3348 } 3349 3350 TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) { 3351 if (ns_ctx->status != 0) { 3352 rc = ns_ctx->status; 3353 break; 3354 } 3355 } 3356 } 3357 3358 unregister_trids(); 3359 unregister_namespaces(); 3360 unregister_controllers(); 3361 unregister_workers(); 3362 3363 spdk_env_fini(); 3364 3365 free(g_psk); 3366 3367 pthread_mutex_destroy(&g_stats_mutex); 3368 3369 if (rc != 0) { 3370 fprintf(stderr, "%s: errors occurred\n", argv[0]); 3371 } 3372 3373 return rc; 3374 } 3375
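/*
 * Example invocations (illustrative sketch only, built from the options
 * documented in usage() above; the binary name depends on the build, and the
 * PCIe/RDMA addresses are the placeholder values from that help text, not
 * real hardware):
 *
 *   4 KiB random reads at queue depth 128 for 60 seconds against a local
 *   PCIe NVMe device:
 *     ./perf -q 128 -o 4096 -w randread -t 60 \
 *            -r 'trtype:PCIe traddr:0000:04:00.0'
 *
 *   70/30 random read/write mix with a 5 second warmup against an NVMe-oF
 *   RDMA target:
 *     ./perf -q 64 -o 4096 -w randrw -M 70 -t 60 -a 5 \
 *            -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420'
 *
 *   Write workload with protection information checks enabled (assumes the
 *   namespace is formatted with metadata/PI):
 *     ./perf -q 32 -o 4096 -w write -t 30 \
 *            -e 'PRACT=0,PRCHK=GUARD|REFTAG|APPTAG' \
 *            -r 'trtype:PCIe traddr:0000:04:00.0'
 */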