/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation.
 * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES.
 * All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"
#include "spdk/accel.h"
#include "spdk/endian.h"
#include "spdk/env.h"
#include "spdk/event.h"
#include "spdk/log.h"
#include "spdk/util.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/rpc.h"
#include "spdk/bit_array.h"
#include "spdk/conf.h"
#include "spdk/zipf.h"
#include "spdk/histogram_data.h"

#define BDEVPERF_CONFIG_MAX_FILENAME 1024
#define BDEVPERF_CONFIG_UNDEFINED -1
#define BDEVPERF_CONFIG_ERROR -2

struct bdevperf_task {
	struct iovec iov;
	struct bdevperf_job *job;
	struct spdk_bdev_io *bdev_io;
	void *buf;
	void *md_buf;
	uint64_t offset_blocks;
	struct bdevperf_task *task_to_abort;
	enum spdk_bdev_io_type io_type;
	TAILQ_ENTRY(bdevperf_task) link;
	struct spdk_bdev_io_wait_entry bdev_io_wait;
};

static const char *g_workload_type = NULL;
static int g_io_size = 0;
/* initialize to invalid value so we can detect if user overrides it. */
static int g_rw_percentage = -1;
static bool g_verify = false;
static bool g_reset = false;
static bool g_continue_on_failure = false;
static bool g_abort = false;
static bool g_error_to_exit = false;
static int g_queue_depth = 0;
static uint64_t g_time_in_usec;
static int g_show_performance_real_time = 0;
static uint64_t g_show_performance_period_in_usec = SPDK_SEC_TO_USEC;
static uint64_t g_show_performance_period_num = 0;
static uint64_t g_show_performance_ema_period = 0;
static int g_run_rc = 0;
static bool g_shutdown = false;
static uint64_t g_start_tsc;
static uint64_t g_shutdown_tsc;
static bool g_zcopy = false;
static struct spdk_thread *g_main_thread;
static int g_time_in_sec = 0;
static bool g_mix_specified = false;
static const char *g_job_bdev_name;
static bool g_wait_for_tests = false;
static struct spdk_jsonrpc_request *g_request = NULL;
static bool g_multithread_mode = false;
static int g_timeout_in_sec;
static struct spdk_conf *g_bdevperf_conf = NULL;
static const char *g_bdevperf_conf_file = NULL;
static double g_zipf_theta;

static struct spdk_cpuset g_all_cpuset;
static struct spdk_poller *g_perf_timer = NULL;

static void bdevperf_submit_single(struct bdevperf_job *job, struct bdevperf_task *task);
static void rpc_perform_tests_cb(void);

static uint32_t g_bdev_count = 0;
static uint32_t g_latency_display_level;

static const double g_latency_cutoffs[] = {
	0.01,
	0.10,
	0.25,
	0.50,
	0.75,
	0.90,
	0.95,
	0.98,
	0.99,
	0.995,
	0.999,
	0.9999,
	0.99999,
	0.999999,
	0.9999999,
	-1,
};

struct latency_info {
	uint64_t min;
	uint64_t max;
	uint64_t total;
};

struct bdevperf_job {
	char *name;
	struct spdk_bdev *bdev;
	struct spdk_bdev_desc *bdev_desc;
	struct spdk_io_channel *ch;
	TAILQ_ENTRY(bdevperf_job) link;
	struct spdk_thread *thread;

	const char *workload_type;
	int io_size;
	int rw_percentage;
	bool is_random;
	bool verify;
	bool reset;
	bool continue_on_failure;
	bool unmap;
	bool write_zeroes;
	bool flush;
	bool abort;
	int queue_depth;
	unsigned int seed;

	uint64_t io_completed;
	uint64_t io_failed;
	uint64_t io_timeout;
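	/* Snapshot of io_completed at the previous performance dump; used by
	 * get_ema_io_per_second() to compute the most recent period's rate. */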
	uint64_t prev_io_completed;
	double ema_io_per_second;
	int current_queue_depth;
	uint64_t size_in_ios;
	uint64_t ios_base;
	uint64_t offset_in_ios;
	uint64_t io_size_blocks;
	uint64_t buf_size;
	uint32_t dif_check_flags;
	bool is_draining;
	struct spdk_poller *run_timer;
	struct spdk_poller *reset_timer;
	struct spdk_bit_array *outstanding;
	struct spdk_zipf *zipf;
	TAILQ_HEAD(, bdevperf_task) task_list;
	uint64_t run_time_in_usec;

	/* keep channel's histogram data before being destroyed */
	struct spdk_histogram_data *histogram;
};

struct spdk_bdevperf {
	TAILQ_HEAD(, bdevperf_job) jobs;
	uint32_t running_jobs;
};

static struct spdk_bdevperf g_bdevperf = {
	.jobs = TAILQ_HEAD_INITIALIZER(g_bdevperf.jobs),
	.running_jobs = 0,
};

enum job_config_rw {
	JOB_CONFIG_RW_READ = 0,
	JOB_CONFIG_RW_WRITE,
	JOB_CONFIG_RW_RANDREAD,
	JOB_CONFIG_RW_RANDWRITE,
	JOB_CONFIG_RW_RW,
	JOB_CONFIG_RW_RANDRW,
	JOB_CONFIG_RW_VERIFY,
	JOB_CONFIG_RW_RESET,
	JOB_CONFIG_RW_UNMAP,
	JOB_CONFIG_RW_FLUSH,
	JOB_CONFIG_RW_WRITE_ZEROES,
};

/* Values parsed from one section of the job config file */
struct job_config {
	const char *name;
	const char *filename;
	struct spdk_cpuset cpumask;
	int bs;
	int iodepth;
	int rwmixread;
	int64_t offset;
	uint64_t length;
	enum job_config_rw rw;
	TAILQ_ENTRY(job_config) link;
};

TAILQ_HEAD(, job_config) job_config_list
	= TAILQ_HEAD_INITIALIZER(job_config_list);

static bool g_performance_dump_active = false;

struct bdevperf_aggregate_stats {
	struct bdevperf_job *current_job;
	uint64_t io_time_in_usec;
	uint64_t ema_period;
	double total_io_per_second;
	double total_mb_per_second;
	double total_failed_per_second;
	double total_timeout_per_second;
	double min_latency;
	double max_latency;
	uint64_t total_io_completed;
	uint64_t total_tsc;
};

static struct bdevperf_aggregate_stats g_stats = {.min_latency = (double)UINT64_MAX};

/*
 * Cumulative Moving Average (CMA): average of all data points up to the current one
 * Exponential Moving Average (EMA): weighted mean of the previous n data points,
 * giving more weight to recent ones
 * Simple Moving Average (SMA): unweighted mean of the previous n data points
 *
 * Bdevperf supports CMA and EMA.
 */
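
/*
 * Worked example for get_ema_io_per_second() below (illustrative numbers):
 * with ema_period = 4 the smoothing factor is 2 / (4 + 1) = 0.4. If the EMA
 * is currently 1000 IO/s and the latest period measured 2000 IO/s, the new
 * EMA is 1000 + (2000 - 1000) * 0.4 = 1400 IO/s, so recent periods dominate
 * while older history decays geometrically.
 */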
static double
get_cma_io_per_second(struct bdevperf_job *job, uint64_t io_time_in_usec)
{
	return (double)job->io_completed * SPDK_SEC_TO_USEC / io_time_in_usec;
}

static double
get_ema_io_per_second(struct bdevperf_job *job, uint64_t ema_period)
{
	double io_completed, io_per_second;

	io_completed = job->io_completed;
	io_per_second = (double)(io_completed - job->prev_io_completed) * SPDK_SEC_TO_USEC
			/ g_show_performance_period_in_usec;
	job->prev_io_completed = io_completed;

	job->ema_io_per_second += (io_per_second - job->ema_io_per_second) * 2
				  / (ema_period + 1);
	return job->ema_io_per_second;
}

static void
get_avg_latency(void *ctx, uint64_t start, uint64_t end, uint64_t count,
		uint64_t total, uint64_t so_far)
{
	struct latency_info *latency_info = ctx;

	if (count == 0) {
		return;
	}

	latency_info->total += (start + end) / 2 * count;

	if (so_far == count) {
		latency_info->min = start;
	}

	if (so_far == total) {
		latency_info->max = end;
	}
}

static void
performance_dump_job(struct bdevperf_aggregate_stats *stats, struct bdevperf_job *job)
{
	double io_per_second, mb_per_second, failed_per_second, timeout_per_second;
	double average_latency = 0.0, min_latency, max_latency;
	uint64_t time_in_usec;
	uint64_t tsc_rate;
	uint64_t total_io;
	struct latency_info latency_info = {};

	printf("\r Job: %s (Core Mask 0x%s)\n", spdk_thread_get_name(job->thread),
	       spdk_cpuset_fmt(spdk_thread_get_cpumask(job->thread)));

	if (job->io_failed > 0 && !job->reset && !job->continue_on_failure) {
		printf("\r Job: %s ended in about %.2f seconds with error\n",
		       spdk_thread_get_name(job->thread), (double)job->run_time_in_usec / SPDK_SEC_TO_USEC);
	}
	if (job->verify) {
		printf("\t Verification LBA range: start 0x%" PRIx64 " length 0x%" PRIx64 "\n",
		       job->ios_base, job->size_in_ios);
	}

	if (g_performance_dump_active == true) {
		/* Use the job's actual run time if the job has already ended */
		if (job->io_failed > 0 && !job->continue_on_failure) {
			time_in_usec = job->run_time_in_usec;
		} else {
			time_in_usec = stats->io_time_in_usec;
		}
	} else {
		time_in_usec = job->run_time_in_usec;
	}

	if (stats->ema_period == 0) {
		io_per_second = get_cma_io_per_second(job, time_in_usec);
	} else {
		io_per_second = get_ema_io_per_second(job, stats->ema_period);
	}

	tsc_rate = spdk_get_ticks_hz();
	mb_per_second = io_per_second * job->io_size / (1024 * 1024);

	spdk_histogram_data_iterate(job->histogram, get_avg_latency, &latency_info);

	total_io = job->io_completed + job->io_failed;
	if (total_io != 0) {
		average_latency = (double)latency_info.total / total_io * SPDK_SEC_TO_USEC / tsc_rate;
	}
	min_latency = (double)latency_info.min * SPDK_SEC_TO_USEC / tsc_rate;
	max_latency = (double)latency_info.max * SPDK_SEC_TO_USEC / tsc_rate;

	failed_per_second = (double)job->io_failed * SPDK_SEC_TO_USEC / time_in_usec;
	timeout_per_second = (double)job->io_timeout * SPDK_SEC_TO_USEC / time_in_usec;

	printf("\t %-20s: %10.2f %10.2f %10.2f",
	       job->name, (float)time_in_usec / SPDK_SEC_TO_USEC, io_per_second, mb_per_second);
	printf(" %10.2f %8.2f",
	       failed_per_second, timeout_per_second);
	printf(" %10.2f %10.2f %10.2f\n",
	       average_latency, min_latency, max_latency);

	stats->total_io_per_second += io_per_second;
	stats->total_mb_per_second += mb_per_second;
	stats->total_failed_per_second += failed_per_second;
	stats->total_timeout_per_second += timeout_per_second;
	stats->total_io_completed += job->io_completed + job->io_failed;
	stats->total_tsc += latency_info.total;
	if (min_latency < stats->min_latency) {
		stats->min_latency = min_latency;
	}
	if (max_latency > stats->max_latency) {
		stats->max_latency = max_latency;
	}
}

static void
generate_data(void *buf, int buf_len, int block_size, void *md_buf, int md_size,
	      int num_blocks)
{
	int offset_blocks = 0, md_offset, data_block_size, inner_offset;

	if (buf_len < num_blocks * block_size) {
		return;
	}

	if (md_buf == NULL) {
		data_block_size = block_size - md_size;
		md_buf = (char *)buf + data_block_size;
		md_offset = block_size;
	} else {
		data_block_size = block_size;
		md_offset = md_size;
	}

	while (offset_blocks < num_blocks) {
		inner_offset = 0;
		while (inner_offset < data_block_size) {
			*(uint32_t *)buf = offset_blocks + inner_offset;
			inner_offset += sizeof(uint32_t);
			buf += sizeof(uint32_t);
		}
		memset(md_buf, offset_blocks, md_size);
		md_buf += md_offset;
		offset_blocks++;
	}
}

static bool
copy_data(void *wr_buf, int wr_buf_len, void *rd_buf, int rd_buf_len, int block_size,
	  void *wr_md_buf, void *rd_md_buf, int md_size, int num_blocks)
{
	if (wr_buf_len < num_blocks * block_size || rd_buf_len < num_blocks * block_size) {
		return false;
	}

	assert((wr_md_buf != NULL) == (rd_md_buf != NULL));

	memcpy(wr_buf, rd_buf, block_size * num_blocks);

	if (wr_md_buf != NULL) {
		memcpy(wr_md_buf, rd_md_buf, md_size * num_blocks);
	}

	return true;
}

static bool
verify_data(void *wr_buf, int wr_buf_len, void *rd_buf, int rd_buf_len, int block_size,
	    void *wr_md_buf, void *rd_md_buf, int md_size, int num_blocks, bool md_check)
{
	int offset_blocks = 0, md_offset, data_block_size;

	if (wr_buf_len < num_blocks * block_size || rd_buf_len < num_blocks * block_size) {
		return false;
	}

	assert((wr_md_buf != NULL) == (rd_md_buf != NULL));

	if (wr_md_buf == NULL) {
		data_block_size = block_size - md_size;
		wr_md_buf = (char *)wr_buf + data_block_size;
		rd_md_buf = (char *)rd_buf + data_block_size;
		md_offset = block_size;
	} else {
		data_block_size = block_size;
		md_offset = md_size;
	}

	while (offset_blocks < num_blocks) {
		if (memcmp(wr_buf, rd_buf, data_block_size) != 0) {
			return false;
		}

		wr_buf += block_size;
		rd_buf += block_size;

		if (md_check) {
			if (memcmp(wr_md_buf, rd_md_buf, md_size) != 0) {
				return false;
			}

			wr_md_buf += md_offset;
			rd_md_buf += md_offset;
		}

		offset_blocks++;
	}

	return true;
}

static void
free_job_config(void)
{
	struct job_config *config, *tmp;

	spdk_conf_free(g_bdevperf_conf);
	g_bdevperf_conf = NULL;

	TAILQ_FOREACH_SAFE(config, &job_config_list, link, tmp) {
		TAILQ_REMOVE(&job_config_list, config, link);
		free(config);
	}
}

static void
bdevperf_job_free(struct bdevperf_job *job)
{
	spdk_histogram_data_free(job->histogram);
	spdk_bit_array_free(&job->outstanding);
	spdk_zipf_free(&job->zipf);
	free(job->name);
	free(job);
}

static void
job_thread_exit(void *ctx)
{
	spdk_thread_exit(spdk_get_thread());
}
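
/*
 * The two helpers below are iterator callbacks for spdk_histogram_data_iterate().
 * check_cutoff() prints, for each configured percentile cutoff, the first bucket
 * boundary (converted from TSC ticks to microseconds) at which the cumulative
 * count crosses that percentile; print_bucket() prints every non-empty bucket
 * together with its cumulative percentage.
 */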
static void
check_cutoff(void *ctx, uint64_t start, uint64_t end, uint64_t count,
	     uint64_t total, uint64_t so_far)
{
	double so_far_pct;
	double **cutoff = ctx;
	uint64_t tsc_rate;

	if (count == 0) {
		return;
	}

	tsc_rate = spdk_get_ticks_hz();
	so_far_pct = (double)so_far / total;
	while (so_far_pct >= **cutoff && **cutoff > 0) {
		printf("%9.5f%% : %9.3fus\n", **cutoff * 100, (double)end * SPDK_SEC_TO_USEC / tsc_rate);
		(*cutoff)++;
	}
}

static void
print_bucket(void *ctx, uint64_t start, uint64_t end, uint64_t count,
	     uint64_t total, uint64_t so_far)
{
	double so_far_pct;
	uint64_t tsc_rate;

	if (count == 0) {
		return;
	}

	tsc_rate = spdk_get_ticks_hz();
	so_far_pct = (double)so_far * 100 / total;
	printf("%9.3f - %9.3f: %9.4f%% (%9ju)\n",
	       (double)start * SPDK_SEC_TO_USEC / tsc_rate,
	       (double)end * SPDK_SEC_TO_USEC / tsc_rate,
	       so_far_pct, count);
}

static void
bdevperf_test_done(void *ctx)
{
	struct bdevperf_job *job, *jtmp;
	struct bdevperf_task *task, *ttmp;
	double average_latency = 0.0;
	uint64_t time_in_usec;
	int rc;

	if (g_time_in_usec) {
		g_stats.io_time_in_usec = g_time_in_usec;

		if (!g_run_rc && g_performance_dump_active) {
			spdk_thread_send_msg(spdk_get_thread(), bdevperf_test_done, NULL);
			return;
		}
	}

	if (g_show_performance_real_time) {
		spdk_poller_unregister(&g_perf_timer);
	}

	if (g_shutdown) {
		g_shutdown_tsc = spdk_get_ticks() - g_start_tsc;
		time_in_usec = g_shutdown_tsc * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();
		g_time_in_usec = (g_time_in_usec > time_in_usec) ? time_in_usec : g_time_in_usec;
		printf("Received shutdown signal, test time was about %.6f seconds\n",
		       (double)g_time_in_usec / SPDK_SEC_TO_USEC);
	}
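
	/* The header printed below matches the per-job rows emitted by
	 * performance_dump_job(): runtime in seconds, IOPS, throughput in
	 * MiB/s, failures and timeouts per second, then average/min/max
	 * latency in microseconds. */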
	printf("\n%*s\n", 107, "Latency(us)");
	printf("\r %-*s: %10s %10s %10s %10s %8s %10s %10s %10s\n",
	       28, "Device Information", "runtime(s)", "IOPS", "MiB/s", "Fail/s", "TO/s", "Average", "min", "max");

	TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, jtmp) {
		performance_dump_job(&g_stats, job);
	}

	printf("\r =================================================================================="
	       "=================================\n");
	printf("\r %-28s: %10s %10.2f %10.2f",
	       "Total", "", g_stats.total_io_per_second, g_stats.total_mb_per_second);
	printf(" %10.2f %8.2f",
	       g_stats.total_failed_per_second, g_stats.total_timeout_per_second);

	if (g_stats.total_io_completed != 0) {
		average_latency = ((double)g_stats.total_tsc / g_stats.total_io_completed) * SPDK_SEC_TO_USEC /
				  spdk_get_ticks_hz();
	}
	printf(" %10.2f %10.2f %10.2f\n", average_latency, g_stats.min_latency, g_stats.max_latency);

	fflush(stdout);

	if (g_latency_display_level == 0 || g_stats.total_io_completed == 0) {
		goto clean;
	}

	printf("\n Latency summary\n");
	TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, jtmp) {
		printf("\r =============================================\n");
		printf("\r Job: %s (Core Mask 0x%s)\n", spdk_thread_get_name(job->thread),
		       spdk_cpuset_fmt(spdk_thread_get_cpumask(job->thread)));

		const double *cutoff = g_latency_cutoffs;

		spdk_histogram_data_iterate(job->histogram, check_cutoff, &cutoff);

		printf("\n");
	}

	if (g_latency_display_level == 1) {
		goto clean;
	}

	printf("\r Latency histogram\n");
	TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, jtmp) {
		printf("\r =============================================\n");
		printf("\r Job: %s (Core Mask 0x%s)\n", spdk_thread_get_name(job->thread),
		       spdk_cpuset_fmt(spdk_thread_get_cpumask(job->thread)));

		spdk_histogram_data_iterate(job->histogram, print_bucket, NULL);
		printf("\n");
	}

clean:
	TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, jtmp) {
		TAILQ_REMOVE(&g_bdevperf.jobs, job, link);

		spdk_thread_send_msg(job->thread, job_thread_exit, NULL);

		TAILQ_FOREACH_SAFE(task, &job->task_list, link, ttmp) {
			TAILQ_REMOVE(&job->task_list, task, link);
			spdk_free(task->buf);
			spdk_free(task->md_buf);
			free(task);
		}

		bdevperf_job_free(job);
	}

	rc = g_run_rc;
	if (g_request && !g_shutdown) {
		rpc_perform_tests_cb();
		if (rc != 0) {
			spdk_app_stop(rc);
		}
	} else {
		spdk_app_stop(rc);
	}
}

static void
bdevperf_job_end(void *ctx)
{
	assert(g_main_thread == spdk_get_thread());

	if (--g_bdevperf.running_jobs == 0) {
		bdevperf_test_done(NULL);
	}
}

static void
bdevperf_channel_get_histogram_cb(void *cb_arg, int status, struct spdk_histogram_data *histogram)
{
	struct spdk_histogram_data *job_hist = cb_arg;

	spdk_histogram_data_merge(job_hist, histogram);
}

static void
bdevperf_job_empty(struct bdevperf_job *job)
{
	uint64_t end_tsc = 0;

	end_tsc = spdk_get_ticks() - g_start_tsc;
	job->run_time_in_usec = end_tsc * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();
	/* keep histogram info before channel is destroyed */
	spdk_bdev_channel_get_histogram(job->ch, bdevperf_channel_get_histogram_cb,
					job->histogram);
	spdk_put_io_channel(job->ch);
	spdk_bdev_close(job->bdev_desc);
	spdk_thread_send_msg(g_main_thread, bdevperf_job_end, NULL);
}
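
/*
 * Return a completed task to the job's free list. If the job is draining and
 * this was its last outstanding I/O, finish the job via bdevperf_job_empty().
 */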
static void
bdevperf_end_task(struct bdevperf_task *task)
{
	struct bdevperf_job *job = task->job;

	TAILQ_INSERT_TAIL(&job->task_list, task, link);
	if (job->is_draining) {
		if (job->current_queue_depth == 0) {
			bdevperf_job_empty(job);
		}
	}
}

static void
bdevperf_queue_io_wait_with_cb(struct bdevperf_task *task, spdk_bdev_io_wait_cb cb_fn)
{
	struct bdevperf_job *job = task->job;

	task->bdev_io_wait.bdev = job->bdev;
	task->bdev_io_wait.cb_fn = cb_fn;
	task->bdev_io_wait.cb_arg = task;
	spdk_bdev_queue_io_wait(job->bdev, job->ch, &task->bdev_io_wait);
}

static int
bdevperf_job_drain(void *ctx)
{
	struct bdevperf_job *job = ctx;

	spdk_poller_unregister(&job->run_timer);
	if (job->reset) {
		spdk_poller_unregister(&job->reset_timer);
	}

	job->is_draining = true;

	return -1;
}

static int
bdevperf_job_drain_timer(void *ctx)
{
	struct bdevperf_job *job = ctx;

	bdevperf_job_drain(ctx);
	if (job->current_queue_depth == 0) {
		bdevperf_job_empty(job);
	}

	return SPDK_POLLER_BUSY;
}

static void
bdevperf_abort_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct bdevperf_task *task = cb_arg;
	struct bdevperf_job *job = task->job;

	job->current_queue_depth--;

	if (success) {
		job->io_completed++;
	} else {
		job->io_failed++;
		if (!job->continue_on_failure) {
			bdevperf_job_drain(job);
			g_run_rc = -1;
		}
	}

	spdk_bdev_free_io(bdev_io);
	bdevperf_end_task(task);
}

static int
bdevperf_verify_dif(struct bdevperf_task *task, struct iovec *iovs, int iovcnt)
{
	struct bdevperf_job *job = task->job;
	struct spdk_bdev *bdev = job->bdev;
	struct spdk_dif_ctx dif_ctx;
	struct spdk_dif_error err_blk = {};
	int rc;

	rc = spdk_dif_ctx_init(&dif_ctx,
			       spdk_bdev_get_block_size(bdev),
			       spdk_bdev_get_md_size(bdev),
			       spdk_bdev_is_md_interleaved(bdev),
			       spdk_bdev_is_dif_head_of_md(bdev),
			       spdk_bdev_get_dif_type(bdev),
			       job->dif_check_flags,
			       task->offset_blocks, 0, 0, 0, 0);
	if (rc != 0) {
		fprintf(stderr, "Initialization of DIF context failed\n");
		return rc;
	}

	if (spdk_bdev_is_md_interleaved(bdev)) {
		rc = spdk_dif_verify(iovs, iovcnt, job->io_size_blocks, &dif_ctx, &err_blk);
	} else {
		struct iovec md_iov = {
			.iov_base = task->md_buf,
			.iov_len = spdk_bdev_get_md_size(bdev) * job->io_size_blocks,
		};

		rc = spdk_dix_verify(iovs, iovcnt, &md_iov, job->io_size_blocks, &dif_ctx, &err_blk);
	}

	if (rc != 0) {
		fprintf(stderr, "DIF/DIX error detected. type=%d, offset=%" PRIu32 "\n",
			err_blk.err_type, err_blk.err_offset);
	}

	return rc;
}
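
/*
 * Common I/O completion callback: on verify/reset jobs it compares the data
 * read back against task->buf, optionally validates DIF/DIX metadata for read
 * I/O, updates the per-job counters, and then either resubmits the task or
 * retires it if the job is draining.
 */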
static void
bdevperf_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct bdevperf_job *job;
	struct bdevperf_task *task = cb_arg;
	struct iovec *iovs;
	int iovcnt;
	bool md_check;
	uint64_t offset_in_ios;
	int rc;

	job = task->job;
	md_check = spdk_bdev_get_dif_type(job->bdev) == SPDK_DIF_DISABLE;

	if (g_error_to_exit == true) {
		bdevperf_job_drain(job);
	} else if (!success) {
		if (!job->reset && !job->continue_on_failure) {
			bdevperf_job_drain(job);
			g_run_rc = -1;
			g_error_to_exit = true;
			printf("task offset: %" PRIu64 " on job bdev=%s fails\n",
			       task->offset_blocks, job->name);
		}
	} else if (job->verify || job->reset) {
		spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt);
		assert(iovcnt == 1);
		assert(iovs != NULL);
		if (!verify_data(task->buf, job->buf_size, iovs[0].iov_base, iovs[0].iov_len,
				 spdk_bdev_get_block_size(job->bdev),
				 task->md_buf, spdk_bdev_io_get_md_buf(bdev_io),
				 spdk_bdev_get_md_size(job->bdev),
				 job->io_size_blocks, md_check)) {
			printf("Buffer mismatch! Target: %s Disk Offset: %" PRIu64 "\n", job->name, task->offset_blocks);
			printf(" First dword expected 0x%x got 0x%x\n", *(int *)task->buf, *(int *)iovs[0].iov_base);
			bdevperf_job_drain(job);
			g_run_rc = -1;
		}
	} else if (job->dif_check_flags != 0) {
		if (task->io_type == SPDK_BDEV_IO_TYPE_READ && spdk_bdev_get_md_size(job->bdev) != 0) {
			spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt);
			assert(iovcnt == 1);
			assert(iovs != NULL);
			rc = bdevperf_verify_dif(task, iovs, iovcnt);
			if (rc != 0) {
				printf("DIF error detected. task offset: %" PRIu64 " on job bdev=%s\n",
				       task->offset_blocks, job->name);

				success = false;
				if (!job->reset && !job->continue_on_failure) {
					bdevperf_job_drain(job);
					g_run_rc = -1;
					g_error_to_exit = true;
				}
			}
		}
	}

	job->current_queue_depth--;

	if (success) {
		job->io_completed++;
	} else {
		job->io_failed++;
	}

	if (job->verify) {
		assert(task->offset_blocks / job->io_size_blocks >= job->ios_base);
		offset_in_ios = task->offset_blocks / job->io_size_blocks - job->ios_base;

		assert(spdk_bit_array_get(job->outstanding, offset_in_ios) == true);
		spdk_bit_array_clear(job->outstanding, offset_in_ios);
	}

	spdk_bdev_free_io(bdev_io);

	/*
	 * is_draining indicates when time has expired for the test run
	 * and we are just waiting for the previously submitted I/O
	 * to complete. In this case, do not submit a new I/O to replace
	 * the one just completed.
	 */
	if (!job->is_draining) {
		bdevperf_submit_single(job, task);
	} else {
		bdevperf_end_task(task);
	}
}

static void
bdevperf_verify_submit_read(void *cb_arg)
{
	struct bdevperf_job *job;
	struct bdevperf_task *task = cb_arg;
	int rc;

	job = task->job;

	/* Read the data back in */
	rc = spdk_bdev_read_blocks_with_md(job->bdev_desc, job->ch, NULL, NULL,
					   task->offset_blocks, job->io_size_blocks,
					   bdevperf_complete, task);

	if (rc == -ENOMEM) {
		bdevperf_queue_io_wait_with_cb(task, bdevperf_verify_submit_read);
	} else if (rc != 0) {
		printf("Failed to submit read: %d\n", rc);
		bdevperf_job_drain(job);
		g_run_rc = rc;
	}
}

static void
bdevperf_verify_write_complete(struct spdk_bdev_io *bdev_io, bool success,
			       void *cb_arg)
{
	if (success) {
		spdk_bdev_free_io(bdev_io);
		bdevperf_verify_submit_read(cb_arg);
	} else {
		bdevperf_complete(bdev_io, success, cb_arg);
	}
}

static void
bdevperf_zcopy_populate_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	if (!success) {
		bdevperf_complete(bdev_io, success, cb_arg);
		return;
	}

	spdk_bdev_zcopy_end(bdev_io, false, bdevperf_complete, cb_arg);
}

static int
bdevperf_generate_dif(struct bdevperf_task *task)
{
	struct bdevperf_job *job = task->job;
	struct spdk_bdev *bdev = job->bdev;
	struct spdk_dif_ctx dif_ctx;
	int rc;

	rc = spdk_dif_ctx_init(&dif_ctx,
			       spdk_bdev_get_block_size(bdev),
			       spdk_bdev_get_md_size(bdev),
			       spdk_bdev_is_md_interleaved(bdev),
			       spdk_bdev_is_dif_head_of_md(bdev),
			       spdk_bdev_get_dif_type(bdev),
			       job->dif_check_flags,
			       task->offset_blocks, 0, 0, 0, 0);
	if (rc != 0) {
		fprintf(stderr, "Initialization of DIF context failed\n");
		return rc;
	}

	if (spdk_bdev_is_md_interleaved(bdev)) {
		rc = spdk_dif_generate(&task->iov, 1, job->io_size_blocks, &dif_ctx);
	} else {
		struct iovec md_iov = {
			.iov_base = task->md_buf,
			.iov_len = spdk_bdev_get_md_size(bdev) * job->io_size_blocks,
		};

		rc = spdk_dix_generate(&task->iov, 1, &md_iov, job->io_size_blocks, &dif_ctx);
	}

	if (rc != 0) {
		fprintf(stderr, "Generation of DIF/DIX failed\n");
	}

	return rc;
}
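
/*
 * Dispatch a prepared task to the bdev layer based on task->io_type. A return
 * of -ENOMEM is not fatal: the task is parked on the bdev's io_wait queue and
 * this function is retried once resources free up. Any other error drains the
 * job and records the failure in g_run_rc.
 */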
static void
bdevperf_submit_task(void *arg)
{
	struct bdevperf_task *task = arg;
	struct bdevperf_job *job = task->job;
	struct spdk_bdev_desc *desc;
	struct spdk_io_channel *ch;
	spdk_bdev_io_completion_cb cb_fn;
	uint64_t offset_in_ios;
	int rc = 0;

	desc = job->bdev_desc;
	ch = job->ch;

	switch (task->io_type) {
	case SPDK_BDEV_IO_TYPE_WRITE:
		if (spdk_bdev_get_md_size(job->bdev) != 0 && job->dif_check_flags != 0) {
			rc = bdevperf_generate_dif(task);
		}
		if (rc == 0) {
			cb_fn = (job->verify || job->reset) ? bdevperf_verify_write_complete : bdevperf_complete;

			if (g_zcopy) {
				spdk_bdev_zcopy_end(task->bdev_io, true, cb_fn, task);
				return;
			} else {
				rc = spdk_bdev_writev_blocks_with_md(desc, ch, &task->iov, 1,
								     task->md_buf,
								     task->offset_blocks,
								     job->io_size_blocks,
								     cb_fn, task);
			}
		}
		break;
	case SPDK_BDEV_IO_TYPE_FLUSH:
		rc = spdk_bdev_flush_blocks(desc, ch, task->offset_blocks,
					    job->io_size_blocks, bdevperf_complete, task);
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		rc = spdk_bdev_unmap_blocks(desc, ch, task->offset_blocks,
					    job->io_size_blocks, bdevperf_complete, task);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		rc = spdk_bdev_write_zeroes_blocks(desc, ch, task->offset_blocks,
						   job->io_size_blocks, bdevperf_complete, task);
		break;
	case SPDK_BDEV_IO_TYPE_READ:
		if (g_zcopy) {
			rc = spdk_bdev_zcopy_start(desc, ch, NULL, 0, task->offset_blocks, job->io_size_blocks,
						   true, bdevperf_zcopy_populate_complete, task);
		} else {
			rc = spdk_bdev_read_blocks_with_md(desc, ch, task->buf, task->md_buf,
							   task->offset_blocks,
							   job->io_size_blocks,
							   bdevperf_complete, task);
		}
		break;
	case SPDK_BDEV_IO_TYPE_ABORT:
		rc = spdk_bdev_abort(desc, ch, task->task_to_abort, bdevperf_abort_complete, task);
		break;
	default:
		assert(false);
		rc = -EINVAL;
		break;
	}

	if (rc == -ENOMEM) {
		bdevperf_queue_io_wait_with_cb(task, bdevperf_submit_task);
		return;
	} else if (rc != 0) {
		printf("Failed to submit bdev_io: %d\n", rc);
		if (job->verify) {
			assert(task->offset_blocks / job->io_size_blocks >= job->ios_base);
			offset_in_ios = task->offset_blocks / job->io_size_blocks - job->ios_base;

			assert(spdk_bit_array_get(job->outstanding, offset_in_ios) == true);
			spdk_bit_array_clear(job->outstanding, offset_in_ios);
		}
		bdevperf_job_drain(job);
		g_run_rc = rc;
		return;
	}

	job->current_queue_depth++;
}
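
/*
 * Zcopy writes are two-phase: bdevperf_prep_zcopy_write_task() (below) calls
 * spdk_bdev_zcopy_start() to borrow a buffer from the bdev, this callback
 * fills that buffer, and bdevperf_submit_task() later commits the write with
 * spdk_bdev_zcopy_end().
 */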
static void
bdevperf_zcopy_get_buf_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct bdevperf_task *task = cb_arg;
	struct bdevperf_job *job = task->job;
	struct iovec *iovs;
	int iovcnt;

	if (!success) {
		bdevperf_job_drain(job);
		g_run_rc = -1;
		return;
	}

	task->bdev_io = bdev_io;
	task->io_type = SPDK_BDEV_IO_TYPE_WRITE;

	if (job->verify || job->reset) {
		/* When job->verify or job->reset is enabled, task->buf is used for
		 * verification of read after write. For write I/O, when zcopy APIs
		 * are used, task->buf cannot be used, and data must be written to
		 * the data buffer allocated underneath bdev layer instead.
		 * Hence we copy task->buf to the allocated data buffer here.
		 */
		spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt);
		assert(iovcnt == 1);
		assert(iovs != NULL);

		copy_data(iovs[0].iov_base, iovs[0].iov_len, task->buf, job->buf_size,
			  spdk_bdev_get_block_size(job->bdev),
			  spdk_bdev_io_get_md_buf(bdev_io), task->md_buf,
			  spdk_bdev_get_md_size(job->bdev), job->io_size_blocks);
	}

	bdevperf_submit_task(task);
}

static void
bdevperf_prep_zcopy_write_task(void *arg)
{
	struct bdevperf_task *task = arg;
	struct bdevperf_job *job = task->job;
	int rc;

	rc = spdk_bdev_zcopy_start(job->bdev_desc, job->ch, NULL, 0,
				   task->offset_blocks, job->io_size_blocks,
				   false, bdevperf_zcopy_get_buf_complete, task);
	if (rc != 0) {
		assert(rc == -ENOMEM);
		bdevperf_queue_io_wait_with_cb(task, bdevperf_prep_zcopy_write_task);
		return;
	}

	job->current_queue_depth++;
}

static struct bdevperf_task *
bdevperf_job_get_task(struct bdevperf_job *job)
{
	struct bdevperf_task *task;

	task = TAILQ_FIRST(&job->task_list);
	if (!task) {
		printf("Task allocation failed\n");
		abort();
	}

	TAILQ_REMOVE(&job->task_list, task, link);
	return task;
}
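
/*
 * Choose the next offset for a job: Zipf-distributed when a Zipf theta was
 * configured, uniformly random for rand* workloads, otherwise sequential with
 * wrap-around. Verify jobs also mark the chosen offset in the "outstanding"
 * bit array so the same region is never in flight twice.
 */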
static void
bdevperf_submit_single(struct bdevperf_job *job, struct bdevperf_task *task)
{
	uint64_t offset_in_ios;

	if (job->zipf) {
		offset_in_ios = spdk_zipf_generate(job->zipf);
	} else if (job->is_random) {
		offset_in_ios = rand_r(&job->seed) % job->size_in_ios;
	} else {
		offset_in_ios = job->offset_in_ios++;
		if (job->offset_in_ios == job->size_in_ios) {
			job->offset_in_ios = 0;
		}

		/* Increment offset_in_ios if there's already an outstanding I/O
		 * to that location. We only need this with job->verify as random
		 * offsets are not supported with job->verify at this time.
		 */
		if (job->verify) {
			assert(spdk_bit_array_find_first_clear(job->outstanding, 0) != UINT32_MAX);

			while (spdk_bit_array_get(job->outstanding, offset_in_ios)) {
				offset_in_ios = job->offset_in_ios++;
				if (job->offset_in_ios == job->size_in_ios) {
					job->offset_in_ios = 0;
				}
			}
			spdk_bit_array_set(job->outstanding, offset_in_ios);
		}
	}

	/* When multiple threads drive the same bdev, offset_in_ios is relative
	 * to the LBA range assigned to this job, while task->offset_blocks is
	 * absolute (it spans the entire bdev LBA range).
	 */
	task->offset_blocks = (offset_in_ios + job->ios_base) * job->io_size_blocks;

	if (job->verify || job->reset) {
		generate_data(task->buf, job->buf_size,
			      spdk_bdev_get_block_size(job->bdev),
			      task->md_buf, spdk_bdev_get_md_size(job->bdev),
			      job->io_size_blocks);
		if (g_zcopy) {
			bdevperf_prep_zcopy_write_task(task);
			return;
		} else {
			task->iov.iov_base = task->buf;
			task->iov.iov_len = job->buf_size;
			task->io_type = SPDK_BDEV_IO_TYPE_WRITE;
		}
	} else if (job->flush) {
		task->io_type = SPDK_BDEV_IO_TYPE_FLUSH;
	} else if (job->unmap) {
		task->io_type = SPDK_BDEV_IO_TYPE_UNMAP;
	} else if (job->write_zeroes) {
		task->io_type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
	} else if ((job->rw_percentage == 100) ||
		   (job->rw_percentage != 0 && ((rand_r(&job->seed) % 100) < job->rw_percentage))) {
		task->io_type = SPDK_BDEV_IO_TYPE_READ;
	} else {
		if (g_zcopy) {
			bdevperf_prep_zcopy_write_task(task);
			return;
		} else {
			task->iov.iov_base = task->buf;
			task->iov.iov_len = job->buf_size;
			task->io_type = SPDK_BDEV_IO_TYPE_WRITE;
		}
	}

	bdevperf_submit_task(task);
}

static int reset_job(void *arg);

static void
reset_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct bdevperf_task *task = cb_arg;
	struct bdevperf_job *job = task->job;

	if (!success) {
		printf("Reset blockdev=%s failed\n", spdk_bdev_get_name(job->bdev));
		bdevperf_job_drain(job);
		g_run_rc = -1;
	}

	TAILQ_INSERT_TAIL(&job->task_list, task, link);
	spdk_bdev_free_io(bdev_io);

	job->reset_timer = SPDK_POLLER_REGISTER(reset_job, job,
						10 * SPDK_SEC_TO_USEC);
}

static int
reset_job(void *arg)
{
	struct bdevperf_job *job = arg;
	struct bdevperf_task *task;
	int rc;

	spdk_poller_unregister(&job->reset_timer);

	/* Do reset. */
	task = bdevperf_job_get_task(job);
	rc = spdk_bdev_reset(job->bdev_desc, job->ch,
			     reset_cb, task);
	if (rc) {
		printf("Reset failed: %d\n", rc);
		bdevperf_job_drain(job);
		g_run_rc = -1;
	}

	return -1;
}

static void
bdevperf_timeout_cb(void *cb_arg, struct spdk_bdev_io *bdev_io)
{
	struct bdevperf_job *job = cb_arg;
	struct bdevperf_task *task;

	job->io_timeout++;

	if (job->is_draining || !job->abort ||
	    !spdk_bdev_io_type_supported(job->bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
		return;
	}

	task = bdevperf_job_get_task(job);
	if (task == NULL) {
		return;
	}

	task->task_to_abort = spdk_bdev_io_get_cb_arg(bdev_io);
	task->io_type = SPDK_BDEV_IO_TYPE_ABORT;

	bdevperf_submit_task(task);
}
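
/*
 * Per-job start routine, run on the job's own spdk_thread: arm the run timer
 * (and the 10-second reset timer for reset jobs), register the I/O timeout
 * callback, then prime the pipeline with queue_depth initial submissions.
 */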
static void
bdevperf_job_run(void *ctx)
{
	struct bdevperf_job *job = ctx;
	struct bdevperf_task *task;
	int i;

	/* Submit initial I/O for this job. Each time one
	 * completes, another will be submitted.
	 */

	/* Start a timer to stop this I/O chain when the run is over */
	job->run_timer = SPDK_POLLER_REGISTER(bdevperf_job_drain_timer, job, g_time_in_usec);
	if (job->reset) {
		job->reset_timer = SPDK_POLLER_REGISTER(reset_job, job,
							10 * SPDK_SEC_TO_USEC);
	}

	spdk_bdev_set_timeout(job->bdev_desc, g_timeout_in_sec, bdevperf_timeout_cb, job);

	for (i = 0; i < job->queue_depth; i++) {
		task = bdevperf_job_get_task(job);
		bdevperf_submit_single(job, task);
	}
}

static void
_performance_dump_done(void *ctx)
{
	struct bdevperf_aggregate_stats *stats = ctx;
	double average_latency;

	printf("\r =================================================================================="
	       "=================================\n");
	printf("\r %-28s: %10s %10.2f %10.2f",
	       "Total", "", stats->total_io_per_second, stats->total_mb_per_second);
	printf(" %10.2f %8.2f",
	       stats->total_failed_per_second, stats->total_timeout_per_second);

	average_latency = ((double)stats->total_tsc / stats->total_io_completed) * SPDK_SEC_TO_USEC /
			  spdk_get_ticks_hz();
	printf(" %10.2f %10.2f %10.2f\n", average_latency, stats->min_latency, stats->max_latency);
	printf("\n");

	fflush(stdout);

	g_performance_dump_active = false;

	free(stats);
}

static void
_performance_dump(void *ctx)
{
	struct bdevperf_aggregate_stats *stats = ctx;

	performance_dump_job(stats, stats->current_job);

	/* This assumes the jobs list is static after start up time.
	 * That's true right now, but if that ever changed this would need a lock. */
	stats->current_job = TAILQ_NEXT(stats->current_job, link);
	if (stats->current_job == NULL) {
		spdk_thread_send_msg(g_main_thread, _performance_dump_done, stats);
	} else {
		spdk_thread_send_msg(stats->current_job->thread, _performance_dump, stats);
	}
}
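
/*
 * Periodic poller (g_perf_timer) on the main thread. Stats are gathered by
 * hopping across job threads with spdk_thread_send_msg() (_performance_dump),
 * finishing back on the main thread (_performance_dump_done), so each job's
 * counters are only read on the thread that updates them.
 */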
static int
performance_statistics_thread(void *arg)
{
	struct bdevperf_aggregate_stats *stats;

	if (g_performance_dump_active) {
		return -1;
	}

	g_performance_dump_active = true;

	stats = calloc(1, sizeof(*stats));
	if (stats == NULL) {
		return -1;
	}

	stats->min_latency = (double)UINT64_MAX;

	g_show_performance_period_num++;

	stats->io_time_in_usec = g_show_performance_period_num * g_show_performance_period_in_usec;
	stats->ema_period = g_show_performance_ema_period;

	/* Iterate all of the jobs to gather stats
	 * These jobs will not get removed here until a final performance dump is run,
	 * so this should be safe without locking.
	 */
	stats->current_job = TAILQ_FIRST(&g_bdevperf.jobs);
	if (stats->current_job == NULL) {
		spdk_thread_send_msg(g_main_thread, _performance_dump_done, stats);
	} else {
		spdk_thread_send_msg(stats->current_job->thread, _performance_dump, stats);
	}

	return -1;
}

static void
bdevperf_test(void)
{
	struct bdevperf_job *job;

	printf("Running I/O for %" PRIu64 " seconds...\n", g_time_in_usec / (uint64_t)SPDK_SEC_TO_USEC);
	fflush(stdout);

	/* Start a timer to dump performance numbers */
	g_start_tsc = spdk_get_ticks();
	if (g_show_performance_real_time && !g_perf_timer) {
		printf("%*s\n", 107, "Latency(us)");
		printf("\r %-*s: %10s %10s %10s %10s %8s %10s %10s %10s\n",
		       28, "Device Information", "runtime(s)", "IOPS", "MiB/s", "Fail/s", "TO/s", "Average", "min", "max");

		g_perf_timer = SPDK_POLLER_REGISTER(performance_statistics_thread, NULL,
						    g_show_performance_period_in_usec);
	}

	/* Iterate jobs to start all I/O */
	TAILQ_FOREACH(job, &g_bdevperf.jobs, link) {
		g_bdevperf.running_jobs++;
		spdk_thread_send_msg(job->thread, bdevperf_job_run, job);
	}
}

static void
bdevperf_bdev_removed(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
{
	struct bdevperf_job *job = event_ctx;

	if (SPDK_BDEV_EVENT_REMOVE == type) {
		bdevperf_job_drain(job);
	}
}

static void
bdevperf_histogram_status_cb(void *cb_arg, int status)
{
	if (status != 0) {
		g_run_rc = status;
		if (g_continue_on_failure == false) {
			g_error_to_exit = true;
		}
	}

	if (--g_bdev_count == 0) {
		if (g_run_rc == 0) {
			/* Ready to run the test */
			bdevperf_test();
		} else {
			bdevperf_test_done(NULL);
		}
	}
}

static uint32_t g_construct_job_count = 0;

static void
_bdevperf_enable_histogram(bool enable)
{
	struct spdk_bdev *bdev;
	/* increment initial g_bdev_count so that it will never reach 0 in the middle of iteration */
	g_bdev_count = 1;

	if (g_job_bdev_name != NULL) {
		bdev = spdk_bdev_get_by_name(g_job_bdev_name);
		if (bdev) {
			g_bdev_count++;

			spdk_bdev_histogram_enable(bdev, bdevperf_histogram_status_cb, NULL, enable);
		} else {
			fprintf(stderr, "Unable to find bdev '%s'\n", g_job_bdev_name);
		}
	} else {
		bdev = spdk_bdev_first_leaf();

		while (bdev != NULL) {
			g_bdev_count++;

			spdk_bdev_histogram_enable(bdev, bdevperf_histogram_status_cb, NULL, enable);
			bdev = spdk_bdev_next_leaf(bdev);
		}
	}

	bdevperf_histogram_status_cb(NULL, 0);
}
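
/*
 * Invoked on the main thread once per constructed job; the extra initial
 * reference on g_construct_job_count is dropped by
 * bdevperf_construct_job_configs() after all jobs have been submitted.
 */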
static void
_bdevperf_construct_job_done(void *ctx)
{
	if (--g_construct_job_count == 0) {
		if (g_run_rc != 0) {
			/* Something failed. */
			bdevperf_test_done(NULL);
			return;
		}

		/* Always enable histograms. */
		_bdevperf_enable_histogram(true);
	} else if (g_run_rc != 0) {
		/* Reset the error since some jobs were constructed successfully */
		g_run_rc = 0;
		if (g_continue_on_failure == false) {
			g_error_to_exit = true;
		}
	}
}

/* The format checker does not allow the use of an inlined type;
 * this typedef is a workaround. */
typedef struct spdk_thread *spdk_thread_t;

static spdk_thread_t
construct_job_thread(struct spdk_cpuset *cpumask, const char *tag)
{
	struct spdk_cpuset tmp;

	/* This function runs on the main thread. */
	assert(g_main_thread == spdk_get_thread());

	/* Handle default mask */
	if (spdk_cpuset_count(cpumask) == 0) {
		cpumask = &g_all_cpuset;
	}

	/* Warn user that mask might need to be changed */
	spdk_cpuset_copy(&tmp, cpumask);
	spdk_cpuset_or(&tmp, &g_all_cpuset);
	if (!spdk_cpuset_equal(&tmp, &g_all_cpuset)) {
		fprintf(stderr, "cpumask for '%s' is too big\n", tag);
	}

	return spdk_thread_create(tag, cpumask);
}

static uint32_t
_get_next_core(void)
{
	static uint32_t current_core = SPDK_ENV_LCORE_ID_ANY;

	if (current_core == SPDK_ENV_LCORE_ID_ANY) {
		current_core = spdk_env_get_first_core();
		return current_core;
	}

	current_core = spdk_env_get_next_core(current_core);
	if (current_core == SPDK_ENV_LCORE_ID_ANY) {
		current_core = spdk_env_get_first_core();
	}

	return current_core;
}

static void
_bdevperf_construct_job(void *ctx)
{
	struct bdevperf_job *job = ctx;
	int rc;

	rc = spdk_bdev_open_ext(spdk_bdev_get_name(job->bdev), true, bdevperf_bdev_removed, job,
				&job->bdev_desc);
	if (rc != 0) {
		SPDK_ERRLOG("Could not open leaf bdev %s, error=%d\n", spdk_bdev_get_name(job->bdev), rc);
		g_run_rc = -EINVAL;
		goto end;
	}

	if (g_zcopy) {
		if (!spdk_bdev_io_type_supported(job->bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) {
			printf("Test requires ZCOPY but bdev module does not support ZCOPY\n");
			g_run_rc = -ENOTSUP;
			goto end;
		}
	}

	job->ch = spdk_bdev_get_io_channel(job->bdev_desc);
	if (!job->ch) {
		SPDK_ERRLOG("Could not get io_channel for device %s, error=%d\n", spdk_bdev_get_name(job->bdev),
			    rc);
		spdk_bdev_close(job->bdev_desc);
		TAILQ_REMOVE(&g_bdevperf.jobs, job, link);
		g_run_rc = -ENOMEM;
		goto end;
	}

end:
	spdk_thread_send_msg(g_main_thread, _bdevperf_construct_job_done, NULL);
}

static void
job_init_rw(struct bdevperf_job *job, enum job_config_rw rw)
{
	switch (rw) {
	case JOB_CONFIG_RW_READ:
		job->rw_percentage = 100;
		break;
	case JOB_CONFIG_RW_WRITE:
		job->rw_percentage = 0;
		break;
	case JOB_CONFIG_RW_RANDREAD:
		job->is_random = true;
		job->rw_percentage = 100;
		job->seed = rand();
		break;
	case JOB_CONFIG_RW_RANDWRITE:
		job->is_random = true;
		job->rw_percentage = 0;
		job->seed = rand();
		break;
	case JOB_CONFIG_RW_RW:
		job->is_random = false;
		break;
	case JOB_CONFIG_RW_RANDRW:
		job->is_random = true;
		job->seed = rand();
		break;
	case JOB_CONFIG_RW_VERIFY:
		job->verify = true;
		job->rw_percentage = 50;
		break;
	case JOB_CONFIG_RW_RESET:
		job->reset = true;
		job->verify = true;
		job->rw_percentage = 50;
		break;
	case JOB_CONFIG_RW_UNMAP:
		job->unmap = true;
		break;
	case JOB_CONFIG_RW_FLUSH:
		job->flush = true;
		break;
	case JOB_CONFIG_RW_WRITE_ZEROES:
		job->write_zeroes = true;
		break;
	}
}

static int
bdevperf_construct_job(struct spdk_bdev *bdev, struct job_config *config,
		       struct spdk_thread *thread)
{
	struct bdevperf_job *job;
	struct bdevperf_task *task;
	int block_size, data_block_size;
	int rc;
	int task_num, n;

	block_size = spdk_bdev_get_block_size(bdev);
	data_block_size = spdk_bdev_get_data_block_size(bdev);

	job = calloc(1, sizeof(struct bdevperf_job));
	if (!job) {
		fprintf(stderr, "Unable to allocate memory for new job.\n");
		return -ENOMEM;
	}

	job->name = strdup(spdk_bdev_get_name(bdev));
	if (!job->name) {
		fprintf(stderr, "Unable to allocate memory for job name.\n");
		bdevperf_job_free(job);
		return -ENOMEM;
	}

	job->workload_type = g_workload_type;
	job->io_size = config->bs;
	job->rw_percentage = config->rwmixread;
	job->continue_on_failure = g_continue_on_failure;
	job->queue_depth = config->iodepth;
	job->bdev = bdev;
	job->io_size_blocks = job->io_size / data_block_size;
	job->buf_size = job->io_size_blocks * block_size;
	job->abort = g_abort;
	job_init_rw(job, config->rw);

	if ((job->io_size % data_block_size) != 0) {
		SPDK_ERRLOG("IO size (%d) is not a multiple of the data block size of bdev %s (%"PRIu32")\n",
			    job->io_size, spdk_bdev_get_name(bdev), data_block_size);
		bdevperf_job_free(job);
		return -ENOTSUP;
	}

	if (job->unmap && !spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
		printf("Skipping %s because it does not support unmap\n", spdk_bdev_get_name(bdev));
		bdevperf_job_free(job);
		return -ENOTSUP;
	}

	if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_REFTAG)) {
		job->dif_check_flags |= SPDK_DIF_FLAGS_REFTAG_CHECK;
	}
	if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_GUARD)) {
		job->dif_check_flags |= SPDK_DIF_FLAGS_GUARD_CHECK;
	}

	job->offset_in_ios = 0;

	if (config->length != 0) {
		/* Use subset of disk */
		job->size_in_ios = config->length / job->io_size_blocks;
		job->ios_base = config->offset / job->io_size_blocks;
	} else {
		/* Use whole disk */
		job->size_in_ios = spdk_bdev_get_num_blocks(bdev) / job->io_size_blocks;
		job->ios_base = 0;
	}

	if (job->is_random && g_zipf_theta > 0) {
		job->zipf = spdk_zipf_create(job->size_in_ios, g_zipf_theta, 0);
	}

	if (job->verify) {
		job->outstanding = spdk_bit_array_create(job->size_in_ios);
		if (job->outstanding == NULL) {
			SPDK_ERRLOG("Could not create outstanding array bitmap for bdev %s\n",
				    spdk_bdev_get_name(bdev));
			bdevperf_job_free(job);
			return -ENOMEM;
		}
		if (job->queue_depth > (int)job->size_in_ios) {
			SPDK_WARNLOG("Due to constraints of verify job, queue depth (-q, %d) can't exceed the number of IO "
				     "requests which can be submitted to the bdev %s simultaneously (%"PRIu64"). "
				     "Queue depth is limited to %"PRIu64"\n",
				     job->queue_depth, job->name, job->size_in_ios, job->size_in_ios);
			job->queue_depth = (int)job->size_in_ios;
		}
	}

	job->histogram = spdk_histogram_data_alloc();
	if (job->histogram == NULL) {
		fprintf(stderr, "Failed to allocate histogram\n");
		bdevperf_job_free(job);
		return -ENOMEM;
	}

	TAILQ_INIT(&job->task_list);

	task_num = job->queue_depth;
	if (job->reset) {
		task_num += 1;
	}
	if (job->abort) {
		task_num += job->queue_depth;
	}

	TAILQ_INSERT_TAIL(&g_bdevperf.jobs, job, link);

	for (n = 0; n < task_num; n++) {
		task = calloc(1, sizeof(struct bdevperf_task));
		if (!task) {
			fprintf(stderr, "Failed to allocate memory for task\n");
			return -ENOMEM;
		}

		task->buf = spdk_zmalloc(job->buf_size, spdk_bdev_get_buf_align(job->bdev), NULL,
					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
		if (!task->buf) {
			fprintf(stderr, "Cannot allocate buf for task=%p\n", task);
			free(task);
			return -ENOMEM;
		}

		if (spdk_bdev_is_md_separate(job->bdev)) {
			task->md_buf = spdk_zmalloc(job->io_size_blocks *
						    spdk_bdev_get_md_size(job->bdev), 0, NULL,
						    SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
			if (!task->md_buf) {
				fprintf(stderr, "Cannot allocate md buf for task=%p\n", task);
				spdk_free(task->buf);
				free(task);
				return -ENOMEM;
			}
		}

		task->job = job;
		TAILQ_INSERT_TAIL(&job->task_list, task, link);
	}

	job->thread = thread;

	g_construct_job_count++;

	rc = spdk_thread_send_msg(thread, _bdevperf_construct_job, job);
	assert(rc == 0);

	return rc;
}

static int
parse_rw(const char *str, enum job_config_rw ret)
{
	if (str == NULL) {
		return ret;
	}

	if (!strcmp(str, "read")) {
		ret = JOB_CONFIG_RW_READ;
	} else if (!strcmp(str, "randread")) {
		ret = JOB_CONFIG_RW_RANDREAD;
	} else if (!strcmp(str, "write")) {
		ret = JOB_CONFIG_RW_WRITE;
	} else if (!strcmp(str, "randwrite")) {
		ret = JOB_CONFIG_RW_RANDWRITE;
	} else if (!strcmp(str, "verify")) {
		ret = JOB_CONFIG_RW_VERIFY;
	} else if (!strcmp(str, "reset")) {
		ret = JOB_CONFIG_RW_RESET;
	} else if (!strcmp(str, "unmap")) {
		ret = JOB_CONFIG_RW_UNMAP;
	} else if (!strcmp(str, "write_zeroes")) {
		ret = JOB_CONFIG_RW_WRITE_ZEROES;
	} else if (!strcmp(str, "flush")) {
		ret = JOB_CONFIG_RW_FLUSH;
	} else if (!strcmp(str, "rw")) {
		ret = JOB_CONFIG_RW_RW;
	} else if (!strcmp(str, "randrw")) {
		ret = JOB_CONFIG_RW_RANDRW;
	} else {
		fprintf(stderr, "rw must be one of\n"
			"(read, write, randread, randwrite, rw, randrw, verify, reset, unmap, flush, write_zeroes)\n");
		ret = BDEVPERF_CONFIG_ERROR;
	}

	return ret;
}

static const char *
config_filename_next(const char *filename, char *out)
{
	int i, k;

	if (filename == NULL) {
		out[0] = '\0';
		return NULL;
	}

	if (filename[0] == ':') {
		filename++;
	}

	for (i = 0, k = 0;
	     filename[i] != '\0' &&
	     filename[i] != ':' &&
	     i < BDEVPERF_CONFIG_MAX_FILENAME;
	     i++) {
		if (filename[i] == ' ' || filename[i] == '\t') {
			continue;
		}

		out[k++] = filename[i];
	}
	out[k] = 0;

	return filename + i;
}

static void
bdevperf_construct_jobs(void)
{
	char filename[BDEVPERF_CONFIG_MAX_FILENAME];
	struct spdk_thread *thread;
	struct job_config *config;
	struct spdk_bdev *bdev;
	const char *filenames;
	int rc;

	TAILQ_FOREACH(config, &job_config_list, link) {
		filenames = config->filename;

		thread = construct_job_thread(&config->cpumask, config->name);
		assert(thread);

		while (filenames) {
			filenames = config_filename_next(filenames, filename);
			if (strlen(filename) == 0) {
				break;
			}

			bdev = spdk_bdev_get_by_name(filename);
			if (!bdev) {
				fprintf(stderr, "Unable to find bdev '%s'\n", filename);
				g_run_rc = -EINVAL;
				return;
			}

			rc = bdevperf_construct_job(bdev, config, thread);
			if (rc < 0) {
				g_run_rc = rc;
				return;
			}
		}
	}
}

static int
make_cli_job_config(const char *filename, int64_t offset, uint64_t range)
{
	struct job_config *config = calloc(1, sizeof(*config));

	if (config == NULL) {
		fprintf(stderr, "Unable to allocate memory for job config\n");
		return -ENOMEM;
	}

	config->name = filename;
	config->filename = filename;
	spdk_cpuset_zero(&config->cpumask);
	spdk_cpuset_set_cpu(&config->cpumask, _get_next_core(), true);
	config->bs = g_io_size;
	config->iodepth = g_queue_depth;
	config->rwmixread = g_rw_percentage;
	config->offset = offset;
	config->length = range;
	config->rw = parse_rw(g_workload_type, BDEVPERF_CONFIG_ERROR);
	if ((int)config->rw == BDEVPERF_CONFIG_ERROR) {
		free(config);
		return -EINVAL;
	}

	TAILQ_INSERT_TAIL(&job_config_list, config, link);
	return 0;
}

static void
bdevperf_construct_multithread_job_configs(void)
{
	struct spdk_bdev *bdev;
	uint32_t i;
	uint32_t num_cores;
	uint64_t blocks_per_job;
	int64_t offset;

	num_cores = 0;
	SPDK_ENV_FOREACH_CORE(i) {
		num_cores++;
	}

	if (num_cores == 0) {
		g_run_rc = -EINVAL;
		return;
	}

	if (g_job_bdev_name != NULL) {
		bdev = spdk_bdev_get_by_name(g_job_bdev_name);
		if (!bdev) {
			fprintf(stderr, "Unable to find bdev '%s'\n", g_job_bdev_name);
			return;
		}

		blocks_per_job = spdk_bdev_get_num_blocks(bdev) / num_cores;
		offset = 0;

		SPDK_ENV_FOREACH_CORE(i) {
			g_run_rc = make_cli_job_config(g_job_bdev_name, offset, blocks_per_job);
			if (g_run_rc) {
				return;
			}

			offset += blocks_per_job;
		}
	} else {
		bdev = spdk_bdev_first_leaf();
		while (bdev != NULL) {
			blocks_per_job = spdk_bdev_get_num_blocks(bdev) / num_cores;
			offset = 0;

			SPDK_ENV_FOREACH_CORE(i) {
				g_run_rc = make_cli_job_config(spdk_bdev_get_name(bdev),
							       offset, blocks_per_job);
				if (g_run_rc) {
					return;
				}

				offset += blocks_per_job;
			}

			bdev = spdk_bdev_next_leaf(bdev);
		}
	}
}
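
/*
 * Illustrative example of the multithread split above: a 4-core mask and a
 * bdev of 1,000,000 blocks produce four jobs covering offsets 0, 250000,
 * 500000 and 750000, each 250000 blocks long and pinned to its own core.
 * Any remainder blocks from the integer division are simply left unused.
 */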
static void
bdevperf_construct_job_configs(void)
{
	struct spdk_bdev *bdev;

	/* There are three different modes for allocating jobs. Standard mode
	 * (the default) creates one spdk_thread per bdev and runs the I/O job there.
	 *
	 * The -C flag places bdevperf into "multithread" mode, meaning it creates
	 * one spdk_thread per bdev PER CORE, and runs a copy of the job on each.
	 * This effectively runs multiple threads per bdev.
	 *
	 * The -j flag implies "FIO" mode, which tries to mimic the semantics of
	 * FIO jobs. In "FIO" mode, threads are spawned per-job instead of per-bdev.
	 * Each FIO job can be individually parameterized by filename, cpu mask, etc.,
	 * unlike the other modes, which only support global options.
	 */

	if (g_bdevperf_conf) {
		goto end;
	} else if (g_multithread_mode) {
		bdevperf_construct_multithread_job_configs();
		goto end;
	}

	if (g_job_bdev_name != NULL) {
		bdev = spdk_bdev_get_by_name(g_job_bdev_name);
		if (bdev) {
			/* Construct the job */
			g_run_rc = make_cli_job_config(g_job_bdev_name, 0, 0);
		} else {
			fprintf(stderr, "Unable to find bdev '%s'\n", g_job_bdev_name);
		}
	} else {
		bdev = spdk_bdev_first_leaf();

		while (bdev != NULL) {
			/* Construct the job */
			g_run_rc = make_cli_job_config(spdk_bdev_get_name(bdev), 0, 0);
			if (g_run_rc) {
				break;
			}

			bdev = spdk_bdev_next_leaf(bdev);
		}
	}

end:
	/* Increment initial construct_jobs count so that it will never reach 0 in the middle
	 * of iteration.
	 */
	g_construct_job_count = 1;

	if (g_run_rc == 0) {
		bdevperf_construct_jobs();
	}

	_bdevperf_construct_job_done(NULL);
}

static int
parse_uint_option(struct spdk_conf_section *s, const char *name, int def)
{
	const char *job_name;
	int tmp;

	tmp = spdk_conf_section_get_intval(s, name);
	if (tmp == -1) {
		/* Field was not found. Check default value
		 * In [global] section it is ok to have undefined values
		 * but for other sections it is not ok */
		if (def == BDEVPERF_CONFIG_UNDEFINED) {
			job_name = spdk_conf_section_get_name(s);
			if (strcmp(job_name, "global") == 0) {
				return def;
			}

			fprintf(stderr,
				"Job '%s' has no '%s' assigned\n",
				job_name, name);
			return BDEVPERF_CONFIG_ERROR;
		}
		return def;
	}

	/* NOTE: get_intval returns nonnegative on success */
	if (tmp < 0) {
		fprintf(stderr, "Job '%s' has bad '%s' value.\n",
			spdk_conf_section_get_name(s), name);
		return BDEVPERF_CONFIG_ERROR;
	}

	return tmp;
}

/* CLI arguments override parameters for global sections */
static void
config_set_cli_args(struct job_config *config)
{
	if (g_job_bdev_name) {
		config->filename = g_job_bdev_name;
	}
	if (g_io_size > 0) {
		config->bs = g_io_size;
	}
	if (g_queue_depth > 0) {
		config->iodepth = g_queue_depth;
	}
	if (g_rw_percentage > 0) {
		config->rwmixread = g_rw_percentage;
	}
	if (g_workload_type) {
		config->rw = parse_rw(g_workload_type, config->rw);
	}
}
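
/*
 * A minimal sketch of a job config file accepted by read_job_config() below
 * (INI-style, parsed with spdk_conf; the bdev names are illustrative). A
 * "filename" value may list several bdevs separated by ':'. [global] values
 * provide defaults for later sections and are themselves overridden by CLI
 * flags:
 *
 *	[global]
 *	filename=Malloc0:Malloc1
 *	bs=4096
 *	iodepth=128
 *
 *	[job0]
 *	rw=randread
 *
 *	[job1]
 *	rw=rw
 *	rwmixread=70
 */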

static int
read_job_config(void)
{
	struct job_config global_default_config;
	struct job_config global_config;
	struct spdk_conf_section *s;
	struct job_config *config;
	const char *cpumask;
	const char *rw;
	bool is_global;
	int n = 0;
	int val;

	if (g_bdevperf_conf_file == NULL) {
		return 0;
	}

	g_bdevperf_conf = spdk_conf_allocate();
	if (g_bdevperf_conf == NULL) {
		fprintf(stderr, "Could not allocate job config structure\n");
		return 1;
	}

	spdk_conf_disable_sections_merge(g_bdevperf_conf);
	if (spdk_conf_read(g_bdevperf_conf, g_bdevperf_conf_file)) {
		fprintf(stderr, "Invalid job config\n");
		return 1;
	}

	/* Initialize global defaults */
	global_default_config.filename = NULL;
	/* A zero mask is the same as g_all_cpuset.
	 * g_all_cpuset is not initialized yet,
	 * so use the zero mask as the default instead. */
	spdk_cpuset_zero(&global_default_config.cpumask);
	global_default_config.bs = BDEVPERF_CONFIG_UNDEFINED;
	global_default_config.iodepth = BDEVPERF_CONFIG_UNDEFINED;
	/* bdevperf has no default for the -M option, but in FIO the default is 50 */
	global_default_config.rwmixread = 50;
	global_default_config.offset = 0;
	/* length 0 means 100% */
	global_default_config.length = 0;
	global_default_config.rw = BDEVPERF_CONFIG_UNDEFINED;
	config_set_cli_args(&global_default_config);

	if ((int)global_default_config.rw == BDEVPERF_CONFIG_ERROR) {
		return 1;
	}

	/* There is only a single instance of the global job_config.
	 * We just reset its value when we encounter a new [global] section. */
	global_config = global_default_config;

	for (s = spdk_conf_first_section(g_bdevperf_conf);
	     s != NULL;
	     s = spdk_conf_next_section(s)) {
		config = calloc(1, sizeof(*config));
		if (config == NULL) {
			fprintf(stderr, "Unable to allocate memory for job config\n");
			return 1;
		}

		config->name = spdk_conf_section_get_name(s);
		is_global = strcmp(config->name, "global") == 0;

		if (is_global) {
			global_config = global_default_config;
		}

		config->filename = spdk_conf_section_get_val(s, "filename");
		if (config->filename == NULL) {
			config->filename = global_config.filename;
		}
		if (!is_global) {
			if (config->filename == NULL) {
				fprintf(stderr, "Job '%s' expects a 'filename' parameter\n", config->name);
				goto error;
			} else if (strnlen(config->filename, BDEVPERF_CONFIG_MAX_FILENAME)
				   >= BDEVPERF_CONFIG_MAX_FILENAME) {
				fprintf(stderr,
					"filename for job '%s' is too long. Max length is %d\n",
					config->name, BDEVPERF_CONFIG_MAX_FILENAME);
				goto error;
			}
		}

		cpumask = spdk_conf_section_get_val(s, "cpumask");
		if (cpumask == NULL) {
			config->cpumask = global_config.cpumask;
		} else if (spdk_cpuset_parse(&config->cpumask, cpumask)) {
			fprintf(stderr, "Job '%s' has bad 'cpumask' value\n", config->name);
			goto error;
		}

		config->bs = parse_uint_option(s, "bs", global_config.bs);
		if (config->bs == BDEVPERF_CONFIG_ERROR) {
			goto error;
		} else if (config->bs == 0) {
			fprintf(stderr, "'bs' of job '%s' must be greater than 0\n", config->name);
			goto error;
		}

		config->iodepth = parse_uint_option(s, "iodepth", global_config.iodepth);
		if (config->iodepth == BDEVPERF_CONFIG_ERROR) {
			goto error;
		} else if (config->iodepth == 0) {
			fprintf(stderr,
				"'iodepth' of job '%s' must be greater than 0\n",
				config->name);
			goto error;
		}

		config->rwmixread = parse_uint_option(s, "rwmixread", global_config.rwmixread);
		if (config->rwmixread == BDEVPERF_CONFIG_ERROR) {
			goto error;
		} else if (config->rwmixread > 100) {
			fprintf(stderr,
				"'rwmixread' value of job '%s' is not in the 0-100 range\n",
				config->name);
			goto error;
		}

		config->offset = parse_uint_option(s, "offset", global_config.offset);
		if (config->offset == BDEVPERF_CONFIG_ERROR) {
			goto error;
		}

		val = parse_uint_option(s, "length", global_config.length);
		if (val == BDEVPERF_CONFIG_ERROR) {
			goto error;
		}
		config->length = val;

		rw = spdk_conf_section_get_val(s, "rw");
		config->rw = parse_rw(rw, global_config.rw);
		if ((int)config->rw == BDEVPERF_CONFIG_ERROR) {
			fprintf(stderr, "Job '%s' has bad 'rw' value\n", config->name);
			goto error;
		} else if (!is_global && (int)config->rw == BDEVPERF_CONFIG_UNDEFINED) {
			fprintf(stderr, "Job '%s' has no 'rw' assigned\n", config->name);
			goto error;
		}

		if (is_global) {
			config_set_cli_args(config);
			global_config = *config;
			free(config);
		} else {
			TAILQ_INSERT_TAIL(&job_config_list, config, link);
			n++;
		}
	}

	printf("Using job config with %d jobs\n", n);
	return 0;
error:
	free(config);
	return 1;
}
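
/* Illustration of the value resolution implemented above (values are examples,
 * not defaults): an option set in a job section always wins; otherwise the job
 * inherits from the most recent [global] section, onto which the CLI arguments
 * are re-applied by config_set_cli_args(); otherwise the built-in defaults
 * apply. E.g. with "bs=4096" in [global] and "-o 8192" on the command line,
 * a job without its own "bs" runs with 8192, while a job that sets "bs=512"
 * keeps 512.
 */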

static void
bdevperf_run(void *arg1)
{
	uint32_t i;

	g_main_thread = spdk_get_thread();

	spdk_cpuset_zero(&g_all_cpuset);
	SPDK_ENV_FOREACH_CORE(i) {
		spdk_cpuset_set_cpu(&g_all_cpuset, i, true);
	}

	if (g_wait_for_tests) {
		/* Do not perform any tests until the RPC is received */
		return;
	}

	bdevperf_construct_job_configs();
}

static void
rpc_perform_tests_cb(void)
{
	struct spdk_json_write_ctx *w;
	struct spdk_jsonrpc_request *request = g_request;

	g_request = NULL;

	if (g_run_rc == 0) {
		w = spdk_jsonrpc_begin_result(request);
		spdk_json_write_uint32(w, g_run_rc);
		spdk_jsonrpc_end_result(request, w);
	} else {
		spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
						     "bdevperf failed with error %s", spdk_strerror(-g_run_rc));
	}

	/* Reset g_run_rc to 0 for the next test run. */
	g_run_rc = 0;

	/* Reset g_stats for the next test run, and restore the min_latency
	 * sentinel that the memset clears. */
	memset(&g_stats, 0, sizeof(g_stats));
	g_stats.min_latency = (double)UINT64_MAX;
}
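
/* For illustration only: when started with -z, a test run is triggered over
 * JSON-RPC with the "perform_tests" method registered below. The method takes
 * no parameters, e.g.:
 *
 *	request:  {"jsonrpc": "2.0", "method": "perform_tests", "id": 1}
 *	response: {"jsonrpc": "2.0", "id": 1, "result": 0}
 *
 * The response is deferred until the run completes and is sent from
 * rpc_perform_tests_cb() above; a failed run produces an error response
 * instead.
 */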

static void
rpc_perform_tests(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params)
{
	if (params != NULL) {
		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
						 "perform_tests method requires no parameters");
		return;
	}
	if (g_request != NULL) {
		fprintf(stderr, "Another test is already in progress.\n");
		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
						 spdk_strerror(-EINPROGRESS));
		return;
	}
	g_request = request;

	/* Only construct job configs at the first test run. */
	if (TAILQ_EMPTY(&job_config_list)) {
		bdevperf_construct_job_configs();
	} else {
		bdevperf_construct_jobs();
	}
}
SPDK_RPC_REGISTER("perform_tests", rpc_perform_tests, SPDK_RPC_RUNTIME)

static void
_bdevperf_job_drain(void *ctx)
{
	bdevperf_job_drain(ctx);
}

static void
spdk_bdevperf_shutdown_cb(void)
{
	struct bdevperf_job *job, *tmp;

	g_shutdown = true;

	if (g_bdevperf.running_jobs == 0) {
		bdevperf_test_done(NULL);
		return;
	}

	/* Iterate over the jobs to stop all I/O */
	TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, tmp) {
		spdk_thread_send_msg(job->thread, _bdevperf_job_drain, job);
	}
}

static int
bdevperf_parse_arg(int ch, char *arg)
{
	long long tmp;

	if (ch == 'w') {
		g_workload_type = optarg;
	} else if (ch == 'T') {
		g_job_bdev_name = optarg;
	} else if (ch == 'z') {
		g_wait_for_tests = true;
	} else if (ch == 'Z') {
		g_zcopy = true;
	} else if (ch == 'X') {
		g_abort = true;
	} else if (ch == 'C') {
		g_multithread_mode = true;
	} else if (ch == 'f') {
		g_continue_on_failure = true;
	} else if (ch == 'j') {
		g_bdevperf_conf_file = optarg;
	} else if (ch == 'F') {
		char *endptr;

		errno = 0;
		g_zipf_theta = strtod(optarg, &endptr);
		if (errno || optarg == endptr || g_zipf_theta < 0) {
			fprintf(stderr, "Illegal zipf theta value %s\n", optarg);
			return -EINVAL;
		}
	} else if (ch == 'l') {
		g_latency_display_level++;
	} else {
		tmp = spdk_strtoll(optarg, 10);
		if (tmp < 0) {
			fprintf(stderr, "Failed to parse the value of option -%c.\n", ch);
			return tmp;
		} else if (tmp >= INT_MAX) {
			fprintf(stderr, "Value of option -%c is too large.\n", ch);
			return -ERANGE;
		}

		switch (ch) {
		case 'q':
			g_queue_depth = tmp;
			break;
		case 'o':
			g_io_size = tmp;
			break;
		case 't':
			g_time_in_sec = tmp;
			break;
		case 'k':
			g_timeout_in_sec = tmp;
			break;
		case 'M':
			g_rw_percentage = tmp;
			g_mix_specified = true;
			break;
		case 'P':
			g_show_performance_ema_period = tmp;
			break;
		case 'S':
			g_show_performance_real_time = 1;
			g_show_performance_period_in_usec = tmp * SPDK_SEC_TO_USEC;
			break;
		default:
			return -EINVAL;
		}
	}
	return 0;
}
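
/* Worked example for the -P formula printed below (numbers are illustrative):
 * with -P 4, M = 2 / (4 + 1) = 0.4. If the previous EMA was 1000 IO/s and the
 * last period measured 2000 IO/s, then
 * EMA[i+1] = 2000 * 0.4 + (1 - 0.4) * 1000 = 1400 IO/s.
 */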

static void
bdevperf_usage(void)
{
	printf(" -q <depth> io depth\n");
	printf(" -o <size> io size in bytes\n");
	printf(" -w <type> io pattern type, must be one of (read, write, randread, randwrite, rw, randrw, verify, reset, unmap, flush, write_zeroes)\n");
	printf(" -t <time> time in seconds\n");
	printf(" -k <timeout> timeout in seconds to detect starved I/O (default is 0, which disables it)\n");
	printf(" -M <percent> rwmixread (100 for reads, 0 for writes)\n");
	printf(" -P <num> number of moving average periods\n");
	printf("\t\t(If set to n, show weighted mean of the previous n IO/s in real time)\n");
	printf("\t\t(Formula: M = 2 / (n + 1), EMA[i+1] = IO/s * M + (1 - M) * EMA[i])\n");
	printf("\t\t(only valid with -S)\n");
	printf(" -S <period> show performance result in real time every <period> seconds\n");
	printf(" -T <bdev> bdev to run against. Default: all available bdevs.\n");
	printf(" -f continue processing I/O even after failures\n");
	printf(" -F <zipf theta> use zipf distribution for random I/O\n");
	printf(" -Z enable using the zcopy bdev API for read or write I/O\n");
	printf(" -z start bdevperf, but wait for RPC to start tests\n");
	printf(" -X abort timed out I/O\n");
	printf(" -C enable every core to send I/Os to each bdev\n");
	printf(" -j <filename> use job config file\n");
	printf(" -l display latency histogram, default: disabled. -l displays a summary, -ll displays details\n");
}
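
/* For illustration only (bdev names and paths are made-up examples):
 *
 *	# 4KiB random reads at queue depth 128 for 60 seconds, on all bdevs:
 *	./bdevperf -q 128 -o 4096 -w randread -t 60
 *
 *	# The same workload on one bdev, with a live report every second:
 *	./bdevperf -q 128 -o 4096 -w randread -t 60 -T Malloc0 -S 1
 *
 *	# Start idle with a job file and trigger runs via the perform_tests RPC:
 *	./bdevperf -z -j ./bdevperf.jobs
 */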

static int
verify_test_params(struct spdk_app_opts *opts)
{
	/* When RPC is used for starting tests and
	 * no rpc_addr was configured for the app,
	 * use the default address. */
	if (g_wait_for_tests && opts->rpc_addr == NULL) {
		opts->rpc_addr = SPDK_DEFAULT_RPC_ADDR;
	}

	if (!g_bdevperf_conf_file && g_queue_depth <= 0) {
		goto out;
	}
	if (!g_bdevperf_conf_file && g_io_size <= 0) {
		goto out;
	}
	if (!g_bdevperf_conf_file && !g_workload_type) {
		goto out;
	}
	if (g_time_in_sec <= 0) {
		goto out;
	}
	g_time_in_usec = g_time_in_sec * SPDK_SEC_TO_USEC;

	if (g_timeout_in_sec < 0) {
		goto out;
	}

	if (g_abort && !g_timeout_in_sec) {
		printf("Timeout must be set for the abort option; ignoring -X\n");
	}

	if (g_show_performance_ema_period > 0 &&
	    g_show_performance_real_time == 0) {
		fprintf(stderr, "-P option must be specified with -S option\n");
		return 1;
	}

	if (g_io_size > SPDK_BDEV_LARGE_BUF_MAX_SIZE) {
		printf("I/O size of %d is greater than the zero copy threshold (%d).\n",
		       g_io_size, SPDK_BDEV_LARGE_BUF_MAX_SIZE);
		printf("The zero copy mechanism will not be used.\n");
		g_zcopy = false;
	}

	if (g_bdevperf_conf_file) {
		/* workload_type verification happens during config file parsing */
		return 0;
	}

	if (!strcmp(g_workload_type, "verify") ||
	    !strcmp(g_workload_type, "reset")) {
		g_rw_percentage = 50;
		if (g_io_size > SPDK_BDEV_LARGE_BUF_MAX_SIZE) {
			fprintf(stderr, "Unable to exceed max I/O size of %d for verify. (%d provided).\n",
				SPDK_BDEV_LARGE_BUF_MAX_SIZE, g_io_size);
			return 1;
		}
		g_verify = true;
		if (!strcmp(g_workload_type, "reset")) {
			g_reset = true;
		}
	}

	if (!strcmp(g_workload_type, "read") ||
	    !strcmp(g_workload_type, "randread") ||
	    !strcmp(g_workload_type, "write") ||
	    !strcmp(g_workload_type, "randwrite") ||
	    !strcmp(g_workload_type, "verify") ||
	    !strcmp(g_workload_type, "reset") ||
	    !strcmp(g_workload_type, "unmap") ||
	    !strcmp(g_workload_type, "write_zeroes") ||
	    !strcmp(g_workload_type, "flush")) {
		if (g_mix_specified) {
			fprintf(stderr, "Ignoring -M option... Please use the -M option"
				" only with rw or randrw.\n");
		}
	}

	if (!strcmp(g_workload_type, "rw") ||
	    !strcmp(g_workload_type, "randrw")) {
		if (g_rw_percentage < 0 || g_rw_percentage > 100) {
			fprintf(stderr,
				"-M must be set to a value from 0 to 100 "
				"for rw or randrw.\n");
			return 1;
		}
	}

	return 0;
out:
	spdk_app_usage();
	bdevperf_usage();
	return 1;
}

int
main(int argc, char **argv)
{
	struct spdk_app_opts opts = {};
	int rc;

	/* Use the runtime PID to set the random seed */
	srand(getpid());

	spdk_app_opts_init(&opts, sizeof(opts));
	opts.name = "bdevperf";
	opts.rpc_addr = NULL;
	opts.shutdown_cb = spdk_bdevperf_shutdown_cb;

	if ((rc = spdk_app_parse_args(argc, argv, &opts, "Zzfq:o:t:w:k:CF:M:P:S:T:Xlj:", NULL,
				      bdevperf_parse_arg, bdevperf_usage)) !=
	    SPDK_APP_PARSE_ARGS_SUCCESS) {
		return rc;
	}

	if (read_job_config()) {
		free_job_config();
		return 1;
	}

	if (verify_test_params(&opts) != 0) {
		free_job_config();
		exit(1);
	}

	rc = spdk_app_start(&opts, bdevperf_run, NULL);

	spdk_app_fini();
	free_job_config();
	return rc;
}