1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. 3 * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. 4 * All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/bdev.h" 10 #include "spdk/accel.h" 11 #include "spdk/endian.h" 12 #include "spdk/env.h" 13 #include "spdk/event.h" 14 #include "spdk/log.h" 15 #include "spdk/util.h" 16 #include "spdk/thread.h" 17 #include "spdk/string.h" 18 #include "spdk/rpc.h" 19 #include "spdk/bit_array.h" 20 #include "spdk/conf.h" 21 #include "spdk/zipf.h" 22 #include "spdk/histogram_data.h" 23 24 #define BDEVPERF_CONFIG_MAX_FILENAME 1024 25 #define BDEVPERF_CONFIG_UNDEFINED -1 26 #define BDEVPERF_CONFIG_ERROR -2 27 28 struct bdevperf_task { 29 struct iovec iov; 30 struct bdevperf_job *job; 31 struct spdk_bdev_io *bdev_io; 32 void *buf; 33 void *md_buf; 34 uint64_t offset_blocks; 35 struct bdevperf_task *task_to_abort; 36 enum spdk_bdev_io_type io_type; 37 TAILQ_ENTRY(bdevperf_task) link; 38 struct spdk_bdev_io_wait_entry bdev_io_wait; 39 }; 40 41 static const char *g_workload_type = NULL; 42 static int g_io_size = 0; 43 /* initialize to invalid value so we can detect if user overrides it. */ 44 static int g_rw_percentage = -1; 45 static bool g_verify = false; 46 static bool g_reset = false; 47 static bool g_continue_on_failure = false; 48 static bool g_abort = false; 49 static bool g_error_to_exit = false; 50 static int g_queue_depth = 0; 51 static uint64_t g_time_in_usec; 52 static int g_show_performance_real_time = 0; 53 static uint64_t g_show_performance_period_in_usec = 1000000; 54 static uint64_t g_show_performance_period_num = 0; 55 static uint64_t g_show_performance_ema_period = 0; 56 static int g_run_rc = 0; 57 static bool g_shutdown = false; 58 static uint64_t g_start_tsc; 59 static uint64_t g_shutdown_tsc; 60 static bool g_zcopy = false; 61 static struct spdk_thread *g_main_thread; 62 static int g_time_in_sec = 0; 63 static bool g_mix_specified = false; 64 static const char *g_job_bdev_name; 65 static bool g_wait_for_tests = false; 66 static struct spdk_jsonrpc_request *g_request = NULL; 67 static bool g_multithread_mode = false; 68 static int g_timeout_in_sec; 69 static struct spdk_conf *g_bdevperf_conf = NULL; 70 static const char *g_bdevperf_conf_file = NULL; 71 static double g_zipf_theta; 72 73 static struct spdk_cpuset g_all_cpuset; 74 static struct spdk_poller *g_perf_timer = NULL; 75 76 static void bdevperf_submit_single(struct bdevperf_job *job, struct bdevperf_task *task); 77 static void rpc_perform_tests_cb(void); 78 79 static uint32_t g_bdev_count = 0; 80 static uint32_t g_latency_display_level; 81 82 static const double g_latency_cutoffs[] = { 83 0.01, 84 0.10, 85 0.25, 86 0.50, 87 0.75, 88 0.90, 89 0.95, 90 0.98, 91 0.99, 92 0.995, 93 0.999, 94 0.9999, 95 0.99999, 96 0.999999, 97 0.9999999, 98 -1, 99 }; 100 101 struct latency_info { 102 uint64_t min; 103 uint64_t max; 104 uint64_t total; 105 }; 106 107 struct bdevperf_job { 108 char *name; 109 struct spdk_bdev *bdev; 110 struct spdk_bdev_desc *bdev_desc; 111 struct spdk_io_channel *ch; 112 TAILQ_ENTRY(bdevperf_job) link; 113 struct spdk_thread *thread; 114 115 const char *workload_type; 116 int io_size; 117 int rw_percentage; 118 bool is_random; 119 bool verify; 120 bool reset; 121 bool continue_on_failure; 122 bool unmap; 123 bool write_zeroes; 124 bool flush; 125 bool abort; 126 int queue_depth; 127 unsigned int seed; 128 129 uint64_t io_completed; 130 uint64_t io_failed; 131 uint64_t io_timeout; 132 uint64_t prev_io_completed; 133 double ema_io_per_second; 134 int current_queue_depth; 135 uint64_t size_in_ios; 136 uint64_t ios_base; 137 uint64_t offset_in_ios; 138 uint64_t io_size_blocks; 139 uint64_t buf_size; 140 uint32_t dif_check_flags; 141 bool is_draining; 142 struct spdk_poller *run_timer; 143 struct spdk_poller *reset_timer; 144 struct spdk_bit_array *outstanding; 145 struct spdk_zipf *zipf; 146 TAILQ_HEAD(, bdevperf_task) task_list; 147 uint64_t run_time_in_usec; 148 149 /* keep channel's histogram data before being destroyed */ 150 struct spdk_histogram_data *histogram; 151 }; 152 153 struct spdk_bdevperf { 154 TAILQ_HEAD(, bdevperf_job) jobs; 155 uint32_t running_jobs; 156 }; 157 158 static struct spdk_bdevperf g_bdevperf = { 159 .jobs = TAILQ_HEAD_INITIALIZER(g_bdevperf.jobs), 160 .running_jobs = 0, 161 }; 162 163 enum job_config_rw { 164 JOB_CONFIG_RW_READ = 0, 165 JOB_CONFIG_RW_WRITE, 166 JOB_CONFIG_RW_RANDREAD, 167 JOB_CONFIG_RW_RANDWRITE, 168 JOB_CONFIG_RW_RW, 169 JOB_CONFIG_RW_RANDRW, 170 JOB_CONFIG_RW_VERIFY, 171 JOB_CONFIG_RW_RESET, 172 JOB_CONFIG_RW_UNMAP, 173 JOB_CONFIG_RW_FLUSH, 174 JOB_CONFIG_RW_WRITE_ZEROES, 175 }; 176 177 /* Storing values from a section of job config file */ 178 struct job_config { 179 const char *name; 180 const char *filename; 181 struct spdk_cpuset cpumask; 182 int bs; 183 int iodepth; 184 int rwmixread; 185 int64_t offset; 186 uint64_t length; 187 enum job_config_rw rw; 188 TAILQ_ENTRY(job_config) link; 189 }; 190 191 TAILQ_HEAD(, job_config) job_config_list 192 = TAILQ_HEAD_INITIALIZER(job_config_list); 193 194 static bool g_performance_dump_active = false; 195 196 struct bdevperf_aggregate_stats { 197 struct bdevperf_job *current_job; 198 uint64_t io_time_in_usec; 199 uint64_t ema_period; 200 double total_io_per_second; 201 double total_mb_per_second; 202 double total_failed_per_second; 203 double total_timeout_per_second; 204 double min_latency; 205 double max_latency; 206 uint64_t total_io_completed; 207 uint64_t total_tsc; 208 }; 209 210 static struct bdevperf_aggregate_stats g_stats = {.min_latency = (double)UINT64_MAX}; 211 212 /* 213 * Cumulative Moving Average (CMA): average of all data up to current 214 * Exponential Moving Average (EMA): weighted mean of the previous n data and more weight is given to recent 215 * Simple Moving Average (SMA): unweighted mean of the previous n data 216 * 217 * Bdevperf supports CMA and EMA. 218 */ 219 static double 220 get_cma_io_per_second(struct bdevperf_job *job, uint64_t io_time_in_usec) 221 { 222 return (double)job->io_completed * 1000000 / io_time_in_usec; 223 } 224 225 static double 226 get_ema_io_per_second(struct bdevperf_job *job, uint64_t ema_period) 227 { 228 double io_completed, io_per_second; 229 230 io_completed = job->io_completed; 231 io_per_second = (double)(io_completed - job->prev_io_completed) * 1000000 232 / g_show_performance_period_in_usec; 233 job->prev_io_completed = io_completed; 234 235 job->ema_io_per_second += (io_per_second - job->ema_io_per_second) * 2 236 / (ema_period + 1); 237 return job->ema_io_per_second; 238 } 239 240 static void 241 get_avg_latency(void *ctx, uint64_t start, uint64_t end, uint64_t count, 242 uint64_t total, uint64_t so_far) 243 { 244 struct latency_info *latency_info = ctx; 245 246 if (count == 0) { 247 return; 248 } 249 250 latency_info->total += (start + end) / 2 * count; 251 252 if (so_far == count) { 253 latency_info->min = start; 254 } 255 256 if (so_far == total) { 257 latency_info->max = end; 258 } 259 } 260 261 static void 262 performance_dump_job(struct bdevperf_aggregate_stats *stats, struct bdevperf_job *job) 263 { 264 double io_per_second, mb_per_second, failed_per_second, timeout_per_second; 265 double average_latency = 0.0, min_latency, max_latency; 266 uint64_t time_in_usec; 267 uint64_t tsc_rate; 268 uint64_t total_io; 269 struct latency_info latency_info = {}; 270 271 printf("\r Job: %s (Core Mask 0x%s)\n", spdk_thread_get_name(job->thread), 272 spdk_cpuset_fmt(spdk_thread_get_cpumask(job->thread))); 273 274 if (job->io_failed > 0 && !job->reset && !job->continue_on_failure) { 275 printf("\r Job: %s ended in about %.2f seconds with error\n", 276 spdk_thread_get_name(job->thread), (double)job->run_time_in_usec / 1000000); 277 } 278 if (job->verify) { 279 printf("\t Verification LBA range: start 0x%" PRIx64 " length 0x%" PRIx64 "\n", 280 job->ios_base, job->size_in_ios); 281 } 282 283 if (g_performance_dump_active == true) { 284 /* Use job's actual run time as Job has ended */ 285 if (job->io_failed > 0 && !job->continue_on_failure) { 286 time_in_usec = job->run_time_in_usec; 287 } else { 288 time_in_usec = stats->io_time_in_usec; 289 } 290 } else { 291 time_in_usec = job->run_time_in_usec; 292 } 293 294 if (stats->ema_period == 0) { 295 io_per_second = get_cma_io_per_second(job, time_in_usec); 296 } else { 297 io_per_second = get_ema_io_per_second(job, stats->ema_period); 298 } 299 300 tsc_rate = spdk_get_ticks_hz(); 301 mb_per_second = io_per_second * job->io_size / (1024 * 1024); 302 303 spdk_histogram_data_iterate(job->histogram, get_avg_latency, &latency_info); 304 305 total_io = job->io_completed + job->io_failed; 306 if (total_io != 0) { 307 average_latency = (double)latency_info.total / total_io * 1000 * 1000 / tsc_rate; 308 } 309 min_latency = (double)latency_info.min * 1000 * 1000 / tsc_rate; 310 max_latency = (double)latency_info.max * 1000 * 1000 / tsc_rate; 311 312 failed_per_second = (double)job->io_failed * 1000000 / time_in_usec; 313 timeout_per_second = (double)job->io_timeout * 1000000 / time_in_usec; 314 315 printf("\t %-20s: %10.2f %10.2f %10.2f", 316 job->name, (float)time_in_usec / 1000000, io_per_second, mb_per_second); 317 printf(" %10.2f %8.2f", 318 failed_per_second, timeout_per_second); 319 printf(" %10.2f %10.2f %10.2f\n", 320 average_latency, min_latency, max_latency); 321 322 stats->total_io_per_second += io_per_second; 323 stats->total_mb_per_second += mb_per_second; 324 stats->total_failed_per_second += failed_per_second; 325 stats->total_timeout_per_second += timeout_per_second; 326 stats->total_io_completed += job->io_completed + job->io_failed; 327 stats->total_tsc += latency_info.total; 328 if (min_latency < stats->min_latency) { 329 stats->min_latency = min_latency; 330 } 331 if (max_latency > stats->max_latency) { 332 stats->max_latency = max_latency; 333 } 334 } 335 336 static void 337 generate_data(void *buf, int buf_len, int block_size, void *md_buf, int md_size, 338 int num_blocks) 339 { 340 int offset_blocks = 0, md_offset, data_block_size, inner_offset; 341 342 if (buf_len < num_blocks * block_size) { 343 return; 344 } 345 346 if (md_buf == NULL) { 347 data_block_size = block_size - md_size; 348 md_buf = (char *)buf + data_block_size; 349 md_offset = block_size; 350 } else { 351 data_block_size = block_size; 352 md_offset = md_size; 353 } 354 355 while (offset_blocks < num_blocks) { 356 inner_offset = 0; 357 while (inner_offset < data_block_size) { 358 *(uint32_t *)buf = offset_blocks + inner_offset; 359 inner_offset += sizeof(uint32_t); 360 buf += sizeof(uint32_t); 361 } 362 memset(md_buf, offset_blocks, md_size); 363 md_buf += md_offset; 364 offset_blocks++; 365 } 366 } 367 368 static bool 369 copy_data(void *wr_buf, int wr_buf_len, void *rd_buf, int rd_buf_len, int block_size, 370 void *wr_md_buf, void *rd_md_buf, int md_size, int num_blocks) 371 { 372 if (wr_buf_len < num_blocks * block_size || rd_buf_len < num_blocks * block_size) { 373 return false; 374 } 375 376 assert((wr_md_buf != NULL) == (rd_md_buf != NULL)); 377 378 memcpy(wr_buf, rd_buf, block_size * num_blocks); 379 380 if (wr_md_buf != NULL) { 381 memcpy(wr_md_buf, rd_md_buf, md_size * num_blocks); 382 } 383 384 return true; 385 } 386 387 static bool 388 verify_data(void *wr_buf, int wr_buf_len, void *rd_buf, int rd_buf_len, int block_size, 389 void *wr_md_buf, void *rd_md_buf, int md_size, int num_blocks, bool md_check) 390 { 391 int offset_blocks = 0, md_offset, data_block_size; 392 393 if (wr_buf_len < num_blocks * block_size || rd_buf_len < num_blocks * block_size) { 394 return false; 395 } 396 397 assert((wr_md_buf != NULL) == (rd_md_buf != NULL)); 398 399 if (wr_md_buf == NULL) { 400 data_block_size = block_size - md_size; 401 wr_md_buf = (char *)wr_buf + data_block_size; 402 rd_md_buf = (char *)rd_buf + data_block_size; 403 md_offset = block_size; 404 } else { 405 data_block_size = block_size; 406 md_offset = md_size; 407 } 408 409 while (offset_blocks < num_blocks) { 410 if (memcmp(wr_buf, rd_buf, data_block_size) != 0) { 411 return false; 412 } 413 414 wr_buf += block_size; 415 rd_buf += block_size; 416 417 if (md_check) { 418 if (memcmp(wr_md_buf, rd_md_buf, md_size) != 0) { 419 return false; 420 } 421 422 wr_md_buf += md_offset; 423 rd_md_buf += md_offset; 424 } 425 426 offset_blocks++; 427 } 428 429 return true; 430 } 431 432 static void 433 free_job_config(void) 434 { 435 struct job_config *config, *tmp; 436 437 spdk_conf_free(g_bdevperf_conf); 438 g_bdevperf_conf = NULL; 439 440 TAILQ_FOREACH_SAFE(config, &job_config_list, link, tmp) { 441 TAILQ_REMOVE(&job_config_list, config, link); 442 free(config); 443 } 444 } 445 446 static void 447 bdevperf_job_free(struct bdevperf_job *job) 448 { 449 spdk_histogram_data_free(job->histogram); 450 spdk_bit_array_free(&job->outstanding); 451 spdk_zipf_free(&job->zipf); 452 free(job->name); 453 free(job); 454 } 455 456 static void 457 job_thread_exit(void *ctx) 458 { 459 spdk_thread_exit(spdk_get_thread()); 460 } 461 462 static void 463 check_cutoff(void *ctx, uint64_t start, uint64_t end, uint64_t count, 464 uint64_t total, uint64_t so_far) 465 { 466 double so_far_pct; 467 double **cutoff = ctx; 468 uint64_t tsc_rate; 469 470 if (count == 0) { 471 return; 472 } 473 474 tsc_rate = spdk_get_ticks_hz(); 475 so_far_pct = (double)so_far / total; 476 while (so_far_pct >= **cutoff && **cutoff > 0) { 477 printf("%9.5f%% : %9.3fus\n", **cutoff * 100, (double)end * 1000 * 1000 / tsc_rate); 478 (*cutoff)++; 479 } 480 } 481 482 static void 483 print_bucket(void *ctx, uint64_t start, uint64_t end, uint64_t count, 484 uint64_t total, uint64_t so_far) 485 { 486 double so_far_pct; 487 uint64_t tsc_rate; 488 489 if (count == 0) { 490 return; 491 } 492 493 tsc_rate = spdk_get_ticks_hz(); 494 so_far_pct = (double)so_far * 100 / total; 495 printf("%9.3f - %9.3f: %9.4f%% (%9ju)\n", 496 (double)start * 1000 * 1000 / tsc_rate, 497 (double)end * 1000 * 1000 / tsc_rate, 498 so_far_pct, count); 499 } 500 501 static void 502 bdevperf_test_done(void *ctx) 503 { 504 struct bdevperf_job *job, *jtmp; 505 struct bdevperf_task *task, *ttmp; 506 double average_latency = 0.0; 507 uint64_t time_in_usec; 508 int rc; 509 510 if (g_time_in_usec) { 511 g_stats.io_time_in_usec = g_time_in_usec; 512 513 if (!g_run_rc && g_performance_dump_active) { 514 spdk_thread_send_msg(spdk_get_thread(), bdevperf_test_done, NULL); 515 return; 516 } 517 } 518 519 if (g_show_performance_real_time) { 520 spdk_poller_unregister(&g_perf_timer); 521 } 522 523 if (g_shutdown) { 524 g_shutdown_tsc = spdk_get_ticks() - g_start_tsc; 525 time_in_usec = g_shutdown_tsc * 1000000 / spdk_get_ticks_hz(); 526 g_time_in_usec = (g_time_in_usec > time_in_usec) ? time_in_usec : g_time_in_usec; 527 printf("Received shutdown signal, test time was about %.6f seconds\n", 528 (double)g_time_in_usec / 1000000); 529 } 530 531 printf("\n%*s\n", 107, "Latency(us)"); 532 printf("\r %-*s: %10s %10s %10s %10s %8s %10s %10s %10s\n", 533 28, "Device Information", "runtime(s)", "IOPS", "MiB/s", "Fail/s", "TO/s", "Average", "min", "max"); 534 535 TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, jtmp) { 536 performance_dump_job(&g_stats, job); 537 } 538 539 printf("\r ==================================================================================" 540 "=================================\n"); 541 printf("\r %-28s: %10s %10.2f %10.2f", 542 "Total", "", g_stats.total_io_per_second, g_stats.total_mb_per_second); 543 printf(" %10.2f %8.2f", 544 g_stats.total_failed_per_second, g_stats.total_timeout_per_second); 545 546 if (g_stats.total_io_completed != 0) { 547 average_latency = ((double)g_stats.total_tsc / g_stats.total_io_completed) * 1000 * 1000 / 548 spdk_get_ticks_hz(); 549 } 550 printf(" %10.2f %10.2f %10.2f\n", average_latency, g_stats.min_latency, g_stats.max_latency); 551 552 fflush(stdout); 553 554 if (g_latency_display_level == 0 || g_stats.total_io_completed == 0) { 555 goto clean; 556 } 557 558 printf("\n Latency summary\n"); 559 TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, jtmp) { 560 printf("\r =============================================\n"); 561 printf("\r Job: %s (Core Mask 0x%s)\n", spdk_thread_get_name(job->thread), 562 spdk_cpuset_fmt(spdk_thread_get_cpumask(job->thread))); 563 564 const double *cutoff = g_latency_cutoffs; 565 566 spdk_histogram_data_iterate(job->histogram, check_cutoff, &cutoff); 567 568 printf("\n"); 569 } 570 571 if (g_latency_display_level == 1) { 572 goto clean; 573 } 574 575 printf("\r Latency histogram\n"); 576 TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, jtmp) { 577 printf("\r =============================================\n"); 578 printf("\r Job: %s (Core Mask 0x%s)\n", spdk_thread_get_name(job->thread), 579 spdk_cpuset_fmt(spdk_thread_get_cpumask(job->thread))); 580 581 spdk_histogram_data_iterate(job->histogram, print_bucket, NULL); 582 printf("\n"); 583 } 584 585 clean: 586 TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, jtmp) { 587 TAILQ_REMOVE(&g_bdevperf.jobs, job, link); 588 589 spdk_thread_send_msg(job->thread, job_thread_exit, NULL); 590 591 TAILQ_FOREACH_SAFE(task, &job->task_list, link, ttmp) { 592 TAILQ_REMOVE(&job->task_list, task, link); 593 spdk_free(task->buf); 594 spdk_free(task->md_buf); 595 free(task); 596 } 597 598 bdevperf_job_free(job); 599 } 600 601 rc = g_run_rc; 602 if (g_request && !g_shutdown) { 603 rpc_perform_tests_cb(); 604 if (rc != 0) { 605 spdk_app_stop(rc); 606 } 607 } else { 608 spdk_app_stop(rc); 609 } 610 } 611 612 static void 613 bdevperf_job_end(void *ctx) 614 { 615 assert(g_main_thread == spdk_get_thread()); 616 617 if (--g_bdevperf.running_jobs == 0) { 618 bdevperf_test_done(NULL); 619 } 620 } 621 622 static void 623 bdevperf_channel_get_histogram_cb(void *cb_arg, int status, struct spdk_histogram_data *histogram) 624 { 625 struct spdk_histogram_data *job_hist = cb_arg; 626 spdk_histogram_data_merge(job_hist, histogram); 627 } 628 629 static void 630 bdevperf_end_task(struct bdevperf_task *task) 631 { 632 struct bdevperf_job *job = task->job; 633 uint64_t end_tsc = 0; 634 635 TAILQ_INSERT_TAIL(&job->task_list, task, link); 636 if (job->is_draining) { 637 if (job->current_queue_depth == 0) { 638 end_tsc = spdk_get_ticks() - g_start_tsc; 639 job->run_time_in_usec = end_tsc * 1000000 / spdk_get_ticks_hz(); 640 641 /* keep histogram info before channel is destroyed */ 642 spdk_bdev_channel_get_histogram(job->bdev, job->ch, bdevperf_channel_get_histogram_cb, 643 job->histogram); 644 645 spdk_put_io_channel(job->ch); 646 spdk_bdev_close(job->bdev_desc); 647 spdk_thread_send_msg(g_main_thread, bdevperf_job_end, NULL); 648 } 649 } 650 } 651 652 static void 653 bdevperf_queue_io_wait_with_cb(struct bdevperf_task *task, spdk_bdev_io_wait_cb cb_fn) 654 { 655 struct bdevperf_job *job = task->job; 656 657 task->bdev_io_wait.bdev = job->bdev; 658 task->bdev_io_wait.cb_fn = cb_fn; 659 task->bdev_io_wait.cb_arg = task; 660 spdk_bdev_queue_io_wait(job->bdev, job->ch, &task->bdev_io_wait); 661 } 662 663 static int 664 bdevperf_job_drain(void *ctx) 665 { 666 struct bdevperf_job *job = ctx; 667 668 spdk_poller_unregister(&job->run_timer); 669 if (job->reset) { 670 spdk_poller_unregister(&job->reset_timer); 671 } 672 673 job->is_draining = true; 674 675 return -1; 676 } 677 678 static void 679 bdevperf_abort_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 680 { 681 struct bdevperf_task *task = cb_arg; 682 struct bdevperf_job *job = task->job; 683 684 job->current_queue_depth--; 685 686 if (success) { 687 job->io_completed++; 688 } else { 689 job->io_failed++; 690 if (!job->continue_on_failure) { 691 bdevperf_job_drain(job); 692 g_run_rc = -1; 693 } 694 } 695 696 spdk_bdev_free_io(bdev_io); 697 bdevperf_end_task(task); 698 } 699 700 static int 701 bdevperf_verify_dif(struct bdevperf_task *task, struct iovec *iovs, int iovcnt) 702 { 703 struct bdevperf_job *job = task->job; 704 struct spdk_bdev *bdev = job->bdev; 705 struct spdk_dif_ctx dif_ctx; 706 struct spdk_dif_error err_blk = {}; 707 int rc; 708 709 rc = spdk_dif_ctx_init(&dif_ctx, 710 spdk_bdev_get_block_size(bdev), 711 spdk_bdev_get_md_size(bdev), 712 spdk_bdev_is_md_interleaved(bdev), 713 spdk_bdev_is_dif_head_of_md(bdev), 714 spdk_bdev_get_dif_type(bdev), 715 job->dif_check_flags, 716 task->offset_blocks, 0, 0, 0, 0); 717 if (rc != 0) { 718 fprintf(stderr, "Initialization of DIF context failed\n"); 719 return rc; 720 } 721 722 if (spdk_bdev_is_md_interleaved(bdev)) { 723 rc = spdk_dif_verify(iovs, iovcnt, job->io_size_blocks, &dif_ctx, &err_blk); 724 } else { 725 struct iovec md_iov = { 726 .iov_base = task->md_buf, 727 .iov_len = spdk_bdev_get_md_size(bdev) * job->io_size_blocks, 728 }; 729 730 rc = spdk_dix_verify(iovs, iovcnt, &md_iov, job->io_size_blocks, &dif_ctx, &err_blk); 731 } 732 733 if (rc != 0) { 734 fprintf(stderr, "DIF/DIX error detected. type=%d, offset=%" PRIu32 "\n", 735 err_blk.err_type, err_blk.err_offset); 736 } 737 738 return rc; 739 } 740 741 static void 742 bdevperf_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 743 { 744 struct bdevperf_job *job; 745 struct bdevperf_task *task = cb_arg; 746 struct iovec *iovs; 747 int iovcnt; 748 bool md_check; 749 uint64_t offset_in_ios; 750 int rc; 751 752 job = task->job; 753 md_check = spdk_bdev_get_dif_type(job->bdev) == SPDK_DIF_DISABLE; 754 755 if (g_error_to_exit == true) { 756 bdevperf_job_drain(job); 757 } else if (!success) { 758 if (!job->reset && !job->continue_on_failure) { 759 bdevperf_job_drain(job); 760 g_run_rc = -1; 761 g_error_to_exit = true; 762 printf("task offset: %" PRIu64 " on job bdev=%s fails\n", 763 task->offset_blocks, job->name); 764 } 765 } else if (job->verify || job->reset) { 766 spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt); 767 assert(iovcnt == 1); 768 assert(iovs != NULL); 769 if (!verify_data(task->buf, job->buf_size, iovs[0].iov_base, iovs[0].iov_len, 770 spdk_bdev_get_block_size(job->bdev), 771 task->md_buf, spdk_bdev_io_get_md_buf(bdev_io), 772 spdk_bdev_get_md_size(job->bdev), 773 job->io_size_blocks, md_check)) { 774 printf("Buffer mismatch! Target: %s Disk Offset: %" PRIu64 "\n", job->name, task->offset_blocks); 775 printf(" First dword expected 0x%x got 0x%x\n", *(int *)task->buf, *(int *)iovs[0].iov_base); 776 bdevperf_job_drain(job); 777 g_run_rc = -1; 778 } 779 } else if (job->dif_check_flags != 0) { 780 if (task->io_type == SPDK_BDEV_IO_TYPE_READ && spdk_bdev_get_md_size(job->bdev) != 0) { 781 spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt); 782 assert(iovcnt == 1); 783 assert(iovs != NULL); 784 rc = bdevperf_verify_dif(task, iovs, iovcnt); 785 if (rc != 0) { 786 printf("DIF error detected. task offset: %" PRIu64 " on job bdev=%s\n", 787 task->offset_blocks, job->name); 788 789 success = false; 790 if (!job->reset && !job->continue_on_failure) { 791 bdevperf_job_drain(job); 792 g_run_rc = -1; 793 g_error_to_exit = true; 794 } 795 } 796 } 797 } 798 799 job->current_queue_depth--; 800 801 if (success) { 802 job->io_completed++; 803 } else { 804 job->io_failed++; 805 } 806 807 if (job->verify) { 808 assert(task->offset_blocks / job->io_size_blocks >= job->ios_base); 809 offset_in_ios = task->offset_blocks / job->io_size_blocks - job->ios_base; 810 811 assert(spdk_bit_array_get(job->outstanding, offset_in_ios) == true); 812 spdk_bit_array_clear(job->outstanding, offset_in_ios); 813 } 814 815 spdk_bdev_free_io(bdev_io); 816 817 /* 818 * is_draining indicates when time has expired for the test run 819 * and we are just waiting for the previously submitted I/O 820 * to complete. In this case, do not submit a new I/O to replace 821 * the one just completed. 822 */ 823 if (!job->is_draining) { 824 bdevperf_submit_single(job, task); 825 } else { 826 bdevperf_end_task(task); 827 } 828 } 829 830 static void 831 bdevperf_verify_submit_read(void *cb_arg) 832 { 833 struct bdevperf_job *job; 834 struct bdevperf_task *task = cb_arg; 835 int rc; 836 837 job = task->job; 838 839 /* Read the data back in */ 840 rc = spdk_bdev_read_blocks_with_md(job->bdev_desc, job->ch, NULL, NULL, 841 task->offset_blocks, job->io_size_blocks, 842 bdevperf_complete, task); 843 844 if (rc == -ENOMEM) { 845 bdevperf_queue_io_wait_with_cb(task, bdevperf_verify_submit_read); 846 } else if (rc != 0) { 847 printf("Failed to submit read: %d\n", rc); 848 bdevperf_job_drain(job); 849 g_run_rc = rc; 850 } 851 } 852 853 static void 854 bdevperf_verify_write_complete(struct spdk_bdev_io *bdev_io, bool success, 855 void *cb_arg) 856 { 857 if (success) { 858 spdk_bdev_free_io(bdev_io); 859 bdevperf_verify_submit_read(cb_arg); 860 } else { 861 bdevperf_complete(bdev_io, success, cb_arg); 862 } 863 } 864 865 static void 866 bdevperf_zcopy_populate_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 867 { 868 if (!success) { 869 bdevperf_complete(bdev_io, success, cb_arg); 870 return; 871 } 872 873 spdk_bdev_zcopy_end(bdev_io, false, bdevperf_complete, cb_arg); 874 } 875 876 static int 877 bdevperf_generate_dif(struct bdevperf_task *task) 878 { 879 struct bdevperf_job *job = task->job; 880 struct spdk_bdev *bdev = job->bdev; 881 struct spdk_dif_ctx dif_ctx; 882 int rc; 883 884 rc = spdk_dif_ctx_init(&dif_ctx, 885 spdk_bdev_get_block_size(bdev), 886 spdk_bdev_get_md_size(bdev), 887 spdk_bdev_is_md_interleaved(bdev), 888 spdk_bdev_is_dif_head_of_md(bdev), 889 spdk_bdev_get_dif_type(bdev), 890 job->dif_check_flags, 891 task->offset_blocks, 0, 0, 0, 0); 892 if (rc != 0) { 893 fprintf(stderr, "Initialization of DIF context failed\n"); 894 return rc; 895 } 896 897 if (spdk_bdev_is_md_interleaved(bdev)) { 898 rc = spdk_dif_generate(&task->iov, 1, job->io_size_blocks, &dif_ctx); 899 } else { 900 struct iovec md_iov = { 901 .iov_base = task->md_buf, 902 .iov_len = spdk_bdev_get_md_size(bdev) * job->io_size_blocks, 903 }; 904 905 rc = spdk_dix_generate(&task->iov, 1, &md_iov, job->io_size_blocks, &dif_ctx); 906 } 907 908 if (rc != 0) { 909 fprintf(stderr, "Generation of DIF/DIX failed\n"); 910 } 911 912 return rc; 913 } 914 915 static void 916 bdevperf_submit_task(void *arg) 917 { 918 struct bdevperf_task *task = arg; 919 struct bdevperf_job *job = task->job; 920 struct spdk_bdev_desc *desc; 921 struct spdk_io_channel *ch; 922 spdk_bdev_io_completion_cb cb_fn; 923 uint64_t offset_in_ios; 924 int rc = 0; 925 926 desc = job->bdev_desc; 927 ch = job->ch; 928 929 switch (task->io_type) { 930 case SPDK_BDEV_IO_TYPE_WRITE: 931 if (spdk_bdev_get_md_size(job->bdev) != 0 && job->dif_check_flags != 0) { 932 rc = bdevperf_generate_dif(task); 933 } 934 if (rc == 0) { 935 cb_fn = (job->verify || job->reset) ? bdevperf_verify_write_complete : bdevperf_complete; 936 937 if (g_zcopy) { 938 spdk_bdev_zcopy_end(task->bdev_io, true, cb_fn, task); 939 return; 940 } else { 941 rc = spdk_bdev_writev_blocks_with_md(desc, ch, &task->iov, 1, 942 task->md_buf, 943 task->offset_blocks, 944 job->io_size_blocks, 945 cb_fn, task); 946 } 947 } 948 break; 949 case SPDK_BDEV_IO_TYPE_FLUSH: 950 rc = spdk_bdev_flush_blocks(desc, ch, task->offset_blocks, 951 job->io_size_blocks, bdevperf_complete, task); 952 break; 953 case SPDK_BDEV_IO_TYPE_UNMAP: 954 rc = spdk_bdev_unmap_blocks(desc, ch, task->offset_blocks, 955 job->io_size_blocks, bdevperf_complete, task); 956 break; 957 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 958 rc = spdk_bdev_write_zeroes_blocks(desc, ch, task->offset_blocks, 959 job->io_size_blocks, bdevperf_complete, task); 960 break; 961 case SPDK_BDEV_IO_TYPE_READ: 962 if (g_zcopy) { 963 rc = spdk_bdev_zcopy_start(desc, ch, NULL, 0, task->offset_blocks, job->io_size_blocks, 964 true, bdevperf_zcopy_populate_complete, task); 965 } else { 966 rc = spdk_bdev_read_blocks_with_md(desc, ch, task->buf, task->md_buf, 967 task->offset_blocks, 968 job->io_size_blocks, 969 bdevperf_complete, task); 970 } 971 break; 972 case SPDK_BDEV_IO_TYPE_ABORT: 973 rc = spdk_bdev_abort(desc, ch, task->task_to_abort, bdevperf_abort_complete, task); 974 break; 975 default: 976 assert(false); 977 rc = -EINVAL; 978 break; 979 } 980 981 if (rc == -ENOMEM) { 982 bdevperf_queue_io_wait_with_cb(task, bdevperf_submit_task); 983 return; 984 } else if (rc != 0) { 985 printf("Failed to submit bdev_io: %d\n", rc); 986 if (job->verify) { 987 assert(task->offset_blocks / job->io_size_blocks >= job->ios_base); 988 offset_in_ios = task->offset_blocks / job->io_size_blocks - job->ios_base; 989 990 assert(spdk_bit_array_get(job->outstanding, offset_in_ios) == true); 991 spdk_bit_array_clear(job->outstanding, offset_in_ios); 992 } 993 bdevperf_job_drain(job); 994 g_run_rc = rc; 995 return; 996 } 997 998 job->current_queue_depth++; 999 } 1000 1001 static void 1002 bdevperf_zcopy_get_buf_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 1003 { 1004 struct bdevperf_task *task = cb_arg; 1005 struct bdevperf_job *job = task->job; 1006 struct iovec *iovs; 1007 int iovcnt; 1008 1009 if (!success) { 1010 bdevperf_job_drain(job); 1011 g_run_rc = -1; 1012 return; 1013 } 1014 1015 task->bdev_io = bdev_io; 1016 task->io_type = SPDK_BDEV_IO_TYPE_WRITE; 1017 1018 if (job->verify || job->reset) { 1019 /* When job->verify or job->reset is enabled, task->buf is used for 1020 * verification of read after write. For write I/O, when zcopy APIs 1021 * are used, task->buf cannot be used, and data must be written to 1022 * the data buffer allocated underneath bdev layer instead. 1023 * Hence we copy task->buf to the allocated data buffer here. 1024 */ 1025 spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt); 1026 assert(iovcnt == 1); 1027 assert(iovs != NULL); 1028 1029 copy_data(iovs[0].iov_base, iovs[0].iov_len, task->buf, job->buf_size, 1030 spdk_bdev_get_block_size(job->bdev), 1031 spdk_bdev_io_get_md_buf(bdev_io), task->md_buf, 1032 spdk_bdev_get_md_size(job->bdev), job->io_size_blocks); 1033 } 1034 1035 bdevperf_submit_task(task); 1036 } 1037 1038 static void 1039 bdevperf_prep_zcopy_write_task(void *arg) 1040 { 1041 struct bdevperf_task *task = arg; 1042 struct bdevperf_job *job = task->job; 1043 int rc; 1044 1045 rc = spdk_bdev_zcopy_start(job->bdev_desc, job->ch, NULL, 0, 1046 task->offset_blocks, job->io_size_blocks, 1047 false, bdevperf_zcopy_get_buf_complete, task); 1048 if (rc != 0) { 1049 assert(rc == -ENOMEM); 1050 bdevperf_queue_io_wait_with_cb(task, bdevperf_prep_zcopy_write_task); 1051 return; 1052 } 1053 1054 job->current_queue_depth++; 1055 } 1056 1057 static struct bdevperf_task * 1058 bdevperf_job_get_task(struct bdevperf_job *job) 1059 { 1060 struct bdevperf_task *task; 1061 1062 task = TAILQ_FIRST(&job->task_list); 1063 if (!task) { 1064 printf("Task allocation failed\n"); 1065 abort(); 1066 } 1067 1068 TAILQ_REMOVE(&job->task_list, task, link); 1069 return task; 1070 } 1071 1072 static void 1073 bdevperf_submit_single(struct bdevperf_job *job, struct bdevperf_task *task) 1074 { 1075 uint64_t offset_in_ios; 1076 1077 if (job->zipf) { 1078 offset_in_ios = spdk_zipf_generate(job->zipf); 1079 } else if (job->is_random) { 1080 offset_in_ios = rand_r(&job->seed) % job->size_in_ios; 1081 } else { 1082 offset_in_ios = job->offset_in_ios++; 1083 if (job->offset_in_ios == job->size_in_ios) { 1084 job->offset_in_ios = 0; 1085 } 1086 1087 /* Increment of offset_in_ios if there's already an outstanding IO 1088 * to that location. We only need this with job->verify as random 1089 * offsets are not supported with job->verify at this time. 1090 */ 1091 if (job->verify) { 1092 assert(spdk_bit_array_find_first_clear(job->outstanding, 0) != UINT32_MAX); 1093 1094 while (spdk_bit_array_get(job->outstanding, offset_in_ios)) { 1095 offset_in_ios = job->offset_in_ios++; 1096 if (job->offset_in_ios == job->size_in_ios) { 1097 job->offset_in_ios = 0; 1098 } 1099 } 1100 spdk_bit_array_set(job->outstanding, offset_in_ios); 1101 } 1102 } 1103 1104 /* For multi-thread to same job, offset_in_ios is relative 1105 * to the LBA range assigned for that job. job->offset_blocks 1106 * is absolute (entire bdev LBA range). 1107 */ 1108 task->offset_blocks = (offset_in_ios + job->ios_base) * job->io_size_blocks; 1109 1110 if (job->verify || job->reset) { 1111 generate_data(task->buf, job->buf_size, 1112 spdk_bdev_get_block_size(job->bdev), 1113 task->md_buf, spdk_bdev_get_md_size(job->bdev), 1114 job->io_size_blocks); 1115 if (g_zcopy) { 1116 bdevperf_prep_zcopy_write_task(task); 1117 return; 1118 } else { 1119 task->iov.iov_base = task->buf; 1120 task->iov.iov_len = job->buf_size; 1121 task->io_type = SPDK_BDEV_IO_TYPE_WRITE; 1122 } 1123 } else if (job->flush) { 1124 task->io_type = SPDK_BDEV_IO_TYPE_FLUSH; 1125 } else if (job->unmap) { 1126 task->io_type = SPDK_BDEV_IO_TYPE_UNMAP; 1127 } else if (job->write_zeroes) { 1128 task->io_type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 1129 } else if ((job->rw_percentage == 100) || 1130 (job->rw_percentage != 0 && ((rand_r(&job->seed) % 100) < job->rw_percentage))) { 1131 task->io_type = SPDK_BDEV_IO_TYPE_READ; 1132 } else { 1133 if (g_zcopy) { 1134 bdevperf_prep_zcopy_write_task(task); 1135 return; 1136 } else { 1137 task->iov.iov_base = task->buf; 1138 task->iov.iov_len = job->buf_size; 1139 task->io_type = SPDK_BDEV_IO_TYPE_WRITE; 1140 } 1141 } 1142 1143 bdevperf_submit_task(task); 1144 } 1145 1146 static int reset_job(void *arg); 1147 1148 static void 1149 reset_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 1150 { 1151 struct bdevperf_task *task = cb_arg; 1152 struct bdevperf_job *job = task->job; 1153 1154 if (!success) { 1155 printf("Reset blockdev=%s failed\n", spdk_bdev_get_name(job->bdev)); 1156 bdevperf_job_drain(job); 1157 g_run_rc = -1; 1158 } 1159 1160 TAILQ_INSERT_TAIL(&job->task_list, task, link); 1161 spdk_bdev_free_io(bdev_io); 1162 1163 job->reset_timer = SPDK_POLLER_REGISTER(reset_job, job, 1164 10 * 1000000); 1165 } 1166 1167 static int 1168 reset_job(void *arg) 1169 { 1170 struct bdevperf_job *job = arg; 1171 struct bdevperf_task *task; 1172 int rc; 1173 1174 spdk_poller_unregister(&job->reset_timer); 1175 1176 /* Do reset. */ 1177 task = bdevperf_job_get_task(job); 1178 rc = spdk_bdev_reset(job->bdev_desc, job->ch, 1179 reset_cb, task); 1180 if (rc) { 1181 printf("Reset failed: %d\n", rc); 1182 bdevperf_job_drain(job); 1183 g_run_rc = -1; 1184 } 1185 1186 return -1; 1187 } 1188 1189 static void 1190 bdevperf_timeout_cb(void *cb_arg, struct spdk_bdev_io *bdev_io) 1191 { 1192 struct bdevperf_job *job = cb_arg; 1193 struct bdevperf_task *task; 1194 1195 job->io_timeout++; 1196 1197 if (job->is_draining || !job->abort || 1198 !spdk_bdev_io_type_supported(job->bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 1199 return; 1200 } 1201 1202 task = bdevperf_job_get_task(job); 1203 if (task == NULL) { 1204 return; 1205 } 1206 1207 task->task_to_abort = spdk_bdev_io_get_cb_arg(bdev_io); 1208 task->io_type = SPDK_BDEV_IO_TYPE_ABORT; 1209 1210 bdevperf_submit_task(task); 1211 } 1212 1213 static void 1214 bdevperf_job_run(void *ctx) 1215 { 1216 struct bdevperf_job *job = ctx; 1217 struct bdevperf_task *task; 1218 int i; 1219 1220 /* Submit initial I/O for this job. Each time one 1221 * completes, another will be submitted. */ 1222 1223 /* Start a timer to stop this I/O chain when the run is over */ 1224 job->run_timer = SPDK_POLLER_REGISTER(bdevperf_job_drain, job, g_time_in_usec); 1225 if (job->reset) { 1226 job->reset_timer = SPDK_POLLER_REGISTER(reset_job, job, 1227 10 * 1000000); 1228 } 1229 1230 spdk_bdev_set_timeout(job->bdev_desc, g_timeout_in_sec, bdevperf_timeout_cb, job); 1231 1232 for (i = 0; i < job->queue_depth; i++) { 1233 task = bdevperf_job_get_task(job); 1234 bdevperf_submit_single(job, task); 1235 } 1236 } 1237 1238 static void 1239 _performance_dump_done(void *ctx) 1240 { 1241 struct bdevperf_aggregate_stats *stats = ctx; 1242 double average_latency; 1243 1244 printf("\r ==================================================================================" 1245 "=================================\n"); 1246 printf("\r %-28s: %10s %10.2f %10.2f", 1247 "Total", "", stats->total_io_per_second, stats->total_mb_per_second); 1248 printf(" %10.2f %8.2f", 1249 stats->total_failed_per_second, stats->total_timeout_per_second); 1250 1251 average_latency = ((double)stats->total_tsc / stats->total_io_completed) * 1000 * 1000 / 1252 spdk_get_ticks_hz(); 1253 printf(" %10.2f %10.2f %10.2f\n", average_latency, stats->min_latency, stats->max_latency); 1254 printf("\n"); 1255 1256 fflush(stdout); 1257 1258 g_performance_dump_active = false; 1259 1260 free(stats); 1261 } 1262 1263 static void 1264 _performance_dump(void *ctx) 1265 { 1266 struct bdevperf_aggregate_stats *stats = ctx; 1267 1268 performance_dump_job(stats, stats->current_job); 1269 1270 /* This assumes the jobs list is static after start up time. 1271 * That's true right now, but if that ever changed this would need a lock. */ 1272 stats->current_job = TAILQ_NEXT(stats->current_job, link); 1273 if (stats->current_job == NULL) { 1274 spdk_thread_send_msg(g_main_thread, _performance_dump_done, stats); 1275 } else { 1276 spdk_thread_send_msg(stats->current_job->thread, _performance_dump, stats); 1277 } 1278 } 1279 1280 static int 1281 performance_statistics_thread(void *arg) 1282 { 1283 struct bdevperf_aggregate_stats *stats; 1284 1285 if (g_performance_dump_active) { 1286 return -1; 1287 } 1288 1289 g_performance_dump_active = true; 1290 1291 stats = calloc(1, sizeof(*stats)); 1292 if (stats == NULL) { 1293 return -1; 1294 } 1295 1296 stats->min_latency = (double)UINT64_MAX; 1297 1298 g_show_performance_period_num++; 1299 1300 stats->io_time_in_usec = g_show_performance_period_num * g_show_performance_period_in_usec; 1301 stats->ema_period = g_show_performance_ema_period; 1302 1303 /* Iterate all of the jobs to gather stats 1304 * These jobs will not get removed here until a final performance dump is run, 1305 * so this should be safe without locking. 1306 */ 1307 stats->current_job = TAILQ_FIRST(&g_bdevperf.jobs); 1308 if (stats->current_job == NULL) { 1309 spdk_thread_send_msg(g_main_thread, _performance_dump_done, stats); 1310 } else { 1311 spdk_thread_send_msg(stats->current_job->thread, _performance_dump, stats); 1312 } 1313 1314 return -1; 1315 } 1316 1317 static void 1318 bdevperf_test(void) 1319 { 1320 struct bdevperf_job *job; 1321 1322 printf("Running I/O for %" PRIu64 " seconds...\n", g_time_in_usec / 1000000); 1323 fflush(stdout); 1324 1325 /* Start a timer to dump performance numbers */ 1326 g_start_tsc = spdk_get_ticks(); 1327 if (g_show_performance_real_time && !g_perf_timer) { 1328 printf("%*s\n", 107, "Latency(us)"); 1329 printf("\r %-*s: %10s %10s %10s %10s %8s %10s %10s %10s\n", 1330 28, "Device Information", "runtime(s)", "IOPS", "MiB/s", "Fail/s", "TO/s", "Average", "min", "max"); 1331 1332 g_perf_timer = SPDK_POLLER_REGISTER(performance_statistics_thread, NULL, 1333 g_show_performance_period_in_usec); 1334 } 1335 1336 /* Iterate jobs to start all I/O */ 1337 TAILQ_FOREACH(job, &g_bdevperf.jobs, link) { 1338 g_bdevperf.running_jobs++; 1339 spdk_thread_send_msg(job->thread, bdevperf_job_run, job); 1340 } 1341 } 1342 1343 static void 1344 bdevperf_bdev_removed(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx) 1345 { 1346 struct bdevperf_job *job = event_ctx; 1347 1348 if (SPDK_BDEV_EVENT_REMOVE == type) { 1349 bdevperf_job_drain(job); 1350 } 1351 } 1352 1353 static void 1354 bdevperf_histogram_status_cb(void *cb_arg, int status) 1355 { 1356 if (status != 0) { 1357 g_run_rc = status; 1358 if (g_continue_on_failure == false) { 1359 g_error_to_exit = true; 1360 } 1361 } 1362 1363 if (--g_bdev_count == 0) { 1364 if (g_run_rc == 0) { 1365 /* Ready to run the test */ 1366 bdevperf_test(); 1367 } else { 1368 bdevperf_test_done(NULL); 1369 } 1370 } 1371 } 1372 1373 static uint32_t g_construct_job_count = 0; 1374 1375 static void 1376 _bdevperf_enable_histogram(bool enable) 1377 { 1378 struct spdk_bdev *bdev; 1379 /* increment initial g_bdev_count so that it will never reach 0 in the middle of iteration */ 1380 g_bdev_count = 1; 1381 1382 if (g_job_bdev_name != NULL) { 1383 bdev = spdk_bdev_get_by_name(g_job_bdev_name); 1384 if (bdev) { 1385 g_bdev_count++; 1386 1387 spdk_bdev_histogram_enable(bdev, bdevperf_histogram_status_cb, NULL, enable); 1388 } else { 1389 fprintf(stderr, "Unable to find bdev '%s'\n", g_job_bdev_name); 1390 } 1391 } else { 1392 bdev = spdk_bdev_first_leaf(); 1393 1394 while (bdev != NULL) { 1395 g_bdev_count++; 1396 1397 spdk_bdev_histogram_enable(bdev, bdevperf_histogram_status_cb, NULL, enable); 1398 bdev = spdk_bdev_next_leaf(bdev); 1399 } 1400 } 1401 1402 bdevperf_histogram_status_cb(NULL, 0); 1403 } 1404 1405 static void 1406 _bdevperf_construct_job_done(void *ctx) 1407 { 1408 if (--g_construct_job_count == 0) { 1409 if (g_run_rc != 0) { 1410 /* Something failed. */ 1411 bdevperf_test_done(NULL); 1412 return; 1413 } 1414 1415 /* always enable histogram. */ 1416 _bdevperf_enable_histogram(true); 1417 } else if (g_run_rc != 0) { 1418 /* Reset error as some jobs constructed right */ 1419 g_run_rc = 0; 1420 if (g_continue_on_failure == false) { 1421 g_error_to_exit = true; 1422 } 1423 } 1424 } 1425 1426 /* Checkformat will not allow to use inlined type, 1427 this is a workaround */ 1428 typedef struct spdk_thread *spdk_thread_t; 1429 1430 static spdk_thread_t 1431 construct_job_thread(struct spdk_cpuset *cpumask, const char *tag) 1432 { 1433 struct spdk_cpuset tmp; 1434 1435 /* This function runs on the main thread. */ 1436 assert(g_main_thread == spdk_get_thread()); 1437 1438 /* Handle default mask */ 1439 if (spdk_cpuset_count(cpumask) == 0) { 1440 cpumask = &g_all_cpuset; 1441 } 1442 1443 /* Warn user that mask might need to be changed */ 1444 spdk_cpuset_copy(&tmp, cpumask); 1445 spdk_cpuset_or(&tmp, &g_all_cpuset); 1446 if (!spdk_cpuset_equal(&tmp, &g_all_cpuset)) { 1447 fprintf(stderr, "cpumask for '%s' is too big\n", tag); 1448 } 1449 1450 return spdk_thread_create(tag, cpumask); 1451 } 1452 1453 static uint32_t 1454 _get_next_core(void) 1455 { 1456 static uint32_t current_core = SPDK_ENV_LCORE_ID_ANY; 1457 1458 if (current_core == SPDK_ENV_LCORE_ID_ANY) { 1459 current_core = spdk_env_get_first_core(); 1460 return current_core; 1461 } 1462 1463 current_core = spdk_env_get_next_core(current_core); 1464 if (current_core == SPDK_ENV_LCORE_ID_ANY) { 1465 current_core = spdk_env_get_first_core(); 1466 } 1467 1468 return current_core; 1469 } 1470 1471 static void 1472 _bdevperf_construct_job(void *ctx) 1473 { 1474 struct bdevperf_job *job = ctx; 1475 int rc; 1476 1477 rc = spdk_bdev_open_ext(spdk_bdev_get_name(job->bdev), true, bdevperf_bdev_removed, job, 1478 &job->bdev_desc); 1479 if (rc != 0) { 1480 SPDK_ERRLOG("Could not open leaf bdev %s, error=%d\n", spdk_bdev_get_name(job->bdev), rc); 1481 g_run_rc = -EINVAL; 1482 goto end; 1483 } 1484 1485 if (g_zcopy) { 1486 if (!spdk_bdev_io_type_supported(job->bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 1487 printf("Test requires ZCOPY but bdev module does not support ZCOPY\n"); 1488 g_run_rc = -ENOTSUP; 1489 goto end; 1490 } 1491 } 1492 1493 job->ch = spdk_bdev_get_io_channel(job->bdev_desc); 1494 if (!job->ch) { 1495 SPDK_ERRLOG("Could not get io_channel for device %s, error=%d\n", spdk_bdev_get_name(job->bdev), 1496 rc); 1497 spdk_bdev_close(job->bdev_desc); 1498 TAILQ_REMOVE(&g_bdevperf.jobs, job, link); 1499 g_run_rc = -ENOMEM; 1500 goto end; 1501 } 1502 1503 end: 1504 spdk_thread_send_msg(g_main_thread, _bdevperf_construct_job_done, NULL); 1505 } 1506 1507 static void 1508 job_init_rw(struct bdevperf_job *job, enum job_config_rw rw) 1509 { 1510 switch (rw) { 1511 case JOB_CONFIG_RW_READ: 1512 job->rw_percentage = 100; 1513 break; 1514 case JOB_CONFIG_RW_WRITE: 1515 job->rw_percentage = 0; 1516 break; 1517 case JOB_CONFIG_RW_RANDREAD: 1518 job->is_random = true; 1519 job->rw_percentage = 100; 1520 job->seed = rand(); 1521 break; 1522 case JOB_CONFIG_RW_RANDWRITE: 1523 job->is_random = true; 1524 job->rw_percentage = 0; 1525 job->seed = rand(); 1526 break; 1527 case JOB_CONFIG_RW_RW: 1528 job->is_random = false; 1529 break; 1530 case JOB_CONFIG_RW_RANDRW: 1531 job->is_random = true; 1532 job->seed = rand(); 1533 break; 1534 case JOB_CONFIG_RW_VERIFY: 1535 job->verify = true; 1536 job->rw_percentage = 50; 1537 break; 1538 case JOB_CONFIG_RW_RESET: 1539 job->reset = true; 1540 job->verify = true; 1541 job->rw_percentage = 50; 1542 break; 1543 case JOB_CONFIG_RW_UNMAP: 1544 job->unmap = true; 1545 break; 1546 case JOB_CONFIG_RW_FLUSH: 1547 job->flush = true; 1548 break; 1549 case JOB_CONFIG_RW_WRITE_ZEROES: 1550 job->write_zeroes = true; 1551 break; 1552 } 1553 } 1554 1555 static int 1556 bdevperf_construct_job(struct spdk_bdev *bdev, struct job_config *config, 1557 struct spdk_thread *thread) 1558 { 1559 struct bdevperf_job *job; 1560 struct bdevperf_task *task; 1561 int block_size, data_block_size; 1562 int rc; 1563 int task_num, n; 1564 1565 block_size = spdk_bdev_get_block_size(bdev); 1566 data_block_size = spdk_bdev_get_data_block_size(bdev); 1567 1568 job = calloc(1, sizeof(struct bdevperf_job)); 1569 if (!job) { 1570 fprintf(stderr, "Unable to allocate memory for new job.\n"); 1571 return -ENOMEM; 1572 } 1573 1574 job->name = strdup(spdk_bdev_get_name(bdev)); 1575 if (!job->name) { 1576 fprintf(stderr, "Unable to allocate memory for job name.\n"); 1577 bdevperf_job_free(job); 1578 return -ENOMEM; 1579 } 1580 1581 job->workload_type = g_workload_type; 1582 job->io_size = config->bs; 1583 job->rw_percentage = config->rwmixread; 1584 job->continue_on_failure = g_continue_on_failure; 1585 job->queue_depth = config->iodepth; 1586 job->bdev = bdev; 1587 job->io_size_blocks = job->io_size / data_block_size; 1588 job->buf_size = job->io_size_blocks * block_size; 1589 job->abort = g_abort; 1590 job_init_rw(job, config->rw); 1591 1592 if ((job->io_size % data_block_size) != 0) { 1593 SPDK_ERRLOG("IO size (%d) is not multiples of data block size of bdev %s (%"PRIu32")\n", 1594 job->io_size, spdk_bdev_get_name(bdev), data_block_size); 1595 bdevperf_job_free(job); 1596 return -ENOTSUP; 1597 } 1598 1599 if (job->unmap && !spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { 1600 printf("Skipping %s because it does not support unmap\n", spdk_bdev_get_name(bdev)); 1601 bdevperf_job_free(job); 1602 return -ENOTSUP; 1603 } 1604 1605 if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_REFTAG)) { 1606 job->dif_check_flags |= SPDK_DIF_FLAGS_REFTAG_CHECK; 1607 } 1608 if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_GUARD)) { 1609 job->dif_check_flags |= SPDK_DIF_FLAGS_GUARD_CHECK; 1610 } 1611 1612 job->offset_in_ios = 0; 1613 1614 if (config->length != 0) { 1615 /* Use subset of disk */ 1616 job->size_in_ios = config->length / job->io_size_blocks; 1617 job->ios_base = config->offset / job->io_size_blocks; 1618 } else { 1619 /* Use whole disk */ 1620 job->size_in_ios = spdk_bdev_get_num_blocks(bdev) / job->io_size_blocks; 1621 job->ios_base = 0; 1622 } 1623 1624 if (job->is_random && g_zipf_theta > 0) { 1625 job->zipf = spdk_zipf_create(job->size_in_ios, g_zipf_theta, 0); 1626 } 1627 1628 if (job->verify) { 1629 job->outstanding = spdk_bit_array_create(job->size_in_ios); 1630 if (job->outstanding == NULL) { 1631 SPDK_ERRLOG("Could not create outstanding array bitmap for bdev %s\n", 1632 spdk_bdev_get_name(bdev)); 1633 bdevperf_job_free(job); 1634 return -ENOMEM; 1635 } 1636 } 1637 1638 job->histogram = spdk_histogram_data_alloc(); 1639 if (job->histogram == NULL) { 1640 fprintf(stderr, "Failed to allocate histogram\n"); 1641 bdevperf_job_free(job); 1642 return -ENOMEM; 1643 } 1644 1645 TAILQ_INIT(&job->task_list); 1646 1647 task_num = job->queue_depth; 1648 if (job->reset) { 1649 task_num += 1; 1650 } 1651 if (job->abort) { 1652 task_num += job->queue_depth; 1653 } 1654 1655 TAILQ_INSERT_TAIL(&g_bdevperf.jobs, job, link); 1656 1657 for (n = 0; n < task_num; n++) { 1658 task = calloc(1, sizeof(struct bdevperf_task)); 1659 if (!task) { 1660 fprintf(stderr, "Failed to allocate task from memory\n"); 1661 return -ENOMEM; 1662 } 1663 1664 task->buf = spdk_zmalloc(job->buf_size, spdk_bdev_get_buf_align(job->bdev), NULL, 1665 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1666 if (!task->buf) { 1667 fprintf(stderr, "Cannot allocate buf for task=%p\n", task); 1668 free(task); 1669 return -ENOMEM; 1670 } 1671 1672 if (spdk_bdev_is_md_separate(job->bdev)) { 1673 task->md_buf = spdk_zmalloc(job->io_size_blocks * 1674 spdk_bdev_get_md_size(job->bdev), 0, NULL, 1675 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1676 if (!task->md_buf) { 1677 fprintf(stderr, "Cannot allocate md buf for task=%p\n", task); 1678 spdk_free(task->buf); 1679 free(task); 1680 return -ENOMEM; 1681 } 1682 } 1683 1684 task->job = job; 1685 TAILQ_INSERT_TAIL(&job->task_list, task, link); 1686 } 1687 1688 job->thread = thread; 1689 1690 g_construct_job_count++; 1691 1692 rc = spdk_thread_send_msg(thread, _bdevperf_construct_job, job); 1693 assert(rc == 0); 1694 1695 return rc; 1696 } 1697 1698 static int 1699 parse_rw(const char *str, enum job_config_rw ret) 1700 { 1701 if (str == NULL) { 1702 return ret; 1703 } 1704 1705 if (!strcmp(str, "read")) { 1706 ret = JOB_CONFIG_RW_READ; 1707 } else if (!strcmp(str, "randread")) { 1708 ret = JOB_CONFIG_RW_RANDREAD; 1709 } else if (!strcmp(str, "write")) { 1710 ret = JOB_CONFIG_RW_WRITE; 1711 } else if (!strcmp(str, "randwrite")) { 1712 ret = JOB_CONFIG_RW_RANDWRITE; 1713 } else if (!strcmp(str, "verify")) { 1714 ret = JOB_CONFIG_RW_VERIFY; 1715 } else if (!strcmp(str, "reset")) { 1716 ret = JOB_CONFIG_RW_RESET; 1717 } else if (!strcmp(str, "unmap")) { 1718 ret = JOB_CONFIG_RW_UNMAP; 1719 } else if (!strcmp(str, "write_zeroes")) { 1720 ret = JOB_CONFIG_RW_WRITE_ZEROES; 1721 } else if (!strcmp(str, "flush")) { 1722 ret = JOB_CONFIG_RW_FLUSH; 1723 } else if (!strcmp(str, "rw")) { 1724 ret = JOB_CONFIG_RW_RW; 1725 } else if (!strcmp(str, "randrw")) { 1726 ret = JOB_CONFIG_RW_RANDRW; 1727 } else { 1728 fprintf(stderr, "rw must be one of\n" 1729 "(read, write, randread, randwrite, rw, randrw, verify, reset, unmap, flush)\n"); 1730 ret = BDEVPERF_CONFIG_ERROR; 1731 } 1732 1733 return ret; 1734 } 1735 1736 static const char * 1737 config_filename_next(const char *filename, char *out) 1738 { 1739 int i, k; 1740 1741 if (filename == NULL) { 1742 out[0] = '\0'; 1743 return NULL; 1744 } 1745 1746 if (filename[0] == ':') { 1747 filename++; 1748 } 1749 1750 for (i = 0, k = 0; 1751 filename[i] != '\0' && 1752 filename[i] != ':' && 1753 i < BDEVPERF_CONFIG_MAX_FILENAME; 1754 i++) { 1755 if (filename[i] == ' ' || filename[i] == '\t') { 1756 continue; 1757 } 1758 1759 out[k++] = filename[i]; 1760 } 1761 out[k] = 0; 1762 1763 return filename + i; 1764 } 1765 1766 static void 1767 bdevperf_construct_jobs(void) 1768 { 1769 char filename[BDEVPERF_CONFIG_MAX_FILENAME]; 1770 struct spdk_thread *thread; 1771 struct job_config *config; 1772 struct spdk_bdev *bdev; 1773 const char *filenames; 1774 int rc; 1775 1776 TAILQ_FOREACH(config, &job_config_list, link) { 1777 filenames = config->filename; 1778 1779 thread = construct_job_thread(&config->cpumask, config->name); 1780 assert(thread); 1781 1782 while (filenames) { 1783 filenames = config_filename_next(filenames, filename); 1784 if (strlen(filename) == 0) { 1785 break; 1786 } 1787 1788 bdev = spdk_bdev_get_by_name(filename); 1789 if (!bdev) { 1790 fprintf(stderr, "Unable to find bdev '%s'\n", filename); 1791 g_run_rc = -EINVAL; 1792 return; 1793 } 1794 1795 rc = bdevperf_construct_job(bdev, config, thread); 1796 if (rc < 0) { 1797 g_run_rc = rc; 1798 return; 1799 } 1800 } 1801 } 1802 } 1803 1804 static int 1805 make_cli_job_config(const char *filename, int64_t offset, uint64_t range) 1806 { 1807 struct job_config *config = calloc(1, sizeof(*config)); 1808 1809 if (config == NULL) { 1810 fprintf(stderr, "Unable to allocate memory for job config\n"); 1811 return -ENOMEM; 1812 } 1813 1814 config->name = filename; 1815 config->filename = filename; 1816 spdk_cpuset_zero(&config->cpumask); 1817 spdk_cpuset_set_cpu(&config->cpumask, _get_next_core(), true); 1818 config->bs = g_io_size; 1819 config->iodepth = g_queue_depth; 1820 config->rwmixread = g_rw_percentage; 1821 config->offset = offset; 1822 config->length = range; 1823 config->rw = parse_rw(g_workload_type, BDEVPERF_CONFIG_ERROR); 1824 if ((int)config->rw == BDEVPERF_CONFIG_ERROR) { 1825 free(config); 1826 return -EINVAL; 1827 } 1828 1829 TAILQ_INSERT_TAIL(&job_config_list, config, link); 1830 return 0; 1831 } 1832 1833 static void 1834 bdevperf_construct_multithread_job_configs(void) 1835 { 1836 struct spdk_bdev *bdev; 1837 uint32_t i; 1838 uint32_t num_cores; 1839 uint64_t blocks_per_job; 1840 int64_t offset; 1841 1842 num_cores = 0; 1843 SPDK_ENV_FOREACH_CORE(i) { 1844 num_cores++; 1845 } 1846 1847 if (num_cores == 0) { 1848 g_run_rc = -EINVAL; 1849 return; 1850 } 1851 1852 if (g_job_bdev_name != NULL) { 1853 bdev = spdk_bdev_get_by_name(g_job_bdev_name); 1854 if (!bdev) { 1855 fprintf(stderr, "Unable to find bdev '%s'\n", g_job_bdev_name); 1856 return; 1857 } 1858 1859 blocks_per_job = spdk_bdev_get_num_blocks(bdev) / num_cores; 1860 offset = 0; 1861 1862 SPDK_ENV_FOREACH_CORE(i) { 1863 g_run_rc = make_cli_job_config(g_job_bdev_name, offset, blocks_per_job); 1864 if (g_run_rc) { 1865 return; 1866 } 1867 1868 offset += blocks_per_job; 1869 } 1870 } else { 1871 bdev = spdk_bdev_first_leaf(); 1872 while (bdev != NULL) { 1873 blocks_per_job = spdk_bdev_get_num_blocks(bdev) / num_cores; 1874 offset = 0; 1875 1876 SPDK_ENV_FOREACH_CORE(i) { 1877 g_run_rc = make_cli_job_config(spdk_bdev_get_name(bdev), 1878 offset, blocks_per_job); 1879 if (g_run_rc) { 1880 return; 1881 } 1882 1883 offset += blocks_per_job; 1884 } 1885 1886 bdev = spdk_bdev_next_leaf(bdev); 1887 } 1888 } 1889 } 1890 1891 static void 1892 bdevperf_construct_job_configs(void) 1893 { 1894 struct spdk_bdev *bdev; 1895 1896 /* There are three different modes for allocating jobs. Standard mode 1897 * (the default) creates one spdk_thread per bdev and runs the I/O job there. 1898 * 1899 * The -C flag places bdevperf into "multithread" mode, meaning it creates 1900 * one spdk_thread per bdev PER CORE, and runs a copy of the job on each. 1901 * This runs multiple threads per bdev, effectively. 1902 * 1903 * The -j flag implies "FIO" mode which tries to mimic semantic of FIO jobs. 1904 * In "FIO" mode, threads are spawned per-job instead of per-bdev. 1905 * Each FIO job can be individually parameterized by filename, cpu mask, etc, 1906 * which is different from other modes in that they only support global options. 1907 */ 1908 1909 if (g_bdevperf_conf) { 1910 goto end; 1911 } else if (g_multithread_mode) { 1912 bdevperf_construct_multithread_job_configs(); 1913 goto end; 1914 } 1915 1916 if (g_job_bdev_name != NULL) { 1917 bdev = spdk_bdev_get_by_name(g_job_bdev_name); 1918 if (bdev) { 1919 /* Construct the job */ 1920 g_run_rc = make_cli_job_config(g_job_bdev_name, 0, 0); 1921 } else { 1922 fprintf(stderr, "Unable to find bdev '%s'\n", g_job_bdev_name); 1923 } 1924 } else { 1925 bdev = spdk_bdev_first_leaf(); 1926 1927 while (bdev != NULL) { 1928 /* Construct the job */ 1929 g_run_rc = make_cli_job_config(spdk_bdev_get_name(bdev), 0, 0); 1930 if (g_run_rc) { 1931 break; 1932 } 1933 1934 bdev = spdk_bdev_next_leaf(bdev); 1935 } 1936 } 1937 1938 end: 1939 /* Increment initial construct_jobs count so that it will never reach 0 in the middle 1940 * of iteration. 1941 */ 1942 g_construct_job_count = 1; 1943 1944 if (g_run_rc == 0) { 1945 bdevperf_construct_jobs(); 1946 } 1947 1948 _bdevperf_construct_job_done(NULL); 1949 } 1950 1951 static int 1952 parse_uint_option(struct spdk_conf_section *s, const char *name, int def) 1953 { 1954 const char *job_name; 1955 int tmp; 1956 1957 tmp = spdk_conf_section_get_intval(s, name); 1958 if (tmp == -1) { 1959 /* Field was not found. Check default value 1960 * In [global] section it is ok to have undefined values 1961 * but for other sections it is not ok */ 1962 if (def == BDEVPERF_CONFIG_UNDEFINED) { 1963 job_name = spdk_conf_section_get_name(s); 1964 if (strcmp(job_name, "global") == 0) { 1965 return def; 1966 } 1967 1968 fprintf(stderr, 1969 "Job '%s' has no '%s' assigned\n", 1970 job_name, name); 1971 return BDEVPERF_CONFIG_ERROR; 1972 } 1973 return def; 1974 } 1975 1976 /* NOTE: get_intval returns nonnegative on success */ 1977 if (tmp < 0) { 1978 fprintf(stderr, "Job '%s' has bad '%s' value.\n", 1979 spdk_conf_section_get_name(s), name); 1980 return BDEVPERF_CONFIG_ERROR; 1981 } 1982 1983 return tmp; 1984 } 1985 1986 /* CLI arguments override parameters for global sections */ 1987 static void 1988 config_set_cli_args(struct job_config *config) 1989 { 1990 if (g_job_bdev_name) { 1991 config->filename = g_job_bdev_name; 1992 } 1993 if (g_io_size > 0) { 1994 config->bs = g_io_size; 1995 } 1996 if (g_queue_depth > 0) { 1997 config->iodepth = g_queue_depth; 1998 } 1999 if (g_rw_percentage > 0) { 2000 config->rwmixread = g_rw_percentage; 2001 } 2002 if (g_workload_type) { 2003 config->rw = parse_rw(g_workload_type, config->rw); 2004 } 2005 } 2006 2007 static int 2008 read_job_config(void) 2009 { 2010 struct job_config global_default_config; 2011 struct job_config global_config; 2012 struct spdk_conf_section *s; 2013 struct job_config *config; 2014 const char *cpumask; 2015 const char *rw; 2016 bool is_global; 2017 int n = 0; 2018 int val; 2019 2020 if (g_bdevperf_conf_file == NULL) { 2021 return 0; 2022 } 2023 2024 g_bdevperf_conf = spdk_conf_allocate(); 2025 if (g_bdevperf_conf == NULL) { 2026 fprintf(stderr, "Could not allocate job config structure\n"); 2027 return 1; 2028 } 2029 2030 spdk_conf_disable_sections_merge(g_bdevperf_conf); 2031 if (spdk_conf_read(g_bdevperf_conf, g_bdevperf_conf_file)) { 2032 fprintf(stderr, "Invalid job config"); 2033 return 1; 2034 } 2035 2036 /* Initialize global defaults */ 2037 global_default_config.filename = NULL; 2038 /* Zero mask is the same as g_all_cpuset 2039 * The g_all_cpuset is not initialized yet, 2040 * so use zero mask as the default instead */ 2041 spdk_cpuset_zero(&global_default_config.cpumask); 2042 global_default_config.bs = BDEVPERF_CONFIG_UNDEFINED; 2043 global_default_config.iodepth = BDEVPERF_CONFIG_UNDEFINED; 2044 /* bdevperf has no default for -M option but in FIO the default is 50 */ 2045 global_default_config.rwmixread = 50; 2046 global_default_config.offset = 0; 2047 /* length 0 means 100% */ 2048 global_default_config.length = 0; 2049 global_default_config.rw = BDEVPERF_CONFIG_UNDEFINED; 2050 config_set_cli_args(&global_default_config); 2051 2052 if ((int)global_default_config.rw == BDEVPERF_CONFIG_ERROR) { 2053 return 1; 2054 } 2055 2056 /* There is only a single instance of global job_config 2057 * We just reset its value when we encounter new [global] section */ 2058 global_config = global_default_config; 2059 2060 for (s = spdk_conf_first_section(g_bdevperf_conf); 2061 s != NULL; 2062 s = spdk_conf_next_section(s)) { 2063 config = calloc(1, sizeof(*config)); 2064 if (config == NULL) { 2065 fprintf(stderr, "Unable to allocate memory for job config\n"); 2066 return 1; 2067 } 2068 2069 config->name = spdk_conf_section_get_name(s); 2070 is_global = strcmp(config->name, "global") == 0; 2071 2072 if (is_global) { 2073 global_config = global_default_config; 2074 } 2075 2076 config->filename = spdk_conf_section_get_val(s, "filename"); 2077 if (config->filename == NULL) { 2078 config->filename = global_config.filename; 2079 } 2080 if (!is_global) { 2081 if (config->filename == NULL) { 2082 fprintf(stderr, "Job '%s' expects 'filename' parameter\n", config->name); 2083 goto error; 2084 } else if (strnlen(config->filename, BDEVPERF_CONFIG_MAX_FILENAME) 2085 >= BDEVPERF_CONFIG_MAX_FILENAME) { 2086 fprintf(stderr, 2087 "filename for '%s' job is too long. Max length is %d\n", 2088 config->name, BDEVPERF_CONFIG_MAX_FILENAME); 2089 goto error; 2090 } 2091 } 2092 2093 cpumask = spdk_conf_section_get_val(s, "cpumask"); 2094 if (cpumask == NULL) { 2095 config->cpumask = global_config.cpumask; 2096 } else if (spdk_cpuset_parse(&config->cpumask, cpumask)) { 2097 fprintf(stderr, "Job '%s' has bad 'cpumask' value\n", config->name); 2098 goto error; 2099 } 2100 2101 config->bs = parse_uint_option(s, "bs", global_config.bs); 2102 if (config->bs == BDEVPERF_CONFIG_ERROR) { 2103 goto error; 2104 } else if (config->bs == 0) { 2105 fprintf(stderr, "'bs' of job '%s' must be greater than 0\n", config->name); 2106 goto error; 2107 } 2108 2109 config->iodepth = parse_uint_option(s, "iodepth", global_config.iodepth); 2110 if (config->iodepth == BDEVPERF_CONFIG_ERROR) { 2111 goto error; 2112 } else if (config->iodepth == 0) { 2113 fprintf(stderr, 2114 "'iodepth' of job '%s' must be greater than 0\n", 2115 config->name); 2116 goto error; 2117 } 2118 2119 config->rwmixread = parse_uint_option(s, "rwmixread", global_config.rwmixread); 2120 if (config->rwmixread == BDEVPERF_CONFIG_ERROR) { 2121 goto error; 2122 } else if (config->rwmixread > 100) { 2123 fprintf(stderr, 2124 "'rwmixread' value of '%s' job is not in 0-100 range\n", 2125 config->name); 2126 goto error; 2127 } 2128 2129 config->offset = parse_uint_option(s, "offset", global_config.offset); 2130 if (config->offset == BDEVPERF_CONFIG_ERROR) { 2131 goto error; 2132 } 2133 2134 val = parse_uint_option(s, "length", global_config.length); 2135 if (val == BDEVPERF_CONFIG_ERROR) { 2136 goto error; 2137 } 2138 config->length = val; 2139 2140 rw = spdk_conf_section_get_val(s, "rw"); 2141 config->rw = parse_rw(rw, global_config.rw); 2142 if ((int)config->rw == BDEVPERF_CONFIG_ERROR) { 2143 fprintf(stderr, "Job '%s' has bad 'rw' value\n", config->name); 2144 goto error; 2145 } else if (!is_global && (int)config->rw == BDEVPERF_CONFIG_UNDEFINED) { 2146 fprintf(stderr, "Job '%s' has no 'rw' assigned\n", config->name); 2147 goto error; 2148 } 2149 2150 if (is_global) { 2151 config_set_cli_args(config); 2152 global_config = *config; 2153 free(config); 2154 } else { 2155 TAILQ_INSERT_TAIL(&job_config_list, config, link); 2156 n++; 2157 } 2158 } 2159 2160 printf("Using job config with %d jobs\n", n); 2161 return 0; 2162 error: 2163 free(config); 2164 return 1; 2165 } 2166 2167 static void 2168 bdevperf_run(void *arg1) 2169 { 2170 uint32_t i; 2171 2172 g_main_thread = spdk_get_thread(); 2173 2174 spdk_cpuset_zero(&g_all_cpuset); 2175 SPDK_ENV_FOREACH_CORE(i) { 2176 spdk_cpuset_set_cpu(&g_all_cpuset, i, true); 2177 } 2178 2179 if (g_wait_for_tests) { 2180 /* Do not perform any tests until RPC is received */ 2181 return; 2182 } 2183 2184 bdevperf_construct_job_configs(); 2185 } 2186 2187 static void 2188 rpc_perform_tests_cb(void) 2189 { 2190 struct spdk_json_write_ctx *w; 2191 struct spdk_jsonrpc_request *request = g_request; 2192 2193 g_request = NULL; 2194 2195 if (g_run_rc == 0) { 2196 w = spdk_jsonrpc_begin_result(request); 2197 spdk_json_write_uint32(w, g_run_rc); 2198 spdk_jsonrpc_end_result(request, w); 2199 } else { 2200 spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, 2201 "bdevperf failed with error %s", spdk_strerror(-g_run_rc)); 2202 } 2203 2204 /* Reset g_run_rc to 0 for the next test run. */ 2205 g_run_rc = 0; 2206 2207 /* Reset g_stats to 0 for the next test run. */ 2208 memset(&g_stats, 0, sizeof(g_stats)); 2209 } 2210 2211 static void 2212 rpc_perform_tests(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) 2213 { 2214 if (params != NULL) { 2215 spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, 2216 "perform_tests method requires no parameters"); 2217 return; 2218 } 2219 if (g_request != NULL) { 2220 fprintf(stderr, "Another test is already in progress.\n"); 2221 spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, 2222 spdk_strerror(-EINPROGRESS)); 2223 return; 2224 } 2225 g_request = request; 2226 2227 /* Only construct job configs at the first test run. */ 2228 if (TAILQ_EMPTY(&job_config_list)) { 2229 bdevperf_construct_job_configs(); 2230 } else { 2231 bdevperf_construct_jobs(); 2232 } 2233 } 2234 SPDK_RPC_REGISTER("perform_tests", rpc_perform_tests, SPDK_RPC_RUNTIME) 2235 2236 static void 2237 _bdevperf_job_drain(void *ctx) 2238 { 2239 bdevperf_job_drain(ctx); 2240 } 2241 2242 static void 2243 spdk_bdevperf_shutdown_cb(void) 2244 { 2245 g_shutdown = true; 2246 struct bdevperf_job *job, *tmp; 2247 2248 if (g_bdevperf.running_jobs == 0) { 2249 bdevperf_test_done(NULL); 2250 return; 2251 } 2252 2253 /* Iterate jobs to stop all I/O */ 2254 TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, tmp) { 2255 spdk_thread_send_msg(job->thread, _bdevperf_job_drain, job); 2256 } 2257 } 2258 2259 static int 2260 bdevperf_parse_arg(int ch, char *arg) 2261 { 2262 long long tmp; 2263 2264 if (ch == 'w') { 2265 g_workload_type = optarg; 2266 } else if (ch == 'T') { 2267 g_job_bdev_name = optarg; 2268 } else if (ch == 'z') { 2269 g_wait_for_tests = true; 2270 } else if (ch == 'Z') { 2271 g_zcopy = true; 2272 } else if (ch == 'X') { 2273 g_abort = true; 2274 } else if (ch == 'C') { 2275 g_multithread_mode = true; 2276 } else if (ch == 'f') { 2277 g_continue_on_failure = true; 2278 } else if (ch == 'j') { 2279 g_bdevperf_conf_file = optarg; 2280 } else if (ch == 'F') { 2281 char *endptr; 2282 2283 errno = 0; 2284 g_zipf_theta = strtod(optarg, &endptr); 2285 if (errno || optarg == endptr || g_zipf_theta < 0) { 2286 fprintf(stderr, "Illegal zipf theta value %s\n", optarg); 2287 return -EINVAL; 2288 } 2289 } else if (ch == 'l') { 2290 g_latency_display_level++; 2291 } else { 2292 tmp = spdk_strtoll(optarg, 10); 2293 if (tmp < 0) { 2294 fprintf(stderr, "Parse failed for the option %c.\n", ch); 2295 return tmp; 2296 } else if (tmp >= INT_MAX) { 2297 fprintf(stderr, "Parsed option was too large %c.\n", ch); 2298 return -ERANGE; 2299 } 2300 2301 switch (ch) { 2302 case 'q': 2303 g_queue_depth = tmp; 2304 break; 2305 case 'o': 2306 g_io_size = tmp; 2307 break; 2308 case 't': 2309 g_time_in_sec = tmp; 2310 break; 2311 case 'k': 2312 g_timeout_in_sec = tmp; 2313 break; 2314 case 'M': 2315 g_rw_percentage = tmp; 2316 g_mix_specified = true; 2317 break; 2318 case 'P': 2319 g_show_performance_ema_period = tmp; 2320 break; 2321 case 'S': 2322 g_show_performance_real_time = 1; 2323 g_show_performance_period_in_usec = tmp * 1000000; 2324 break; 2325 default: 2326 return -EINVAL; 2327 } 2328 } 2329 return 0; 2330 } 2331 2332 static void 2333 bdevperf_usage(void) 2334 { 2335 printf(" -q <depth> io depth\n"); 2336 printf(" -o <size> io size in bytes\n"); 2337 printf(" -w <type> io pattern type, must be one of (read, write, randread, randwrite, rw, randrw, verify, reset, unmap, flush)\n"); 2338 printf(" -t <time> time in seconds\n"); 2339 printf(" -k <timeout> timeout in seconds to detect starved I/O (default is 0 and disabled)\n"); 2340 printf(" -M <percent> rwmixread (100 for reads, 0 for writes)\n"); 2341 printf(" -P <num> number of moving average period\n"); 2342 printf("\t\t(If set to n, show weighted mean of the previous n IO/s in real time)\n"); 2343 printf("\t\t(Formula: M = 2 / (n + 1), EMA[i+1] = IO/s * M + (1 - M) * EMA[i])\n"); 2344 printf("\t\t(only valid with -S)\n"); 2345 printf(" -S <period> show performance result in real time every <period> seconds\n"); 2346 printf(" -T <bdev> bdev to run against. Default: all available bdevs.\n"); 2347 printf(" -f continue processing I/O even after failures\n"); 2348 printf(" -F <zipf theta> use zipf distribution for random I/O\n"); 2349 printf(" -Z enable using zcopy bdev API for read or write I/O\n"); 2350 printf(" -z start bdevperf, but wait for RPC to start tests\n"); 2351 printf(" -X abort timed out I/O\n"); 2352 printf(" -C enable every core to send I/Os to each bdev\n"); 2353 printf(" -j <filename> use job config file\n"); 2354 printf(" -l display latency histogram, default: disable. -l display summary, -ll display details\n"); 2355 } 2356 2357 static int 2358 verify_test_params(struct spdk_app_opts *opts) 2359 { 2360 /* When RPC is used for starting tests and 2361 * no rpc_addr was configured for the app, 2362 * use the default address. */ 2363 if (g_wait_for_tests && opts->rpc_addr == NULL) { 2364 opts->rpc_addr = SPDK_DEFAULT_RPC_ADDR; 2365 } 2366 2367 if (!g_bdevperf_conf_file && g_queue_depth <= 0) { 2368 goto out; 2369 } 2370 if (!g_bdevperf_conf_file && g_io_size <= 0) { 2371 goto out; 2372 } 2373 if (!g_bdevperf_conf_file && !g_workload_type) { 2374 goto out; 2375 } 2376 if (g_time_in_sec <= 0) { 2377 goto out; 2378 } 2379 g_time_in_usec = g_time_in_sec * 1000000LL; 2380 2381 if (g_timeout_in_sec < 0) { 2382 goto out; 2383 } 2384 2385 if (g_abort && !g_timeout_in_sec) { 2386 printf("Timeout must be set for abort option, Ignoring g_abort\n"); 2387 } 2388 2389 if (g_show_performance_ema_period > 0 && 2390 g_show_performance_real_time == 0) { 2391 fprintf(stderr, "-P option must be specified with -S option\n"); 2392 return 1; 2393 } 2394 2395 if (g_io_size > SPDK_BDEV_LARGE_BUF_MAX_SIZE) { 2396 printf("I/O size of %d is greater than zero copy threshold (%d).\n", 2397 g_io_size, SPDK_BDEV_LARGE_BUF_MAX_SIZE); 2398 printf("Zero copy mechanism will not be used.\n"); 2399 g_zcopy = false; 2400 } 2401 2402 if (g_bdevperf_conf_file) { 2403 /* workload_type verification happens during config file parsing */ 2404 return 0; 2405 } 2406 2407 if (!strcmp(g_workload_type, "verify") || 2408 !strcmp(g_workload_type, "reset")) { 2409 g_rw_percentage = 50; 2410 if (g_io_size > SPDK_BDEV_LARGE_BUF_MAX_SIZE) { 2411 fprintf(stderr, "Unable to exceed max I/O size of %d for verify. (%d provided).\n", 2412 SPDK_BDEV_LARGE_BUF_MAX_SIZE, g_io_size); 2413 return 1; 2414 } 2415 g_verify = true; 2416 if (!strcmp(g_workload_type, "reset")) { 2417 g_reset = true; 2418 } 2419 } 2420 2421 if (!strcmp(g_workload_type, "read") || 2422 !strcmp(g_workload_type, "randread") || 2423 !strcmp(g_workload_type, "write") || 2424 !strcmp(g_workload_type, "randwrite") || 2425 !strcmp(g_workload_type, "verify") || 2426 !strcmp(g_workload_type, "reset") || 2427 !strcmp(g_workload_type, "unmap") || 2428 !strcmp(g_workload_type, "write_zeroes") || 2429 !strcmp(g_workload_type, "flush")) { 2430 if (g_mix_specified) { 2431 fprintf(stderr, "Ignoring -M option... Please use -M option" 2432 " only when using rw or randrw.\n"); 2433 } 2434 } 2435 2436 if (!strcmp(g_workload_type, "rw") || 2437 !strcmp(g_workload_type, "randrw")) { 2438 if (g_rw_percentage < 0 || g_rw_percentage > 100) { 2439 fprintf(stderr, 2440 "-M must be specified to value from 0 to 100 " 2441 "for rw or randrw.\n"); 2442 return 1; 2443 } 2444 } 2445 2446 return 0; 2447 out: 2448 spdk_app_usage(); 2449 bdevperf_usage(); 2450 return 1; 2451 } 2452 2453 int 2454 main(int argc, char **argv) 2455 { 2456 struct spdk_app_opts opts = {}; 2457 int rc; 2458 2459 /* Use the runtime PID to set the random seed */ 2460 srand(getpid()); 2461 2462 spdk_app_opts_init(&opts, sizeof(opts)); 2463 opts.name = "bdevperf"; 2464 opts.rpc_addr = NULL; 2465 opts.shutdown_cb = spdk_bdevperf_shutdown_cb; 2466 2467 if ((rc = spdk_app_parse_args(argc, argv, &opts, "Zzfq:o:t:w:k:CF:M:P:S:T:Xlj:", NULL, 2468 bdevperf_parse_arg, bdevperf_usage)) != 2469 SPDK_APP_PARSE_ARGS_SUCCESS) { 2470 return rc; 2471 } 2472 2473 if (read_job_config()) { 2474 free_job_config(); 2475 return 1; 2476 } 2477 2478 if (verify_test_params(&opts) != 0) { 2479 free_job_config(); 2480 exit(1); 2481 } 2482 2483 rc = spdk_app_start(&opts, bdevperf_run, NULL); 2484 2485 spdk_app_fini(); 2486 free_job_config(); 2487 return rc; 2488 } 2489