xref: /spdk/examples/nvme/arbitration/arbitration.c (revision c39647df83e4be9bcc49025132c48bf2414ef8b1)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
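/*
 * Overview: this example exercises NVMe queue arbitration. It attaches to the
 * selected controllers, creates one worker per core in the core mask, runs
 * read/write I/O for -t seconds on each worker's queue pairs, and reports
 * per-core throughput (plus optional Intel vendor-specific latency histograms).
 * When weighted round robin (-a 1) is selected, each worker submits at a
 * different queue priority, so the effect of arbitration shows up in the
 * per-core results.
 *
 * A typical invocation (the binary name and path depend on how the examples
 * are built) might look like:
 *
 *   ./arb -a 1 -b 1 -q 64 -s 131072 -t 60 -c 0xf
 */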
34 #include "spdk/stdinc.h"
35 
36 #include "spdk/log.h"
37 #include "spdk/nvme.h"
38 #include "spdk/env.h"
39 #include "spdk/string.h"
40 #include "spdk/nvme_intel.h"
42 
43 struct ctrlr_entry {
44 	struct spdk_nvme_ctrlr			*ctrlr;
45 	struct spdk_nvme_intel_rw_latency_page	latency_page;
46 	TAILQ_ENTRY(ctrlr_entry)		link;
47 	char					name[1024];
48 };
49 
50 struct ns_entry {
51 	struct {
52 		struct spdk_nvme_ctrlr		*ctrlr;
53 		struct spdk_nvme_ns		*ns;
54 	} nvme;
55 
56 	TAILQ_ENTRY(ns_entry)			link;
57 	uint32_t				io_size_blocks;
58 	uint64_t				size_in_ios;
59 	char					name[1024];
60 };
61 
62 struct ns_worker_ctx {
63 	struct ns_entry				*entry;
64 	uint64_t				io_completed;
65 	uint64_t				current_queue_depth;
66 	uint64_t				offset_in_ios;
67 	bool					is_draining;
68 	struct spdk_nvme_qpair			*qpair;
69 	TAILQ_ENTRY(ns_worker_ctx)		link;
70 };
71 
72 struct arb_task {
73 	struct ns_worker_ctx			*ns_ctx;
74 	void					*buf;
75 };
76 
77 struct worker_thread {
78 	TAILQ_HEAD(, ns_worker_ctx)		ns_ctx;
79 	TAILQ_ENTRY(worker_thread)		link;
80 	unsigned				lcore;
81 	enum spdk_nvme_qprio			qprio;
82 };
83 
84 struct arb_context {
85 	int					shm_id;
86 	int					outstanding_commands;
87 	int					num_namespaces;
88 	int					num_workers;
89 	int					rw_percentage;
90 	int					is_random;
91 	int					queue_depth;
92 	int					time_in_sec;
93 	int					io_count;
94 	uint8_t					latency_tracking_enable;
95 	uint8_t					arbitration_mechanism;
96 	uint8_t					arbitration_config;
97 	uint32_t				io_size_bytes;
98 	uint32_t				max_completions;
99 	uint64_t				tsc_rate;
100 	const char				*core_mask;
101 	const char				*workload_type;
102 };
103 
104 struct feature {
105 	uint32_t				result;
106 	bool					valid;
107 };
108 
109 static struct spdk_mempool *task_pool		= NULL;
110 
111 static TAILQ_HEAD(, ctrlr_entry) g_controllers	= TAILQ_HEAD_INITIALIZER(g_controllers);
112 static TAILQ_HEAD(, ns_entry) g_namespaces	= TAILQ_HEAD_INITIALIZER(g_namespaces);
113 static TAILQ_HEAD(, worker_thread) g_workers	= TAILQ_HEAD_INITIALIZER(g_workers);
114 
115 static struct feature features[SPDK_NVME_FEAT_ARBITRATION + 1] = {};
116 static struct spdk_nvme_transport_id g_trid = {};
117 
118 static struct arb_context g_arbitration = {
119 	.shm_id					= -1,
120 	.outstanding_commands			= 0,
121 	.num_workers				= 0,
122 	.num_namespaces				= 0,
123 	.rw_percentage				= 50,
124 	.queue_depth				= 64,
125 	.time_in_sec				= 60,
126 	.io_count				= 100000,
127 	.latency_tracking_enable		= 0,
128 	.arbitration_mechanism			= SPDK_NVME_CC_AMS_RR,
129 	.arbitration_config			= 0,
130 	.io_size_bytes				= 131072,
131 	.max_completions			= 0,
132 	/* Default 4 cores for urgent/high/medium/low */
133 	.core_mask				= "0xf",
134 	.workload_type				= "randrw",
135 };
136 
137 static int g_dpdk_mem = 0;
138 static bool g_dpdk_mem_single_seg = false;
139 
140 /*
141  * For the weighted round robin arbitration mechanism, the smaller of the
142  * priority weight and the arbitration burst bounds the commands fetched per queue.
143  */
144 #define USER_SPECIFIED_HIGH_PRIORITY_WEIGHT	32
145 #define USER_SPECIFIED_MEDIUM_PRIORITY_WEIGHT	16
146 #define USER_SPECIFIED_LOW_PRIORITY_WEIGHT	8
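/*
 * When arbitration user configuration is enabled (-b 1) together with WRR,
 * set_arb_feature() programs these values into the LPW/MPW/HPW fields of the
 * Arbitration feature. Per the NVMe specification these fields are 0's-based,
 * so the effective weights are one greater than the values above (which is
 * why get_arb_feature() prints the returned values + 1).
 */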
147 
148 static void task_complete(struct arb_task *task);
149 
150 static void io_complete(void *ctx, const struct spdk_nvme_cpl *completion);
151 
152 static void get_arb_feature(struct spdk_nvme_ctrlr *ctrlr);
153 
154 static int set_arb_feature(struct spdk_nvme_ctrlr *ctrlr);
155 
156 static const char *print_qprio(enum spdk_nvme_qprio);
157 
158 
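/*
 * Validate that the namespace can service the configured I/O size (large
 * enough, and the I/O size is a multiple of the extended sector size), then
 * add it to the global namespace list used by the workers.
 */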
159 static void
160 register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns)
161 {
162 	struct ns_entry *entry;
163 	const struct spdk_nvme_ctrlr_data *cdata;
164 
165 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
166 
167 	if (spdk_nvme_ns_get_size(ns) < g_arbitration.io_size_bytes ||
168 	    spdk_nvme_ns_get_extended_sector_size(ns) > g_arbitration.io_size_bytes ||
169 	    g_arbitration.io_size_bytes % spdk_nvme_ns_get_extended_sector_size(ns)) {
170 		printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid "
171 		       "ns size %" PRIu64 " / block size %u for I/O size %u\n",
172 		       cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns),
173 		       spdk_nvme_ns_get_size(ns), spdk_nvme_ns_get_extended_sector_size(ns),
174 		       g_arbitration.io_size_bytes);
175 		return;
176 	}
177 
178 	entry = malloc(sizeof(struct ns_entry));
179 	if (entry == NULL) {
180 		perror("ns_entry malloc");
181 		exit(1);
182 	}
183 
184 	entry->nvme.ctrlr = ctrlr;
185 	entry->nvme.ns = ns;
186 
187 	entry->size_in_ios = spdk_nvme_ns_get_size(ns) / g_arbitration.io_size_bytes;
188 	entry->io_size_blocks = g_arbitration.io_size_bytes / spdk_nvme_ns_get_sector_size(ns);
189 
190 	snprintf(entry->name, sizeof(entry->name), "%-20.20s (%-20.20s)", cdata->mn, cdata->sn);
191 
192 	g_arbitration.num_namespaces++;
193 	TAILQ_INSERT_TAIL(&g_namespaces, entry, link);
194 }
195 
196 static void
197 enable_latency_tracking_complete(void *cb_arg, const struct spdk_nvme_cpl *cpl)
198 {
199 	if (spdk_nvme_cpl_is_error(cpl)) {
200 		printf("enable_latency_tracking_complete failed\n");
201 	}
202 	g_arbitration.outstanding_commands--;
203 }
204 
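/*
 * Enable or disable Intel vendor-specific latency tracking through a Set
 * Features command, then poll the admin queue until the command completes.
 */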
205 static void
206 set_latency_tracking_feature(struct spdk_nvme_ctrlr *ctrlr, bool enable)
207 {
208 	int res;
209 	union spdk_nvme_intel_feat_latency_tracking latency_tracking = {};
210 
211 	if (enable) {
212 		latency_tracking.bits.enable = 0x01;
213 	} else {
214 		latency_tracking.bits.enable = 0x00;
215 	}
216 
217 	res = spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING,
218 					      latency_tracking.raw, 0, NULL, 0, enable_latency_tracking_complete, NULL);
219 	if (res) {
220 		printf("failed to allocate nvme request\n");
221 		return;
222 	}
223 	g_arbitration.outstanding_commands++;
224 
225 	while (g_arbitration.outstanding_commands) {
226 		spdk_nvme_ctrlr_process_admin_completions(ctrlr);
227 	}
228 }
229 
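/*
 * Record the controller, optionally enable latency tracking, and register all
 * of its active namespaces. If weighted round robin was requested and the
 * controller supports it (CAP.AMS), read the current Arbitration feature and,
 * when -b 1 was given, program the user-specified weights and read it back.
 */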
230 static void
231 register_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
232 {
233 	uint32_t nsid;
234 	struct spdk_nvme_ns *ns;
235 	struct ctrlr_entry *entry = calloc(1, sizeof(struct ctrlr_entry));
236 	union spdk_nvme_cap_register cap = spdk_nvme_ctrlr_get_regs_cap(ctrlr);
237 	const struct spdk_nvme_ctrlr_data *cdata = spdk_nvme_ctrlr_get_data(ctrlr);
238 
239 	if (entry == NULL) {
240 		perror("ctrlr_entry malloc");
241 		exit(1);
242 	}
243 
244 	snprintf(entry->name, sizeof(entry->name), "%-20.20s (%-20.20s)", cdata->mn, cdata->sn);
245 
246 	entry->ctrlr = ctrlr;
247 	TAILQ_INSERT_TAIL(&g_controllers, entry, link);
248 
249 	if ((g_arbitration.latency_tracking_enable != 0) &&
250 	    spdk_nvme_ctrlr_is_feature_supported(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) {
251 		set_latency_tracking_feature(ctrlr, true);
252 	}
253 
254 	for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); nsid != 0;
255 	     nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) {
256 		ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
257 		if (ns == NULL) {
258 			continue;
259 		}
260 		register_ns(ctrlr, ns);
261 	}
262 
263 	if (g_arbitration.arbitration_mechanism == SPDK_NVME_CAP_AMS_WRR &&
264 	    (cap.bits.ams & SPDK_NVME_CAP_AMS_WRR)) {
265 		get_arb_feature(ctrlr);
266 
267 		if (g_arbitration.arbitration_config != 0) {
268 			set_arb_feature(ctrlr);
269 			get_arb_feature(ctrlr);
270 		}
271 	}
272 }
273 
274 static __thread unsigned int seed = 0;
275 
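/*
 * Allocate a task and its DMA buffer, pick a sequential or random LBA offset,
 * and submit a read or a write according to the -M mix. Each completion
 * (io_complete -> task_complete) submits a replacement I/O until the worker
 * starts draining.
 */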
276 static void
277 submit_single_io(struct ns_worker_ctx *ns_ctx)
278 {
279 	struct arb_task		*task = NULL;
280 	uint64_t		offset_in_ios;
281 	int			rc;
282 	struct ns_entry		*entry = ns_ctx->entry;
283 
284 	task = spdk_mempool_get(task_pool);
285 	if (!task) {
286 		fprintf(stderr, "Failed to get task from task_pool\n");
287 		exit(1);
288 	}
289 
290 	task->buf = spdk_dma_zmalloc(g_arbitration.io_size_bytes, 0x200, NULL);
291 	if (!task->buf) {
292 		spdk_mempool_put(task_pool, task);
293 		fprintf(stderr, "task->buf spdk_dma_zmalloc failed\n");
294 		exit(1);
295 	}
296 
297 	task->ns_ctx = ns_ctx;
298 
299 	if (g_arbitration.is_random) {
300 		offset_in_ios = rand_r(&seed) % entry->size_in_ios;
301 	} else {
302 		offset_in_ios = ns_ctx->offset_in_ios++;
303 		if (ns_ctx->offset_in_ios == entry->size_in_ios) {
304 			ns_ctx->offset_in_ios = 0;
305 		}
306 	}
307 
308 	if ((g_arbitration.rw_percentage == 100) ||
309 	    (g_arbitration.rw_percentage != 0 &&
310 	     ((rand_r(&seed) % 100) < g_arbitration.rw_percentage))) {
311 		rc = spdk_nvme_ns_cmd_read(entry->nvme.ns, ns_ctx->qpair, task->buf,
312 					   offset_in_ios * entry->io_size_blocks,
313 					   entry->io_size_blocks, io_complete, task, 0);
314 	} else {
315 		rc = spdk_nvme_ns_cmd_write(entry->nvme.ns, ns_ctx->qpair, task->buf,
316 					    offset_in_ios * entry->io_size_blocks,
317 					    entry->io_size_blocks, io_complete, task, 0);
318 	}
319 
320 	if (rc != 0) {
321 		fprintf(stderr, "starting I/O failed\n");
322 	} else {
323 		ns_ctx->current_queue_depth++;
324 	}
325 }
326 
327 static void
328 task_complete(struct arb_task *task)
329 {
330 	struct ns_worker_ctx	*ns_ctx;
331 
332 	ns_ctx = task->ns_ctx;
333 	ns_ctx->current_queue_depth--;
334 	ns_ctx->io_completed++;
335 
336 	spdk_dma_free(task->buf);
337 	spdk_mempool_put(task_pool, task);
338 
339 	/*
340 	 * is_draining indicates when time has expired for the test run
341 	 * and we are just waiting for the previously submitted I/O
342 	 * to complete.  In this case, do not submit a new I/O to replace
343 	 * the one just completed.
344 	 */
345 	if (!ns_ctx->is_draining) {
346 		submit_single_io(ns_ctx);
347 	}
348 }
349 
350 static void
351 io_complete(void *ctx, const struct spdk_nvme_cpl *completion)
352 {
353 	task_complete((struct arb_task *)ctx);
354 }
355 
356 static void
357 check_io(struct ns_worker_ctx *ns_ctx)
358 {
359 	spdk_nvme_qpair_process_completions(ns_ctx->qpair, g_arbitration.max_completions);
360 }
361 
362 static void
363 submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth)
364 {
365 	while (queue_depth-- > 0) {
366 		submit_single_io(ns_ctx);
367 	}
368 }
369 
370 static void
371 drain_io(struct ns_worker_ctx *ns_ctx)
372 {
373 	ns_ctx->is_draining = true;
374 	while (ns_ctx->current_queue_depth > 0) {
375 		check_io(ns_ctx);
376 	}
377 }
378 
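/*
 * Allocate this worker's I/O queue pair with the requested queue priority.
 * The priority only affects command fetching when the controller was
 * initialized with the weighted round robin arbitration mechanism.
 */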
379 static int
380 init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx, enum spdk_nvme_qprio qprio)
381 {
382 	struct spdk_nvme_ctrlr *ctrlr = ns_ctx->entry->nvme.ctrlr;
383 	struct spdk_nvme_io_qpair_opts opts;
384 
385 	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
386 	opts.qprio = qprio;
387 
388 	ns_ctx->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
389 	if (!ns_ctx->qpair) {
390 		printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n");
391 		return 1;
392 	}
393 
394 	return 0;
395 }
396 
397 static void
398 cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
399 {
400 	spdk_nvme_ctrlr_free_io_qpair(ns_ctx->qpair);
401 }
402 
403 static void
404 cleanup(uint32_t task_count)
405 {
406 	struct ns_entry *entry, *tmp_entry;
407 	struct worker_thread *worker, *tmp_worker;
408 	struct ns_worker_ctx *ns_ctx, *tmp_ns_ctx;
409 
410 	TAILQ_FOREACH_SAFE(entry, &g_namespaces, link, tmp_entry) {
411 		TAILQ_REMOVE(&g_namespaces, entry, link);
412 		free(entry);
413 	}
414 
415 	TAILQ_FOREACH_SAFE(worker, &g_workers, link, tmp_worker) {
416 		TAILQ_REMOVE(&g_workers, worker, link);
417 
418 		/* Free the per-namespace contexts owned by this worker */
419 		TAILQ_FOREACH_SAFE(ns_ctx, &worker->ns_ctx, link, tmp_ns_ctx) {
420 			TAILQ_REMOVE(&worker->ns_ctx, ns_ctx, link);
421 			free(ns_ctx);
422 		}
423 
424 		free(worker);
425 	}
426 
427 	if (task_pool != NULL && spdk_mempool_count(task_pool) != (size_t)task_count) {
428 		fprintf(stderr, "task_pool count is %zu but should be %u\n",
429 			spdk_mempool_count(task_pool), task_count);
430 	}
431 	spdk_mempool_free(task_pool);
432 }
433 
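/*
 * Per-core worker routine: create a queue pair for every assigned namespace,
 * prime each one with queue_depth I/Os, poll for completions until the time
 * limit expires, then drain outstanding I/O and free the queue pairs.
 */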
434 static int
435 work_fn(void *arg)
436 {
437 	uint64_t tsc_end;
438 	struct worker_thread *worker = (struct worker_thread *)arg;
439 	struct ns_worker_ctx *ns_ctx;
440 
441 	printf("Starting thread on core %u with %s\n", worker->lcore, print_qprio(worker->qprio));
442 
443 	/* Allocate a queue pair for each namespace. */
444 	TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) {
445 		if (init_ns_worker_ctx(ns_ctx, worker->qprio) != 0) {
446 			printf("ERROR: init_ns_worker_ctx() failed\n");
447 			return 1;
448 		}
449 	}
450 
451 	tsc_end = spdk_get_ticks() + g_arbitration.time_in_sec * g_arbitration.tsc_rate;
452 
453 	/* Submit initial I/O for each namespace. */
454 	TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) {
455 		submit_io(ns_ctx, g_arbitration.queue_depth);
456 	}
457 
458 	while (1) {
459 		/*
460 		 * Check for completed I/O on each namespace queue pair. A new
461 		 * I/O is submitted in the io_complete callback to replace each
462 		 * I/O that completes.
463 		 */
464 		TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) {
465 			check_io(ns_ctx);
466 		}
467 
468 		if (spdk_get_ticks() > tsc_end) {
469 			break;
470 		}
471 	}
472 
473 	TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) {
474 		drain_io(ns_ctx);
475 		cleanup_ns_worker_ctx(ns_ctx);
476 	}
477 
478 	return 0;
479 }
480 
481 static void
482 usage(char *program_name)
483 {
484 	printf("%s options\n", program_name);
486 	printf("\t[-d DPDK huge memory size in MB]\n");
487 	printf("\t[-q io depth]\n");
488 	printf("\t[-s io size in bytes]\n");
489 	printf("\t[-w io pattern type, must be one of\n");
490 	printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n");
491 	printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n");
492 #ifdef DEBUG
493 	printf("\t[-L enable debug logging]\n");
494 #else
495 	printf("\t[-L enable debug logging (flag disabled, must reconfigure with --enable-debug)]\n");
496 #endif
497 	spdk_log_usage(stdout, "\t\t-L");
498 	printf("\t[-l enable latency tracking, default: disabled]\n");
499 	printf("\t\t(0 - disabled; 1 - enabled)\n");
500 	printf("\t[-t time in seconds]\n");
501 	printf("\t[-c core mask for I/O submission/completion]\n");
502 	printf("\t\t(default: 0xf - 4 cores)\n");
503 	printf("\t[-m max completions per poll]\n");
504 	printf("\t\t(default: 0 - unlimited)\n");
505 	printf("\t[-a arbitration mechanism, must be one of the following]\n");
506 	printf("\t\t(0, 1, 7)\n");
507 	printf("\t\t(0: default round robin mechanism)\n");
508 	printf("\t\t(1: weighted round robin mechanism)\n");
509 	printf("\t\t(7: vendor specific mechanism)\n");
510 	printf("\t[-b enable arbitration user configuration, default: disabled]\n");
511 	printf("\t\t(0 - disabled; 1 - enabled)\n");
512 	printf("\t[-n number of I/Os used for the performance comparison]\n");
513 	printf("\t[-i shared memory group ID]\n");
514 	printf("\t[-r remote NVMe over Fabrics target address]\n");
515 	printf("\t[-g use single file descriptor for DPDK memory segments]\n");
516 }
517 
518 static const char *
519 print_qprio(enum spdk_nvme_qprio qprio)
520 {
521 	switch (qprio) {
522 	case SPDK_NVME_QPRIO_URGENT:
523 		return "urgent priority queue";
524 	case SPDK_NVME_QPRIO_HIGH:
525 		return "high priority queue";
526 	case SPDK_NVME_QPRIO_MEDIUM:
527 		return "medium priority queue";
528 	case SPDK_NVME_QPRIO_LOW:
529 		return "low priority queue";
530 	default:
531 		return "invalid priority queue";
532 	}
533 }
534 
535 
536 static void
537 print_configuration(char *program_name)
538 {
539 	printf("%s run with configuration:\n", program_name);
540 	printf("%s -q %d -s %u -w %s -M %d -l %d -t %d -c %s -m %u -a %d -b %d -n %d -i %d\n",
541 	       program_name,
542 	       g_arbitration.queue_depth,
543 	       g_arbitration.io_size_bytes,
544 	       g_arbitration.workload_type,
545 	       g_arbitration.rw_percentage,
546 	       g_arbitration.latency_tracking_enable,
547 	       g_arbitration.time_in_sec,
548 	       g_arbitration.core_mask,
549 	       g_arbitration.max_completions,
550 	       g_arbitration.arbitration_mechanism,
551 	       g_arbitration.arbitration_config,
552 	       g_arbitration.io_count,
553 	       g_arbitration.shm_id);
554 }
555 
556 
557 static void
558 print_performance(void)
559 {
560 	float io_per_second, sent_all_io_in_secs;
561 	struct worker_thread	*worker;
562 	struct ns_worker_ctx	*ns_ctx;
563 
564 	TAILQ_FOREACH(worker, &g_workers, link) {
565 		TAILQ_FOREACH(ns_ctx, &worker->ns_ctx, link) {
566 			io_per_second = (float)ns_ctx->io_completed / g_arbitration.time_in_sec;
567 			sent_all_io_in_secs = g_arbitration.io_count / io_per_second;
568 			printf("%-43.43s core %u: %8.2f IO/s %8.2f secs/%d ios\n",
569 			       ns_ctx->entry->name, worker->lcore,
570 			       io_per_second, sent_all_io_in_secs, g_arbitration.io_count);
571 		}
572 	}
573 	printf("========================================================\n");
574 
575 	printf("\n");
576 }
577 
578 static void
579 print_latency_page(struct ctrlr_entry *entry)
580 {
581 	int i;
582 
583 	printf("\n");
584 	printf("%s\n", entry->name);
585 	printf("--------------------------------------------------------\n");
586 
587 	for (i = 0; i < 32; i++) {
588 		if (entry->latency_page.buckets_32us[i])
589 			printf("Bucket %dus - %dus: %d\n", i * 32, (i + 1) * 32,
590 			       entry->latency_page.buckets_32us[i]);
591 	}
592 	for (i = 0; i < 31; i++) {
593 		if (entry->latency_page.buckets_1ms[i])
594 			printf("Bucket %dms - %dms: %d\n", i + 1, i + 2,
595 			       entry->latency_page.buckets_1ms[i]);
596 	}
597 	for (i = 0; i < 31; i++) {
598 		if (entry->latency_page.buckets_32ms[i])
599 			printf("Bucket %dms - %dms: %d\n", (i + 1) * 32, (i + 2) * 32,
600 			       entry->latency_page.buckets_32ms[i]);
601 	}
602 }
603 
604 static void
605 print_latency_statistics(const char *op_name, enum spdk_nvme_intel_log_page log_page)
606 {
607 	struct ctrlr_entry	*ctrlr;
608 
609 	printf("%s Latency Statistics:\n", op_name);
610 	printf("========================================================\n");
611 	TAILQ_FOREACH(ctrlr, &g_controllers, link) {
612 		if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) {
613 			if (spdk_nvme_ctrlr_cmd_get_log_page(
614 				    ctrlr->ctrlr, log_page,
615 				    SPDK_NVME_GLOBAL_NS_TAG,
616 				    &ctrlr->latency_page,
617 				    sizeof(struct spdk_nvme_intel_rw_latency_page),
618 				    0,
619 				    enable_latency_tracking_complete,
620 				    NULL)) {
621 				printf("nvme_ctrlr_cmd_get_log_page() failed\n");
622 				exit(1);
623 			}
624 
625 			g_arbitration.outstanding_commands++;
626 		} else {
627 			printf("Controller %s: %s latency statistics not supported\n",
628 			       ctrlr->name, op_name);
629 		}
630 	}
631 
632 	while (g_arbitration.outstanding_commands) {
633 		TAILQ_FOREACH(ctrlr, &g_controllers, link) {
634 			spdk_nvme_ctrlr_process_admin_completions(ctrlr->ctrlr);
635 		}
636 	}
637 
638 	TAILQ_FOREACH(ctrlr, &g_controllers, link) {
639 		if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) {
640 			print_latency_page(ctrlr);
641 		}
642 	}
643 	printf("\n");
644 }
645 
646 static void
647 print_stats(void)
648 {
649 	print_performance();
650 	if (g_arbitration.latency_tracking_enable) {
651 		if (g_arbitration.rw_percentage != 0) {
652 			print_latency_statistics("Read", SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY);
653 		}
654 		if (g_arbitration.rw_percentage != 100) {
655 			print_latency_statistics("Write", SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY);
656 		}
657 	}
658 }
659 
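/*
 * Parse the command-line options into g_arbitration and validate the workload
 * type, read/write mix, and arbitration settings.
 */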
660 static int
661 parse_args(int argc, char **argv)
662 {
663 	const char *workload_type	= NULL;
664 	int op				= 0;
665 	bool mix_specified		= false;
666 	int				rc;
667 	long int val;
668 
669 	spdk_nvme_trid_populate_transport(&g_trid, SPDK_NVME_TRANSPORT_PCIE);
670 	snprintf(g_trid.subnqn, sizeof(g_trid.subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN);
671 
672 	while ((op = getopt(argc, argv, "a:b:c:d:ghi:l:m:n:q:r:s:t:w:M:L:")) != -1) {
673 		switch (op) {
674 		case 'c':
675 			g_arbitration.core_mask = optarg;
676 			break;
677 		case 'd':
678 			g_dpdk_mem = spdk_strtol(optarg, 10);
679 			if (g_dpdk_mem < 0) {
680 				fprintf(stderr, "Invalid DPDK memory size\n");
681 				return g_dpdk_mem;
682 			}
683 			break;
684 		case 'w':
685 			g_arbitration.workload_type = optarg;
686 			break;
687 		case 'r':
688 			if (spdk_nvme_transport_id_parse(&g_trid, optarg) != 0) {
689 				fprintf(stderr, "Error parsing transport address\n");
690 				return 1;
691 			}
692 			break;
693 		case 'g':
694 			g_dpdk_mem_single_seg = true;
695 			break;
696 		case 'h':
697 		case '?':
698 			usage(argv[0]);
699 			return 1;
700 		case 'L':
701 			rc = spdk_log_set_flag(optarg);
702 			if (rc < 0) {
703 				fprintf(stderr, "unknown flag\n");
704 				usage(argv[0]);
705 				exit(EXIT_FAILURE);
706 			}
707 #ifdef DEBUG
708 			spdk_log_set_print_level(SPDK_LOG_DEBUG);
709 #endif
710 			break;
711 		default:
712 			val = spdk_strtol(optarg, 10);
713 			if (val < 0) {
714 				fprintf(stderr, "Converting a string to integer failed\n");
715 				return val;
716 			}
717 			switch (op) {
718 			case 'i':
719 				g_arbitration.shm_id = val;
720 				break;
721 			case 'l':
722 				g_arbitration.latency_tracking_enable = val;
723 				break;
724 			case 'm':
725 				g_arbitration.max_completions = val;
726 				break;
727 			case 'q':
728 				g_arbitration.queue_depth = val;
729 				break;
730 			case 's':
731 				g_arbitration.io_size_bytes = val;
732 				break;
733 			case 't':
734 				g_arbitration.time_in_sec = val;
735 				break;
736 			case 'M':
737 				g_arbitration.rw_percentage = val;
738 				mix_specified = true;
739 				break;
740 			case 'a':
741 				g_arbitration.arbitration_mechanism = val;
742 				break;
743 			case 'b':
744 				g_arbitration.arbitration_config = val;
745 				break;
746 			case 'n':
747 				g_arbitration.io_count = val;
748 				break;
749 			default:
750 				usage(argv[0]);
751 				return -EINVAL;
752 			}
753 		}
754 	}
755 
756 	workload_type = g_arbitration.workload_type;
757 
758 	if (strcmp(workload_type, "read") &&
759 	    strcmp(workload_type, "write") &&
760 	    strcmp(workload_type, "randread") &&
761 	    strcmp(workload_type, "randwrite") &&
762 	    strcmp(workload_type, "rw") &&
763 	    strcmp(workload_type, "randrw")) {
764 		fprintf(stderr,
765 			"io pattern type must be one of\n"
766 			"(read, write, randread, randwrite, rw, randrw)\n");
767 		return 1;
768 	}
769 
770 	if (!strcmp(workload_type, "read") ||
771 	    !strcmp(workload_type, "randread")) {
772 		g_arbitration.rw_percentage = 100;
773 	}
774 
775 	if (!strcmp(workload_type, "write") ||
776 	    !strcmp(workload_type, "randwrite")) {
777 		g_arbitration.rw_percentage = 0;
778 	}
779 
780 	if (!strcmp(workload_type, "read") ||
781 	    !strcmp(workload_type, "randread") ||
782 	    !strcmp(workload_type, "write") ||
783 	    !strcmp(workload_type, "randwrite")) {
784 		if (mix_specified) {
785 			fprintf(stderr, "Ignoring -M option... Please use -M option"
786 				" only when using rw or randrw.\n");
787 		}
788 	}
789 
790 	if (!strcmp(workload_type, "rw") ||
791 	    !strcmp(workload_type, "randrw")) {
792 		if (g_arbitration.rw_percentage < 0 || g_arbitration.rw_percentage > 100) {
793 			fprintf(stderr,
794 				"-M must be a value from 0 to 100 "
795 				"for rw or randrw.\n");
796 			return 1;
797 		}
798 	}
799 
800 	if (!strcmp(workload_type, "read") ||
801 	    !strcmp(workload_type, "write") ||
802 	    !strcmp(workload_type, "rw")) {
803 		g_arbitration.is_random = 0;
804 	} else {
805 		g_arbitration.is_random = 1;
806 	}
807 
808 	if (g_arbitration.latency_tracking_enable != 0 &&
809 	    g_arbitration.latency_tracking_enable != 1) {
810 		fprintf(stderr,
811 			"-l must be 0 or 1.\n");
812 		return 1;
813 	}
814 
815 	switch (g_arbitration.arbitration_mechanism) {
816 	case SPDK_NVME_CC_AMS_RR:
817 	case SPDK_NVME_CC_AMS_WRR:
818 	case SPDK_NVME_CC_AMS_VS:
819 		break;
820 	default:
821 		fprintf(stderr,
822 			"-a must be 0, 1, or 7.\n");
823 		return 1;
824 	}
825 
826 	if (g_arbitration.arbitration_config != 0 &&
827 	    g_arbitration.arbitration_config != 1) {
828 		fprintf(stderr,
829 			"-b must be 0 or 1.\n");
830 		return 1;
831 	} else if (g_arbitration.arbitration_config == 1 &&
832 		   g_arbitration.arbitration_mechanism != SPDK_NVME_CC_AMS_WRR) {
833 		fprintf(stderr,
834 			"-b 1 requires -a 1 (weighted round robin arbitration).\n");
835 		return 1;
836 	}
837 
838 	return 0;
839 }
840 
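/*
 * Create one worker per core in the core mask. With weighted round robin the
 * queue priority is advanced for each successive core (wrapping within the
 * two-bit qprio field), so different cores submit at different priorities;
 * with plain round robin every worker uses the default (urgent) priority.
 */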
841 static int
842 register_workers(void)
843 {
844 	uint32_t i;
845 	struct worker_thread *worker;
846 	enum spdk_nvme_qprio qprio = SPDK_NVME_QPRIO_URGENT;
847 
848 	SPDK_ENV_FOREACH_CORE(i) {
849 		worker = calloc(1, sizeof(*worker));
850 		if (worker == NULL) {
851 			fprintf(stderr, "Unable to allocate worker\n");
852 			return -1;
853 		}
854 
855 		TAILQ_INIT(&worker->ns_ctx);
856 		worker->lcore = i;
857 		TAILQ_INSERT_TAIL(&g_workers, worker, link);
858 		g_arbitration.num_workers++;
859 
860 		if (g_arbitration.arbitration_mechanism == SPDK_NVME_CAP_AMS_WRR) {
861 			qprio++;
862 		}
863 
864 		worker->qprio = qprio & SPDK_NVME_CREATE_IO_SQ_QPRIO_MASK;
865 	}
866 
867 	return 0;
868 }
869 
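/*
 * probe_cb requests the user-selected arbitration mechanism in the controller
 * options; attach_cb records the mechanism the controller actually granted
 * and then registers the controller.
 */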
870 static bool
871 probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
872 	 struct spdk_nvme_ctrlr_opts *opts)
873 {
874 	/* Update with user specified arbitration configuration */
875 	opts->arb_mechanism = g_arbitration.arbitration_mechanism;
876 
877 	printf("Attaching to %s\n", trid->traddr);
878 
879 	return true;
880 }
881 
882 static void
883 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
884 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
885 {
886 	printf("Attached to %s\n", trid->traddr);
887 
888 	/* Update with actual arbitration configuration in use */
889 	g_arbitration.arbitration_mechanism = opts->arb_mechanism;
890 
891 	register_ctrlr(ctrlr);
892 }
893 
894 static int
895 register_controllers(void)
896 {
897 	printf("Initializing NVMe Controllers\n");
898 
899 	if (spdk_nvme_probe(&g_trid, NULL, probe_cb, attach_cb, NULL) != 0) {
900 		fprintf(stderr, "spdk_nvme_probe() failed\n");
901 		return 1;
902 	}
903 
904 	if (g_arbitration.num_namespaces == 0) {
905 		fprintf(stderr, "No valid namespaces to continue I/O testing\n");
906 		return 1;
907 	}
908 
909 	return 0;
910 }
911 
912 static void
913 unregister_controllers(void)
914 {
915 	struct ctrlr_entry *entry, *tmp;
916 	struct spdk_nvme_detach_ctx *detach_ctx = NULL;
917 
918 	TAILQ_FOREACH_SAFE(entry, &g_controllers, link, tmp) {
919 		TAILQ_REMOVE(&g_controllers, entry, link);
920 		if (g_arbitration.latency_tracking_enable &&
921 		    spdk_nvme_ctrlr_is_feature_supported(entry->ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) {
922 			set_latency_tracking_feature(entry->ctrlr, false);
923 		}
924 		spdk_nvme_detach_async(entry->ctrlr, &detach_ctx);
925 		free(entry);
926 	}
927 
928 	while (detach_ctx && spdk_nvme_detach_poll_async(detach_ctx) == -EAGAIN) {
929 		;
930 	}
931 }
932 
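/*
 * Assign namespaces to workers in round-robin fashion so that every namespace
 * and every worker is used, wrapping around whichever list is shorter.
 */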
933 static int
934 associate_workers_with_ns(void)
935 {
936 	struct ns_entry		*entry = TAILQ_FIRST(&g_namespaces);
937 	struct worker_thread	*worker = TAILQ_FIRST(&g_workers);
938 	struct ns_worker_ctx	*ns_ctx;
939 	int			i, count;
940 
941 	count = g_arbitration.num_namespaces > g_arbitration.num_workers ?
942 		g_arbitration.num_namespaces : g_arbitration.num_workers;
943 
944 	for (i = 0; i < count; i++) {
945 		if (entry == NULL) {
946 			break;
947 		}
948 
949 		ns_ctx = calloc(1, sizeof(struct ns_worker_ctx));
950 		if (!ns_ctx) {
951 			return 1;
952 		}
954 
955 		printf("Associating %s with lcore %d\n", entry->name, worker->lcore);
956 		ns_ctx->entry = entry;
957 		TAILQ_INSERT_TAIL(&worker->ns_ctx, ns_ctx, link);
958 
959 		worker = TAILQ_NEXT(worker, link);
960 		if (worker == NULL) {
961 			worker = TAILQ_FIRST(&g_workers);
962 		}
963 
964 		entry = TAILQ_NEXT(entry, link);
965 		if (entry == NULL) {
966 			entry = TAILQ_FIRST(&g_namespaces);
967 		}
968 
969 	}
970 
971 	return 0;
972 }
973 
974 static void
975 get_feature_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl)
976 {
977 	struct feature *feature = cb_arg;
978 	int fid = feature - features;
979 
980 	if (spdk_nvme_cpl_is_error(cpl)) {
981 		printf("get_feature(0x%02X) failed\n", fid);
982 	} else {
983 		feature->result = cpl->cdw0;
984 		feature->valid = true;
985 	}
986 
987 	g_arbitration.outstanding_commands--;
988 }
989 
990 static int
991 get_feature(struct spdk_nvme_ctrlr *ctrlr, uint8_t fid)
992 {
993 	struct spdk_nvme_cmd cmd = {};
994 	struct feature *feature = &features[fid];
995 
996 	feature->valid = false;
997 
998 	cmd.opc = SPDK_NVME_OPC_GET_FEATURES;
999 	cmd.cdw10_bits.get_features.fid = fid;
1000 
1001 	return spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, &cmd, NULL, 0, get_feature_completion, feature);
1002 }
1003 
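/*
 * Issue Get Features (Arbitration), poll the admin queue until it completes,
 * and print the arbitration burst and the low/medium/high priority weights.
 * The weight fields are 0's-based, hence the "+ 1" when printing.
 */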
1004 static void
1005 get_arb_feature(struct spdk_nvme_ctrlr *ctrlr)
1006 {
1007 	get_feature(ctrlr, SPDK_NVME_FEAT_ARBITRATION);
1008 
1009 	g_arbitration.outstanding_commands++;
1010 
1011 	while (g_arbitration.outstanding_commands) {
1012 		spdk_nvme_ctrlr_process_admin_completions(ctrlr);
1013 	}
1014 
1015 	if (features[SPDK_NVME_FEAT_ARBITRATION].valid) {
1016 		union spdk_nvme_cmd_cdw11 arb;
1017 		arb.feat_arbitration.raw = features[SPDK_NVME_FEAT_ARBITRATION].result;
1018 
1019 		printf("Current Arbitration Configuration\n");
1020 		printf("=================================\n");
1021 		printf("Arbitration Burst:           ");
1022 		if (arb.feat_arbitration.bits.ab == SPDK_NVME_ARBITRATION_BURST_UNLIMITED) {
1023 			printf("no limit\n");
1024 		} else {
1025 			printf("%u\n", 1u << arb.feat_arbitration.bits.ab);
1026 		}
1027 
1028 		printf("Low Priority Weight:         %u\n", arb.feat_arbitration.bits.lpw + 1);
1029 		printf("Medium Priority Weight:      %u\n", arb.feat_arbitration.bits.mpw + 1);
1030 		printf("High Priority Weight:        %u\n", arb.feat_arbitration.bits.hpw + 1);
1031 		printf("\n");
1032 	}
1033 }
1034 
1035 static void
1036 set_feature_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl)
1037 {
1038 	struct feature *feature = cb_arg;
1039 	int fid = feature - features;
1040 
1041 	if (spdk_nvme_cpl_is_error(cpl)) {
1042 		printf("set_feature(0x%02X) failed\n", fid);
1043 		feature->valid = false;
1044 	} else {
1045 		printf("Set Arbitration Feature Successfully\n");
1046 	}
1047 
1048 	g_arbitration.outstanding_commands--;
1049 }
1050 
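/*
 * Program the Arbitration feature with an unlimited arbitration burst and the
 * user-specified priority weights (when the preceding Get Features reported
 * valid data) using a raw Set Features command, then poll the admin queue
 * until the command completes.
 */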
1051 static int
1052 set_arb_feature(struct spdk_nvme_ctrlr *ctrlr)
1053 {
1054 	int ret;
1055 	struct spdk_nvme_cmd cmd = {};
1056 
1057 	cmd.opc = SPDK_NVME_OPC_SET_FEATURES;
1058 	cmd.cdw10_bits.set_features.fid = SPDK_NVME_FEAT_ARBITRATION;
1059 
1060 	g_arbitration.outstanding_commands = 0;
1061 
1062 	if (features[SPDK_NVME_FEAT_ARBITRATION].valid) {
1063 		cmd.cdw11_bits.feat_arbitration.bits.ab = SPDK_NVME_ARBITRATION_BURST_UNLIMITED;
1064 		cmd.cdw11_bits.feat_arbitration.bits.lpw = USER_SPECIFIED_LOW_PRIORITY_WEIGHT;
1065 		cmd.cdw11_bits.feat_arbitration.bits.mpw = USER_SPECIFIED_MEDIUM_PRIORITY_WEIGHT;
1066 		cmd.cdw11_bits.feat_arbitration.bits.hpw = USER_SPECIFIED_HIGH_PRIORITY_WEIGHT;
1067 	}
1068 
1069 	ret = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, &cmd, NULL, 0,
1070 					    set_feature_completion, &features[SPDK_NVME_FEAT_ARBITRATION]);
1071 	if (ret) {
1072 		printf("Set Arbitration Feature: Failed 0x%x\n", ret);
1073 		return 1;
1074 	}
1075 
1076 	g_arbitration.outstanding_commands++;
1077 
1078 	while (g_arbitration.outstanding_commands) {
1079 		spdk_nvme_ctrlr_process_admin_completions(ctrlr);
1080 	}
1081 
1082 	if (!features[SPDK_NVME_FEAT_ARBITRATION].valid) {
1083 		printf("Set Arbitration Feature failed; using the default configuration\n");
1084 	}
1085 
1086 	return 0;
1087 }
1088 
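/*
 * Program flow: parse arguments, initialize the SPDK environment, create the
 * per-core workers, attach the controllers, associate namespaces with workers,
 * create the task pool, run work_fn() on every core (the main core runs it
 * inline), then print the results and tear everything down.
 */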
1089 int
1090 main(int argc, char **argv)
1091 {
1092 	int rc;
1093 	struct worker_thread *worker, *main_worker;
1094 	unsigned main_core;
1095 	char task_pool_name[30];
1096 	uint32_t task_count = 0;
1097 	struct spdk_env_opts opts;
1098 
1099 	rc = parse_args(argc, argv);
1100 	if (rc != 0) {
1101 		return rc;
1102 	}
1103 
1104 	spdk_env_opts_init(&opts);
1105 	opts.name = "arb";
1106 	opts.mem_size = g_dpdk_mem;
1107 	opts.hugepage_single_segments = g_dpdk_mem_single_seg;
1108 	opts.core_mask = g_arbitration.core_mask;
1109 	opts.shm_id = g_arbitration.shm_id;
1110 	if (spdk_env_init(&opts) < 0) {
1111 		return 1;
1112 	}
1113 
1114 	g_arbitration.tsc_rate = spdk_get_ticks_hz();
1115 
1116 	if (register_workers() != 0) {
1117 		rc = 1;
1118 		goto exit;
1119 	}
1120 
1121 	if (register_controllers() != 0) {
1122 		rc = 1;
1123 		goto exit;
1124 	}
1125 
1126 	if (associate_workers_with_ns() != 0) {
1127 		rc = 1;
1128 		goto exit;
1129 	}
1130 
1131 	snprintf(task_pool_name, sizeof(task_pool_name), "task_pool_%d", getpid());
1132 
1133 	/*
1134 	 * The task_count is calculated dynamically from the number of
1135 	 * attached active namespaces or workers (whichever is larger)
1136 	 * and the configured queue depth of the I/O operations.
1137 	 */
1138 	task_count = g_arbitration.num_namespaces > g_arbitration.num_workers ?
1139 		     g_arbitration.num_namespaces : g_arbitration.num_workers;
1140 	task_count *= g_arbitration.queue_depth;
1141 
1142 	task_pool = spdk_mempool_create(task_pool_name, task_count,
1143 					sizeof(struct arb_task), 0, SPDK_ENV_SOCKET_ID_ANY);
1144 	if (task_pool == NULL) {
1145 		fprintf(stderr, "could not initialize task pool\n");
1146 		rc = 1;
1147 		goto exit;
1148 	}
1149 
1150 	print_configuration(argv[0]);
1151 
1152 	printf("Initialization complete. Launching workers.\n");
1153 
1154 	/* Launch all of the secondary workers */
1155 	main_core = spdk_env_get_current_core();
1156 	main_worker = NULL;
1157 	TAILQ_FOREACH(worker, &g_workers, link) {
1158 		if (worker->lcore != main_core) {
1159 			spdk_env_thread_launch_pinned(worker->lcore, work_fn, worker);
1160 		} else {
1161 			assert(main_worker == NULL);
1162 			main_worker = worker;
1163 		}
1164 	}
1165 
1166 	assert(main_worker != NULL);
1167 	rc = work_fn(main_worker);
1168 
1169 	spdk_env_thread_wait_all();
1170 
1171 	print_stats();
1172 
1173 exit:
1174 	unregister_controllers();
1175 	cleanup(task_count);
1176 
1177 	spdk_env_fini();
1178 
1179 	if (rc != 0) {
1180 		fprintf(stderr, "%s: errors occurred\n", argv[0]);
1181 	}
1182 
1183 	return rc;
1184 }
1185