xref: /spdk/examples/nvme/arbitration/arbitration.c (revision fa2d95b3fe66e7f5c543eaef89fa00d4eaa0e6e7)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "spdk/nvme.h"
37 #include "spdk/env.h"
38 #include "spdk/string.h"
39 #include "spdk/nvme_intel.h"
40 
41 struct ctrlr_entry {
42 	struct spdk_nvme_ctrlr			*ctrlr;
43 	struct spdk_nvme_intel_rw_latency_page	latency_page;
44 	struct ctrlr_entry			*next;
45 	char					name[1024];
46 };
47 
/*
 * One namespace accepted for I/O. Entries are chained through `next` into
 * the g_namespaces list by register_ns().
 */
struct ns_entry {
	struct {
		/* Controller the namespace was found on. */
		struct spdk_nvme_ctrlr *ctrlr;
		/* Namespace handle used for read/write submission. */
		struct spdk_nvme_ns *ns;
	} nvme;

	/* Next namespace in the list, or NULL at the tail. */
	struct ns_entry *next;
	/* Length of one I/O in sectors (I/O size in bytes / sector size). */
	uint32_t io_size_blocks;
	/* Namespace size expressed in units of one I/O. */
	uint64_t size_in_ios;
	/* Printable label built from the controller data's mn and sn fields. */
	char name[1024];
};
59 
/*
 * Per-worker state for one namespace. Each worker owns a list of these,
 * linked through `next`.
 */
struct ns_worker_ctx {
	/* Namespace this context drives I/O against. */
	struct ns_entry *entry;
	/* Number of I/Os whose completion callback has run. */
	uint64_t io_completed;
	/* Number of I/Os submitted and not yet completed. */
	uint64_t current_queue_depth;
	/* Next sequential offset, in I/O-sized units (sequential workloads). */
	uint64_t offset_in_ios;
	/* Set by drain_io(); completions stop resubmitting once true. */
	bool is_draining;
	/* I/O queue pair allocated by init_ns_worker_ctx(). */
	struct spdk_nvme_qpair *qpair;
	/* Next context owned by the same worker, or NULL. */
	struct ns_worker_ctx *next;
};
69 
/* One in-flight I/O; allocated from task_pool in submit_single_io(). */
struct arb_task {
	/* Context that submitted the I/O; updated on completion. */
	struct ns_worker_ctx *ns_ctx;
	/* DMA data buffer for the I/O. */
	void *buf;
};
74 
75 struct worker_thread {
76 	struct ns_worker_ctx			*ns_ctx;
77 	struct worker_thread			*next;
78 	unsigned				lcore;
79 	enum spdk_nvme_qprio			qprio;
80 };
81 
/* Run-wide configuration and counters; the single instance is g_arbitration. */
struct arb_context {
	/* Shared memory group ID (-i). */
	int shm_id;
	/* Admin commands submitted and not yet completed. */
	int outstanding_commands;
	/* Number of entries in g_namespaces. */
	int num_namespaces;
	/* Number of entries in g_workers. */
	int num_workers;
	/* Percentage of I/Os issued as reads (-M); 100 = all reads, 0 = all writes. */
	int rw_percentage;
	/* Non-zero for the rand* workloads, zero for sequential ones. */
	int is_random;
	/* I/Os kept in flight per namespace context (-q). */
	int queue_depth;
	/* Test duration in seconds (-t). */
	int time_in_sec;
	/* I/O count used for the "secs/N ios" performance figure (-n). */
	int io_count;
	/* 0 or 1 (-l). */
	uint8_t latency_tracking_enable;
	/* Arbitration mechanism selector (-a). */
	uint8_t arbitration_mechanism;
	/* 0 or 1: apply the user-specified arbitration settings (-b). */
	uint8_t arbitration_config;
	/* Size of each I/O in bytes (-s). */
	uint32_t io_size_bytes;
	/* Maximum completions handled per poll; 0 means unlimited (-m). */
	uint32_t max_completions;
	/* Tick rate returned by spdk_get_ticks_hz(). */
	uint64_t tsc_rate;
	/* Core mask string handed to the SPDK environment (-c). */
	const char *core_mask;
	/* Workload name (-w). */
	const char *workload_type;
};
101 
/* Cached outcome of a Get Features command for one feature identifier. */
struct feature {
	/* Completion dword 0 of the last successful query. */
	uint32_t result;
	/* True once `result` holds data from a successful query. */
	bool valid;
};
106 
107 static struct spdk_mempool *task_pool		= NULL;
108 
109 static struct ctrlr_entry *g_controllers	= NULL;
110 static struct ns_entry *g_namespaces		= NULL;
111 static struct worker_thread *g_workers		= NULL;
112 
113 static struct feature features[256];
114 
115 static struct arb_context g_arbitration = {
116 	.shm_id					= -1,
117 	.outstanding_commands			= 0,
118 	.num_workers				= 0,
119 	.num_namespaces				= 0,
120 	.rw_percentage				= 50,
121 	.queue_depth				= 64,
122 	.time_in_sec				= 60,
123 	.io_count				= 100000,
124 	.latency_tracking_enable		= 0,
125 	.arbitration_mechanism			= SPDK_NVME_CC_AMS_RR,
126 	.arbitration_config			= 0,
127 	.io_size_bytes				= 131072,
128 	.max_completions			= 0,
129 	/* Default 4 cores for urgent/high/medium/low */
130 	.core_mask				= "0xf",
131 	.workload_type				= "randrw",
132 };
133 
/*
 * With the weighted round robin arbitration mechanism, the smaller of the
 * weight and the burst determines how many commands are executed from one
 * queue.
 */
#define USER_SPECIFIED_HIGH_PRIORITY_WEIGHT	32
#define USER_SPECIFIED_MEDIUM_PRIORITY_WEIGHT	16
#define USER_SPECIFIED_LOW_PRIORITY_WEIGHT	8
#define USER_SPECIFIED_ARBITRATION_BURST	7	/* No limit */

/*
 * Layout of the dword holding the priority weights and arbitration burst:
 * ------------------------------------------------------------------------------
 *     31 : 24      |       23 : 16      |    15 : 08      | 07 : 03  | 02 : 00
 * ------------------------------------------------------------------------------
 * High Prio Weight | Medium Prio Weight | Low Prio Weight | Reserved | Arb Burst
 * ------------------------------------------------------------------------------
 *
 * The priority weights are zero-based values.
 */
#define SPDK_NVME_HIGH_PRIO_WEIGHT_SHIFT	24
#define SPDK_NVME_MED_PRIO_WEIGHT_SHIFT		16
#define SPDK_NVME_LOW_PRIO_WEIGHT_SHIFT		8
#define SPDK_NVME_PRIO_WEIGHT_MASK		0xFF
#define SPDK_NVME_ARB_BURST_MASK		0x7

/* Modulus applied to the priority counter in register_workers(). */
#define SPDK_NVME_QPRIO_MAX			(SPDK_NVME_QPRIO_LOW + 1)
160 
/* Forward declarations for functions used before their definitions. */
static void task_complete(struct arb_task *task);
static void io_complete(void *ctx, const struct spdk_nvme_cpl *completion);
static void get_arb_feature(struct spdk_nvme_ctrlr *ctrlr);
static int set_arb_feature(struct spdk_nvme_ctrlr *ctrlr);
static const char *print_qprio(enum spdk_nvme_qprio qprio);
170 
171 
172 static void
173 register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns)
174 {
175 	struct ns_entry *entry;
176 	const struct spdk_nvme_ctrlr_data *cdata;
177 
178 	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
179 
180 	if (!spdk_nvme_ns_is_active(ns)) {
181 		printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n",
182 		       cdata->mn, cdata->sn,
183 		       spdk_nvme_ns_get_id(ns));
184 		return;
185 	}
186 
187 	if (spdk_nvme_ns_get_size(ns) < g_arbitration.io_size_bytes ||
188 	    spdk_nvme_ns_get_sector_size(ns) > g_arbitration.io_size_bytes) {
189 		printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid "
190 		       "ns size %" PRIu64 " / block size %u for I/O size %u\n",
191 		       cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns),
192 		       spdk_nvme_ns_get_size(ns), spdk_nvme_ns_get_sector_size(ns),
193 		       g_arbitration.io_size_bytes);
194 		return;
195 	}
196 
197 	entry = malloc(sizeof(struct ns_entry));
198 	if (entry == NULL) {
199 		perror("ns_entry malloc");
200 		exit(1);
201 	}
202 
203 	entry->nvme.ctrlr = ctrlr;
204 	entry->nvme.ns = ns;
205 
206 	entry->size_in_ios = spdk_nvme_ns_get_size(ns) / g_arbitration.io_size_bytes;
207 	entry->io_size_blocks = g_arbitration.io_size_bytes / spdk_nvme_ns_get_sector_size(ns);
208 
209 	snprintf(entry->name, 44, "%-20.20s (%-20.20s)", cdata->mn, cdata->sn);
210 
211 	g_arbitration.num_namespaces++;
212 	entry->next = g_namespaces;
213 	g_namespaces = entry;
214 }
215 
216 static void
217 enable_latency_tracking_complete(void *cb_arg, const struct spdk_nvme_cpl *cpl)
218 {
219 	if (spdk_nvme_cpl_is_error(cpl)) {
220 		printf("enable_latency_tracking_complete failed\n");
221 	}
222 	g_arbitration.outstanding_commands--;
223 }
224 
225 static void
226 set_latency_tracking_feature(struct spdk_nvme_ctrlr *ctrlr, bool enable)
227 {
228 	int res;
229 	union spdk_nvme_intel_feat_latency_tracking latency_tracking;
230 
231 	if (enable) {
232 		latency_tracking.bits.enable = 0x01;
233 	} else {
234 		latency_tracking.bits.enable = 0x00;
235 	}
236 
237 	res = spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING,
238 					      latency_tracking.raw, 0, NULL, 0, enable_latency_tracking_complete, NULL);
239 	if (res) {
240 		printf("fail to allocate nvme request.\n");
241 		return;
242 	}
243 	g_arbitration.outstanding_commands++;
244 
245 	while (g_arbitration.outstanding_commands) {
246 		spdk_nvme_ctrlr_process_admin_completions(ctrlr);
247 	}
248 }
249 
250 static void
251 register_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
252 {
253 	int nsid, num_ns;
254 	struct spdk_nvme_ns *ns;
255 	struct ctrlr_entry *entry = calloc(1, sizeof(struct ctrlr_entry));
256 	const struct spdk_nvme_ctrlr_data *cdata = spdk_nvme_ctrlr_get_data(ctrlr);
257 
258 	if (entry == NULL) {
259 		perror("ctrlr_entry malloc");
260 		exit(1);
261 	}
262 
263 	snprintf(entry->name, sizeof(entry->name), "%-20.20s (%-20.20s)", cdata->mn, cdata->sn);
264 
265 	entry->ctrlr = ctrlr;
266 	entry->next = g_controllers;
267 	g_controllers = entry;
268 
269 	if ((g_arbitration.latency_tracking_enable != 0) &&
270 	    spdk_nvme_ctrlr_is_feature_supported(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) {
271 		set_latency_tracking_feature(ctrlr, true);
272 	}
273 
274 	num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
275 	for (nsid = 1; nsid <= num_ns; nsid++) {
276 		ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
277 		if (ns == NULL) {
278 			continue;
279 		}
280 		register_ns(ctrlr, ns);
281 	}
282 
283 	if (g_arbitration.arbitration_mechanism == SPDK_NVME_CAP_AMS_WRR) {
284 		get_arb_feature(ctrlr);
285 
286 		if (g_arbitration.arbitration_config != 0) {
287 			set_arb_feature(ctrlr);
288 			get_arb_feature(ctrlr);
289 		}
290 	}
291 }
292 
293 static __thread unsigned int seed = 0;
294 
295 static void
296 submit_single_io(struct ns_worker_ctx *ns_ctx)
297 {
298 	struct arb_task		*task = NULL;
299 	uint64_t		offset_in_ios;
300 	int			rc;
301 	struct ns_entry		*entry = ns_ctx->entry;
302 
303 	task = spdk_mempool_get(task_pool);
304 	if (!task) {
305 		fprintf(stderr, "Failed to get task from task_pool\n");
306 		exit(1);
307 	}
308 
309 	task->buf = spdk_dma_zmalloc(g_arbitration.io_size_bytes, 0x200, NULL);
310 	if (!task->buf) {
311 		spdk_mempool_put(task_pool, task);
312 		fprintf(stderr, "task->buf spdk_dma_zmalloc failed\n");
313 		exit(1);
314 	}
315 
316 	task->ns_ctx = ns_ctx;
317 
318 	if (g_arbitration.is_random) {
319 		offset_in_ios = rand_r(&seed) % entry->size_in_ios;
320 	} else {
321 		offset_in_ios = ns_ctx->offset_in_ios++;
322 		if (ns_ctx->offset_in_ios == entry->size_in_ios) {
323 			ns_ctx->offset_in_ios = 0;
324 		}
325 	}
326 
327 	if ((g_arbitration.rw_percentage == 100) ||
328 	    (g_arbitration.rw_percentage != 0 &&
329 	     ((rand_r(&seed) % 100) < g_arbitration.rw_percentage))) {
330 		rc = spdk_nvme_ns_cmd_read(entry->nvme.ns, ns_ctx->qpair, task->buf,
331 					   offset_in_ios * entry->io_size_blocks,
332 					   entry->io_size_blocks, io_complete, task, 0);
333 	} else {
334 		rc = spdk_nvme_ns_cmd_write(entry->nvme.ns, ns_ctx->qpair, task->buf,
335 					    offset_in_ios * entry->io_size_blocks,
336 					    entry->io_size_blocks, io_complete, task, 0);
337 	}
338 
339 	if (rc != 0) {
340 		fprintf(stderr, "starting I/O failed\n");
341 	}
342 
343 	ns_ctx->current_queue_depth++;
344 }
345 
346 static void
347 task_complete(struct arb_task *task)
348 {
349 	struct ns_worker_ctx	*ns_ctx;
350 
351 	ns_ctx = task->ns_ctx;
352 	ns_ctx->current_queue_depth--;
353 	ns_ctx->io_completed++;
354 
355 	spdk_dma_free(task->buf);
356 	spdk_mempool_put(task_pool, task);
357 
358 	/*
359 	 * is_draining indicates when time has expired for the test run
360 	 * and we are just waiting for the previously submitted I/O
361 	 * to complete.  In this case, do not submit a new I/O to replace
362 	 * the one just completed.
363 	 */
364 	if (!ns_ctx->is_draining) {
365 		submit_single_io(ns_ctx);
366 	}
367 }
368 
/*
 * I/O completion callback. ctx is the struct arb_task passed at submission;
 * the completion entry itself is not inspected.
 */
static void
io_complete(void *ctx, const struct spdk_nvme_cpl *completion)
{
	struct arb_task *task = ctx;

	(void)completion;
	task_complete(task);
}
374 
375 static void
376 check_io(struct ns_worker_ctx *ns_ctx)
377 {
378 	spdk_nvme_qpair_process_completions(ns_ctx->qpair, g_arbitration.max_completions);
379 }
380 
/* Submit queue_depth I/Os on ns_ctx; nothing is submitted if it is <= 0. */
static void
submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth)
{
	int submitted;

	for (submitted = 0; submitted < queue_depth; submitted++) {
		submit_single_io(ns_ctx);
	}
}
388 
389 static void
390 drain_io(struct ns_worker_ctx *ns_ctx)
391 {
392 	ns_ctx->is_draining = true;
393 	while (ns_ctx->current_queue_depth > 0) {
394 		check_io(ns_ctx);
395 	}
396 }
397 
398 static int
399 init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx, enum spdk_nvme_qprio qprio)
400 {
401 	struct spdk_nvme_ctrlr *ctrlr = ns_ctx->entry->nvme.ctrlr;
402 	struct spdk_nvme_io_qpair_opts opts;
403 
404 	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
405 	opts.qprio = qprio;
406 
407 	ns_ctx->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
408 	if (!ns_ctx->qpair) {
409 		printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n");
410 		return 1;
411 	}
412 
413 	return 0;
414 }
415 
416 static void
417 cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
418 {
419 	spdk_nvme_ctrlr_free_io_qpair(ns_ctx->qpair);
420 }
421 
422 static void
423 cleanup(uint32_t task_count)
424 {
425 	struct ns_entry *entry			= g_namespaces;
426 	struct ns_entry *next_entry		= NULL;
427 	struct worker_thread *worker		= g_workers;
428 	struct worker_thread *next_worker	= NULL;
429 
430 	while (entry) {
431 		next_entry = entry->next;
432 		free(entry);
433 		entry = next_entry;
434 	};
435 
436 	while (worker) {
437 		next_worker = worker->next;
438 		free(worker->ns_ctx);
439 		free(worker);
440 		worker = next_worker;
441 	};
442 
443 	if (spdk_mempool_count(task_pool) != (size_t)task_count) {
444 		fprintf(stderr, "task_pool count is %zu but should be %u\n",
445 			spdk_mempool_count(task_pool), task_count);
446 	}
447 	spdk_mempool_free(task_pool);
448 }
449 
450 static int
451 work_fn(void *arg)
452 {
453 	uint64_t tsc_end;
454 	struct worker_thread *worker = (struct worker_thread *)arg;
455 	struct ns_worker_ctx *ns_ctx = NULL;
456 
457 	printf("Starting thread on core %u with %s\n", worker->lcore, print_qprio(worker->qprio));
458 
459 	/* Allocate a queue pair for each namespace. */
460 	ns_ctx = worker->ns_ctx;
461 	while (ns_ctx != NULL) {
462 		if (init_ns_worker_ctx(ns_ctx, worker->qprio) != 0) {
463 			printf("ERROR: init_ns_worker_ctx() failed\n");
464 			return 1;
465 		}
466 		ns_ctx = ns_ctx->next;
467 	}
468 
469 	tsc_end = spdk_get_ticks() + g_arbitration.time_in_sec * g_arbitration.tsc_rate;
470 
471 	/* Submit initial I/O for each namespace. */
472 	ns_ctx = worker->ns_ctx;
473 
474 	while (ns_ctx != NULL) {
475 		submit_io(ns_ctx, g_arbitration.queue_depth);
476 		ns_ctx = ns_ctx->next;
477 	}
478 
479 	while (1) {
480 		/*
481 		 * Check for completed I/O for each controller. A new
482 		 * I/O will be submitted in the io_complete callback
483 		 * to replace each I/O that is completed.
484 		 */
485 		ns_ctx = worker->ns_ctx;
486 		while (ns_ctx != NULL) {
487 			check_io(ns_ctx);
488 			ns_ctx = ns_ctx->next;
489 		}
490 
491 		if (spdk_get_ticks() > tsc_end) {
492 			break;
493 		}
494 	}
495 
496 	ns_ctx = worker->ns_ctx;
497 	while (ns_ctx != NULL) {
498 		drain_io(ns_ctx);
499 		cleanup_ns_worker_ctx(ns_ctx);
500 		ns_ctx = ns_ctx->next;
501 	}
502 
503 	return 0;
504 }
505 
/*
 * Print the command line help.
 *
 * The values listed for -a match what parse_args() accepts (0, 1 or 7).
 *
 * \param program_name name shown at the start of the help text
 */
static void
usage(char *program_name)
{
	printf("%s options", program_name);
	printf("\n");
	printf("\t[-q io depth]\n");
	printf("\t[-s io size in bytes]\n");
	printf("\t[-w io pattern type, must be one of\n");
	printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n");
	printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n");
	printf("\t[-l enable latency tracking, default: disabled]\n");
	printf("\t\t(0 - disabled; 1 - enabled)\n");
	printf("\t[-t time in seconds]\n");
	printf("\t[-c core mask for I/O submission/completion.]\n");
	printf("\t\t(default: 0xf - 4 cores)]\n");
	printf("\t[-m max completions per poll]\n");
	printf("\t\t(default: 0 - unlimited)\n");
	printf("\t[-a arbitration mechanism, must be one of below]\n");
	printf("\t\t(0, 1, 7)]\n");
	printf("\t\t(0: default round robin mechanism)]\n");
	printf("\t\t(1: weighted round robin mechanism)]\n");
	printf("\t\t(7: vendor specific mechanism)]\n");
	printf("\t[-b enable arbitration user configuration, default: disabled]\n");
	printf("\t\t(0 - disabled; 1 - enabled)\n");
	printf("\t[-n subjected IOs for performance comparison]\n");
	printf("\t[-i shared memory group ID]\n");
}
533 
534 static const char *
535 print_qprio(enum spdk_nvme_qprio qprio)
536 {
537 	switch (qprio) {
538 	case SPDK_NVME_QPRIO_URGENT:
539 		return "urgent priority queue";
540 	case SPDK_NVME_QPRIO_HIGH:
541 		return "high priority queue";
542 	case SPDK_NVME_QPRIO_MEDIUM:
543 		return "medium priority queue";
544 	case SPDK_NVME_QPRIO_LOW:
545 		return "low priority queue";
546 	default:
547 		return "invalid priority queue";
548 	}
549 }
550 
551 
552 static void
553 print_configuration(char *program_name)
554 {
555 	printf("%s run with configuration:\n", program_name);
556 	printf("%s -q %d -s %d -w %s -M %d -l %d -t %d -c %s -m %d -a %d -b %d -n %d -i %d\n",
557 	       program_name,
558 	       g_arbitration.queue_depth,
559 	       g_arbitration.io_size_bytes,
560 	       g_arbitration.workload_type,
561 	       g_arbitration.rw_percentage,
562 	       g_arbitration.latency_tracking_enable,
563 	       g_arbitration.time_in_sec,
564 	       g_arbitration.core_mask,
565 	       g_arbitration.max_completions,
566 	       g_arbitration.arbitration_mechanism,
567 	       g_arbitration.arbitration_config,
568 	       g_arbitration.io_count,
569 	       g_arbitration.shm_id);
570 }
571 
572 
573 static void
574 print_performance(void)
575 {
576 	float io_per_second, sent_all_io_in_secs;
577 	struct worker_thread	*worker;
578 	struct ns_worker_ctx	*ns_ctx;
579 
580 	worker = g_workers;
581 	while (worker) {
582 		ns_ctx = worker->ns_ctx;
583 		while (ns_ctx) {
584 			io_per_second = (float)ns_ctx->io_completed / g_arbitration.time_in_sec;
585 			sent_all_io_in_secs = g_arbitration.io_count / io_per_second;
586 			printf("%-43.43s core %u: %8.2f IO/s %8.2f secs/%d ios\n",
587 			       ns_ctx->entry->name, worker->lcore,
588 			       io_per_second, sent_all_io_in_secs, g_arbitration.io_count);
589 			ns_ctx = ns_ctx->next;
590 		}
591 		worker = worker->next;
592 	}
593 	printf("========================================================\n");
594 
595 	printf("\n");
596 }
597 
598 static void
599 print_latency_page(struct ctrlr_entry *entry)
600 {
601 	int i;
602 
603 	printf("\n");
604 	printf("%s\n", entry->name);
605 	printf("--------------------------------------------------------\n");
606 
607 	for (i = 0; i < 32; i++) {
608 		if (entry->latency_page.buckets_32us[i])
609 			printf("Bucket %dus - %dus: %d\n", i * 32, (i + 1) * 32,
610 			       entry->latency_page.buckets_32us[i]);
611 	}
612 	for (i = 0; i < 31; i++) {
613 		if (entry->latency_page.buckets_1ms[i])
614 			printf("Bucket %dms - %dms: %d\n", i + 1, i + 2,
615 			       entry->latency_page.buckets_1ms[i]);
616 	}
617 	for (i = 0; i < 31; i++) {
618 		if (entry->latency_page.buckets_32ms[i])
619 			printf("Bucket %dms - %dms: %d\n", (i + 1) * 32, (i + 2) * 32,
620 			       entry->latency_page.buckets_32ms[i]);
621 	}
622 }
623 
624 static void
625 print_latency_statistics(const char *op_name, enum spdk_nvme_intel_log_page log_page)
626 {
627 	struct ctrlr_entry	*ctrlr;
628 
629 	printf("%s Latency Statistics:\n", op_name);
630 	printf("========================================================\n");
631 	ctrlr = g_controllers;
632 	while (ctrlr) {
633 		if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) {
634 			if (spdk_nvme_ctrlr_cmd_get_log_page(
635 				    ctrlr->ctrlr, log_page,
636 				    SPDK_NVME_GLOBAL_NS_TAG,
637 				    &ctrlr->latency_page,
638 				    sizeof(struct spdk_nvme_intel_rw_latency_page),
639 				    0,
640 				    enable_latency_tracking_complete,
641 				    NULL)) {
642 				printf("nvme_ctrlr_cmd_get_log_page() failed\n");
643 				exit(1);
644 			}
645 
646 			g_arbitration.outstanding_commands++;
647 		} else {
648 			printf("Controller %s: %s latency statistics not supported\n",
649 			       ctrlr->name, op_name);
650 		}
651 		ctrlr = ctrlr->next;
652 	}
653 
654 	while (g_arbitration.outstanding_commands) {
655 		ctrlr = g_controllers;
656 		while (ctrlr) {
657 			spdk_nvme_ctrlr_process_admin_completions(ctrlr->ctrlr);
658 			ctrlr = ctrlr->next;
659 		}
660 	}
661 
662 	ctrlr = g_controllers;
663 	while (ctrlr) {
664 		if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) {
665 			print_latency_page(ctrlr);
666 		}
667 		ctrlr = ctrlr->next;
668 	}
669 	printf("\n");
670 }
671 
672 static void
673 print_stats(void)
674 {
675 	print_performance();
676 	if (g_arbitration.latency_tracking_enable) {
677 		if (g_arbitration.rw_percentage != 0) {
678 			print_latency_statistics("Read", SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY);
679 		}
680 		if (g_arbitration.rw_percentage != 100) {
681 			print_latency_statistics("Write", SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY);
682 		}
683 	}
684 }
685 
686 static int
687 parse_args(int argc, char **argv)
688 {
689 	const char *workload_type	= NULL;
690 	int op				= 0;
691 	bool mix_specified		= false;
692 	long int val;
693 
694 	while ((op = getopt(argc, argv, "c:l:i:m:q:s:t:w:M:a:b:n:h")) != -1) {
695 		switch (op) {
696 		case 'c':
697 			g_arbitration.core_mask = optarg;
698 			break;
699 		case 'w':
700 			g_arbitration.workload_type = optarg;
701 			break;
702 		case 'h':
703 		case '?':
704 			usage(argv[0]);
705 			return 1;
706 		default:
707 			val = spdk_strtol(optarg, 10);
708 			if (val < 0) {
709 				fprintf(stderr, "Converting a string to integer failed\n");
710 				return val;
711 			}
712 			switch (op) {
713 			case 'i':
714 				g_arbitration.shm_id = val;
715 				break;
716 			case 'l':
717 				g_arbitration.latency_tracking_enable = val;
718 				break;
719 			case 'm':
720 				g_arbitration.max_completions = val;
721 				break;
722 			case 'q':
723 				g_arbitration.queue_depth = val;
724 				break;
725 			case 's':
726 				g_arbitration.io_size_bytes = val;
727 				break;
728 			case 't':
729 				g_arbitration.time_in_sec = val;
730 				break;
731 			case 'M':
732 				g_arbitration.rw_percentage = val;
733 				mix_specified = true;
734 				break;
735 			case 'a':
736 				g_arbitration.arbitration_mechanism = val;
737 				break;
738 			case 'b':
739 				g_arbitration.arbitration_config = val;
740 				break;
741 			case 'n':
742 				g_arbitration.io_count = val;
743 				break;
744 			default:
745 				usage(argv[0]);
746 				return -EINVAL;
747 			}
748 		}
749 	}
750 
751 	workload_type = g_arbitration.workload_type;
752 
753 	if (strcmp(workload_type, "read") &&
754 	    strcmp(workload_type, "write") &&
755 	    strcmp(workload_type, "randread") &&
756 	    strcmp(workload_type, "randwrite") &&
757 	    strcmp(workload_type, "rw") &&
758 	    strcmp(workload_type, "randrw")) {
759 		fprintf(stderr,
760 			"io pattern type must be one of\n"
761 			"(read, write, randread, randwrite, rw, randrw)\n");
762 		return 1;
763 	}
764 
765 	if (!strcmp(workload_type, "read") ||
766 	    !strcmp(workload_type, "randread")) {
767 		g_arbitration.rw_percentage = 100;
768 	}
769 
770 	if (!strcmp(workload_type, "write") ||
771 	    !strcmp(workload_type, "randwrite")) {
772 		g_arbitration.rw_percentage = 0;
773 	}
774 
775 	if (!strcmp(workload_type, "read") ||
776 	    !strcmp(workload_type, "randread") ||
777 	    !strcmp(workload_type, "write") ||
778 	    !strcmp(workload_type, "randwrite")) {
779 		if (mix_specified) {
780 			fprintf(stderr, "Ignoring -M option... Please use -M option"
781 				" only when using rw or randrw.\n");
782 		}
783 	}
784 
785 	if (!strcmp(workload_type, "rw") ||
786 	    !strcmp(workload_type, "randrw")) {
787 		if (g_arbitration.rw_percentage < 0 || g_arbitration.rw_percentage > 100) {
788 			fprintf(stderr,
789 				"-M must be specified to value from 0 to 100 "
790 				"for rw or randrw.\n");
791 			return 1;
792 		}
793 	}
794 
795 	if (!strcmp(workload_type, "read") ||
796 	    !strcmp(workload_type, "write") ||
797 	    !strcmp(workload_type, "rw")) {
798 		g_arbitration.is_random = 0;
799 	} else {
800 		g_arbitration.is_random = 1;
801 	}
802 
803 	if (g_arbitration.latency_tracking_enable != 0 &&
804 	    g_arbitration.latency_tracking_enable != 1) {
805 		fprintf(stderr,
806 			"-l must be specified to value 0 or 1.\n");
807 		return 1;
808 	}
809 
810 	switch (g_arbitration.arbitration_mechanism) {
811 	case SPDK_NVME_CC_AMS_RR:
812 	case SPDK_NVME_CC_AMS_WRR:
813 	case SPDK_NVME_CC_AMS_VS:
814 		break;
815 	default:
816 		fprintf(stderr,
817 			"-a must be specified to value 0, 1, or 7.\n");
818 		return 1;
819 	}
820 
821 	if (g_arbitration.arbitration_config != 0 &&
822 	    g_arbitration.arbitration_config != 1) {
823 		fprintf(stderr,
824 			"-b must be specified to value 0 or 1.\n");
825 		return 1;
826 	} else if (g_arbitration.arbitration_config == 1 &&
827 		   g_arbitration.arbitration_mechanism != SPDK_NVME_CC_AMS_WRR) {
828 		fprintf(stderr,
829 			"-a must be specified to 1 (WRR) together.\n");
830 		return 1;
831 	}
832 
833 	return 0;
834 }
835 
836 static int
837 register_workers(void)
838 {
839 	uint32_t i;
840 	struct worker_thread *worker;
841 	enum spdk_nvme_qprio qprio = SPDK_NVME_QPRIO_URGENT;
842 
843 	g_workers = NULL;
844 	g_arbitration.num_workers = 0;
845 
846 	SPDK_ENV_FOREACH_CORE(i) {
847 		worker = calloc(1, sizeof(*worker));
848 		if (worker == NULL) {
849 			fprintf(stderr, "Unable to allocate worker\n");
850 			return -1;
851 		}
852 
853 		worker->lcore = i;
854 		worker->next = g_workers;
855 		g_workers = worker;
856 		g_arbitration.num_workers++;
857 
858 		if (g_arbitration.arbitration_mechanism == SPDK_NVME_CAP_AMS_WRR) {
859 			qprio++;
860 		}
861 
862 		worker->qprio = qprio % SPDK_NVME_QPRIO_MAX;
863 	}
864 
865 	return 0;
866 }
867 
868 static bool
869 probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
870 	 struct spdk_nvme_ctrlr_opts *opts)
871 {
872 	/* Update with user specified arbitration configuration */
873 	opts->arb_mechanism = g_arbitration.arbitration_mechanism;
874 
875 	printf("Attaching to %s\n", trid->traddr);
876 
877 	return true;
878 }
879 
880 static void
881 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
882 	  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
883 {
884 	printf("Attached to %s\n", trid->traddr);
885 
886 	/* Update with actual arbitration configuration in use */
887 	g_arbitration.arbitration_mechanism = opts->arb_mechanism;
888 
889 	register_ctrlr(ctrlr);
890 }
891 
892 static int
893 register_controllers(void)
894 {
895 	printf("Initializing NVMe Controllers\n");
896 
897 	if (spdk_nvme_probe(NULL, NULL, probe_cb, attach_cb, NULL) != 0) {
898 		fprintf(stderr, "spdk_nvme_probe() failed\n");
899 		return 1;
900 	}
901 
902 	if (g_arbitration.num_namespaces == 0) {
903 		fprintf(stderr, "No valid namespaces to continue IO testing\n");
904 		return 1;
905 	}
906 
907 	return 0;
908 }
909 
910 static void
911 unregister_controllers(void)
912 {
913 	struct ctrlr_entry *entry = g_controllers;
914 
915 	while (entry) {
916 		struct ctrlr_entry *next = entry->next;
917 		if (g_arbitration.latency_tracking_enable &&
918 		    spdk_nvme_ctrlr_is_feature_supported(entry->ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) {
919 			set_latency_tracking_feature(entry->ctrlr, false);
920 		}
921 		spdk_nvme_detach(entry->ctrlr);
922 		free(entry);
923 		entry = next;
924 	}
925 }
926 
927 static int
928 associate_workers_with_ns(void)
929 {
930 	struct ns_entry		*entry = g_namespaces;
931 	struct worker_thread	*worker = g_workers;
932 	struct ns_worker_ctx	*ns_ctx;
933 	int			i, count;
934 
935 	count = g_arbitration.num_namespaces > g_arbitration.num_workers ?
936 		g_arbitration.num_namespaces : g_arbitration.num_workers;
937 
938 	for (i = 0; i < count; i++) {
939 		if (entry == NULL) {
940 			break;
941 		}
942 
943 		ns_ctx = malloc(sizeof(struct ns_worker_ctx));
944 		if (!ns_ctx) {
945 			return 1;
946 		}
947 		memset(ns_ctx, 0, sizeof(*ns_ctx));
948 
949 		printf("Associating %s with lcore %d\n", entry->name, worker->lcore);
950 		ns_ctx->entry = entry;
951 		ns_ctx->next = worker->ns_ctx;
952 		worker->ns_ctx = ns_ctx;
953 
954 		worker = worker->next;
955 		if (worker == NULL) {
956 			worker = g_workers;
957 		}
958 
959 		entry = entry->next;
960 		if (entry == NULL) {
961 			entry = g_namespaces;
962 		}
963 
964 	}
965 
966 	return 0;
967 }
968 
969 static void
970 get_feature_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl)
971 {
972 	struct feature *feature = cb_arg;
973 	int fid = feature - features;
974 
975 	if (spdk_nvme_cpl_is_error(cpl)) {
976 		printf("get_feature(0x%02X) failed\n", fid);
977 	} else {
978 		feature->result = cpl->cdw0;
979 		feature->valid = true;
980 	}
981 
982 	g_arbitration.outstanding_commands--;
983 }
984 
985 static int
986 get_feature(struct spdk_nvme_ctrlr *ctrlr, uint8_t fid)
987 {
988 	struct spdk_nvme_cmd cmd = {};
989 
990 	cmd.opc = SPDK_NVME_OPC_GET_FEATURES;
991 	cmd.cdw10 = fid;
992 
993 	return spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, &cmd, NULL, 0, get_feature_completion, &features[fid]);
994 }
995 
996 static void
997 get_arb_feature(struct spdk_nvme_ctrlr *ctrlr)
998 {
999 	get_feature(ctrlr, SPDK_NVME_FEAT_ARBITRATION);
1000 
1001 	g_arbitration.outstanding_commands++;
1002 
1003 	while (g_arbitration.outstanding_commands) {
1004 		spdk_nvme_ctrlr_process_admin_completions(ctrlr);
1005 	}
1006 
1007 	if (features[SPDK_NVME_FEAT_ARBITRATION].valid) {
1008 		uint32_t arb = features[SPDK_NVME_FEAT_ARBITRATION].result;
1009 		unsigned ab, lpw, mpw, hpw;
1010 
1011 		ab = arb & SPDK_NVME_ARB_BURST_MASK;
1012 		lpw = ((arb >> SPDK_NVME_LOW_PRIO_WEIGHT_SHIFT) & SPDK_NVME_PRIO_WEIGHT_MASK) + 1;
1013 		mpw = ((arb >> SPDK_NVME_MED_PRIO_WEIGHT_SHIFT) & SPDK_NVME_PRIO_WEIGHT_MASK) + 1;
1014 		hpw = ((arb >> SPDK_NVME_HIGH_PRIO_WEIGHT_SHIFT) & SPDK_NVME_PRIO_WEIGHT_MASK) + 1;
1015 
1016 		printf("Current Arbitration Configuration\n");
1017 		printf("===========\n");
1018 		printf("Arbitration Burst:           ");
1019 		if (ab == SPDK_NVME_ARB_BURST_MASK) {
1020 			printf("no limit\n");
1021 		} else {
1022 			printf("%u\n", 1u << ab);
1023 		}
1024 
1025 		printf("Low Priority Weight:         %u\n", lpw);
1026 		printf("Medium Priority Weight:      %u\n", mpw);
1027 		printf("High Priority Weight:        %u\n", hpw);
1028 		printf("\n");
1029 	}
1030 }
1031 
1032 static void
1033 set_feature_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl)
1034 {
1035 	struct feature *feature = cb_arg;
1036 	int fid = feature - features;
1037 
1038 	if (spdk_nvme_cpl_is_error(cpl)) {
1039 		printf("set_feature(0x%02X) failed\n", fid);
1040 		feature->valid = false;
1041 	} else {
1042 		printf("Set Arbitration Feature Successfully\n");
1043 	}
1044 
1045 	g_arbitration.outstanding_commands--;
1046 }
1047 
1048 static int
1049 set_arb_feature(struct spdk_nvme_ctrlr *ctrlr)
1050 {
1051 	int ret;
1052 	struct spdk_nvme_cmd cmd = {};
1053 	uint32_t arb = 0;
1054 	unsigned ab, lpw, mpw, hpw;
1055 
1056 	cmd.opc = SPDK_NVME_OPC_SET_FEATURES;
1057 	cmd.cdw10 = SPDK_NVME_FEAT_ARBITRATION;
1058 
1059 	g_arbitration.outstanding_commands = 0;
1060 
1061 	if (features[SPDK_NVME_FEAT_ARBITRATION].valid) {
1062 		ab = USER_SPECIFIED_ARBITRATION_BURST & SPDK_NVME_ARB_BURST_MASK;
1063 		hpw = USER_SPECIFIED_HIGH_PRIORITY_WEIGHT << SPDK_NVME_HIGH_PRIO_WEIGHT_SHIFT;
1064 		mpw = USER_SPECIFIED_MEDIUM_PRIORITY_WEIGHT << SPDK_NVME_MED_PRIO_WEIGHT_SHIFT;
1065 		lpw = USER_SPECIFIED_LOW_PRIORITY_WEIGHT << SPDK_NVME_LOW_PRIO_WEIGHT_SHIFT;
1066 		arb = hpw | mpw | lpw | ab;
1067 		cmd.cdw11 = arb;
1068 	}
1069 
1070 	ret = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, &cmd, NULL, 0,
1071 					    set_feature_completion, &features[SPDK_NVME_FEAT_ARBITRATION]);
1072 	if (ret) {
1073 		printf("Set Arbitration Feature: Failed 0x%x\n", ret);
1074 		return 1;
1075 	}
1076 
1077 	g_arbitration.outstanding_commands++;
1078 
1079 	while (g_arbitration.outstanding_commands) {
1080 		spdk_nvme_ctrlr_process_admin_completions(ctrlr);
1081 	}
1082 
1083 	if (!features[SPDK_NVME_FEAT_ARBITRATION].valid) {
1084 		printf("Set Arbitration Feature failed and use default configuration\n");
1085 	}
1086 
1087 	return 0;
1088 }
1089 
1090 int
1091 main(int argc, char **argv)
1092 {
1093 	int rc;
1094 	struct worker_thread *worker, *master_worker;
1095 	unsigned master_core;
1096 	char task_pool_name[30];
1097 	uint32_t task_count;
1098 	struct spdk_env_opts opts;
1099 
1100 	rc = parse_args(argc, argv);
1101 	if (rc != 0) {
1102 		return rc;
1103 	}
1104 
1105 	spdk_env_opts_init(&opts);
1106 	opts.name = "arb";
1107 	opts.core_mask = g_arbitration.core_mask;
1108 	opts.shm_id = g_arbitration.shm_id;
1109 	if (spdk_env_init(&opts) < 0) {
1110 		return 1;
1111 	}
1112 
1113 	g_arbitration.tsc_rate = spdk_get_ticks_hz();
1114 
1115 	if (register_workers() != 0) {
1116 		return 1;
1117 	}
1118 
1119 	if (register_controllers() != 0) {
1120 		return 1;
1121 	}
1122 
1123 	if (associate_workers_with_ns() != 0) {
1124 		return 1;
1125 	}
1126 
1127 	snprintf(task_pool_name, sizeof(task_pool_name), "task_pool_%d", getpid());
1128 
1129 	/*
1130 	 * The task_count will be dynamically calculated based on the
1131 	 * number of attached active namespaces, queue depth and number
1132 	 * of cores (workers) involved in the IO perations.
1133 	 */
1134 	task_count = g_arbitration.num_namespaces > g_arbitration.num_workers ?
1135 		     g_arbitration.num_namespaces : g_arbitration.num_workers;
1136 	task_count *= g_arbitration.queue_depth;
1137 
1138 	task_pool = spdk_mempool_create(task_pool_name, task_count,
1139 					sizeof(struct arb_task), 0, SPDK_ENV_SOCKET_ID_ANY);
1140 	if (task_pool == NULL) {
1141 		fprintf(stderr, "could not initialize task pool\n");
1142 		return 1;
1143 	}
1144 
1145 	print_configuration(argv[0]);
1146 
1147 	printf("Initialization complete. Launching workers.\n");
1148 
1149 	/* Launch all of the slave workers */
1150 	master_core = spdk_env_get_current_core();
1151 	master_worker = NULL;
1152 	worker = g_workers;
1153 	while (worker != NULL) {
1154 		if (worker->lcore != master_core) {
1155 			spdk_env_thread_launch_pinned(worker->lcore, work_fn, worker);
1156 		} else {
1157 			assert(master_worker == NULL);
1158 			master_worker = worker;
1159 		}
1160 		worker = worker->next;
1161 	}
1162 
1163 	assert(master_worker != NULL);
1164 	rc = work_fn(master_worker);
1165 
1166 	spdk_env_thread_wait_all();
1167 
1168 	print_stats();
1169 
1170 	unregister_controllers();
1171 
1172 	cleanup(task_count);
1173 
1174 	if (rc != 0) {
1175 		fprintf(stderr, "%s: errors occured\n", argv[0]);
1176 	}
1177 
1178 	return rc;
1179 }
1180