xref: /spdk/lib/ftl/ftl_core.c (revision c4d9daeb7bf491bc0eb6e8d417b75d44773cb009)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/likely.h"
35 #include "spdk/stdinc.h"
36 #include "spdk/nvme.h"
37 #include "spdk/io_channel.h"
38 #include "spdk/bdev_module.h"
39 #include "spdk_internal/log.h"
40 #include "spdk/ftl.h"
41 
42 #include "ftl_core.h"
43 #include "ftl_band.h"
44 #include "ftl_io.h"
45 #include "ftl_anm.h"
46 #include "ftl_rwb.h"
47 #include "ftl_debug.h"
48 #include "ftl_reloc.h"
49 
50 /* Max number of iovecs */
51 #define FTL_MAX_IOV 1024
52 
53 struct ftl_wptr {
54 	/* Owner device */
55 	struct spdk_ftl_dev		*dev;
56 
57 	/* Current PPA */
58 	struct ftl_ppa			ppa;
59 
60 	/* Band currently being written to */
61 	struct ftl_band			*band;
62 
63 	/* Current logical block's offset */
64 	uint64_t			offset;
65 
66 	/* Current erase block */
67 	struct ftl_chunk		*chunk;
68 
69 	/* IO that is currently being processed */
70 	struct ftl_io			*current_io;
71 
72 	/* List link */
73 	LIST_ENTRY(ftl_wptr)		list_entry;
74 };
75 
76 struct ftl_flush {
77 	/* Owner device */
78 	struct spdk_ftl_dev		*dev;
79 
80 	/* Number of batches to wait for */
81 	size_t				num_req;
82 
83 	/* Callback */
84 	struct ftl_cb			cb;
85 
86 	/* Batch bitmap */
87 	struct spdk_bit_array		*bmap;
88 
89 	/* List link */
90 	LIST_ENTRY(ftl_flush)		list_entry;
91 };
92 
93 typedef int (*ftl_next_ppa_fn)(struct ftl_io *, struct ftl_ppa *);
94 static void _ftl_read(void *);
95 static void _ftl_write(void *);
96 
97 static int
98 ftl_rwb_flags_from_io(const struct ftl_io *io)
99 {
100 	int valid_flags = FTL_IO_INTERNAL | FTL_IO_WEAK | FTL_IO_PAD;
101 	return io->flags & valid_flags;
102 }
103 
104 static int
105 ftl_rwb_entry_weak(const struct ftl_rwb_entry *entry)
106 {
107 	return entry->flags & FTL_IO_WEAK;
108 }
109 
110 static void
111 ftl_wptr_free(struct ftl_wptr *wptr)
112 {
113 	if (!wptr) {
114 		return;
115 	}
116 
117 	free(wptr);
118 }
119 
120 static void
121 ftl_remove_wptr(struct ftl_wptr *wptr)
122 {
123 	LIST_REMOVE(wptr, list_entry);
124 	ftl_wptr_free(wptr);
125 }
126 
127 static void
128 ftl_io_cmpl_cb(void *arg, const struct spdk_nvme_cpl *status)
129 {
130 	struct ftl_io *io = arg;
131 
132 	if (spdk_nvme_cpl_is_error(status)) {
133 		ftl_io_process_error(io, status);
134 	}
135 
136 	ftl_trace_completion(io->dev, io, FTL_TRACE_COMPLETION_DISK);
137 
138 	ftl_io_dec_req(io);
139 
140 	if (ftl_io_done(io)) {
141 		ftl_io_complete(io);
142 	}
143 }
144 
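/* Stop writes to the given band: notify the band of the write failure and */
/* remove its write pointer. If the band already has high_prio set, an */
/* earlier failure has already been handled. */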
145 static void
146 ftl_halt_writes(struct spdk_ftl_dev *dev, struct ftl_band *band)
147 {
148 	struct ftl_wptr *wptr = NULL;
149 
150 	LIST_FOREACH(wptr, &dev->wptr_list, list_entry) {
151 		if (wptr->band == band) {
152 			break;
153 		}
154 	}
155 
156 	/* If the band already has the high_prio flag set, other writes must */
157 	/* have failed earlier, so it's already taken care of. */
158 	if (band->high_prio) {
159 		assert(wptr == NULL);
160 		return;
161 	}
162 
163 	ftl_band_write_failed(band);
164 	ftl_remove_wptr(wptr);
165 }
166 
167 static struct ftl_wptr *
168 ftl_wptr_from_band(struct ftl_band *band)
169 {
170 	struct spdk_ftl_dev *dev = band->dev;
171 	struct ftl_wptr *wptr = NULL;
172 
173 	LIST_FOREACH(wptr, &dev->wptr_list, list_entry) {
174 		if (wptr->band == band) {
175 			return wptr;
176 		}
177 	}
178 
179 	return NULL;
180 }
181 
182 static void
183 ftl_md_write_fail(struct ftl_io *io, int status)
184 {
185 	struct ftl_band *band = io->band;
186 	struct ftl_wptr *wptr;
187 	char buf[128];
188 
189 	wptr = ftl_wptr_from_band(band);
190 
191 	SPDK_ERRLOG("Metadata write failed @ppa: %s, status: %d\n",
192 		    ftl_ppa2str(wptr->ppa, buf, sizeof(buf)), status);
193 
194 	ftl_halt_writes(io->dev, band);
195 }
196 
197 static void
198 ftl_md_write_cb(void *arg, int status)
199 {
200 	struct ftl_io *io = arg;
201 	struct ftl_wptr *wptr;
202 
203 	wptr = ftl_wptr_from_band(io->band);
204 
205 	if (status) {
206 		ftl_md_write_fail(io, status);
207 		return;
208 	}
209 
210 	ftl_band_set_next_state(io->band);
211 	if (io->band->state == FTL_BAND_STATE_CLOSED) {
212 		ftl_remove_wptr(wptr);
213 	}
214 }
215 
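/* Calculate the PPA and number of lbks for the next read of a PPA-mode IO. */
/* Metadata reads are clamped to xfer_size boundaries to match the way the */
/* metadata was written. */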
216 static int
217 ftl_ppa_read_next_ppa(struct ftl_io *io, struct ftl_ppa *ppa)
218 {
219 	struct spdk_ftl_dev *dev = io->dev;
220 	size_t lbk_cnt, max_lbks;
221 
222 	assert(ftl_io_mode_ppa(io));
223 	assert(io->iov_pos < io->iov_cnt);
224 
225 	if (io->pos == 0) {
226 		*ppa = io->ppa;
227 	} else {
228 		*ppa = ftl_band_next_xfer_ppa(io->band, io->ppa, io->pos);
229 	}
230 
231 	assert(!ftl_ppa_invalid(*ppa));
232 
233 	/* Metadata has to be read in the way it's written (jumping across */
234 	/* the chunks in xfer_size increments) */
235 	if (io->flags & FTL_IO_MD) {
236 		max_lbks = dev->xfer_size - (ppa->lbk % dev->xfer_size);
237 		lbk_cnt = spdk_min(ftl_io_iovec_len_left(io), max_lbks);
238 		assert(ppa->lbk / dev->xfer_size == (ppa->lbk + lbk_cnt - 1) / dev->xfer_size);
239 	} else {
240 		lbk_cnt = ftl_io_iovec_len_left(io);
241 	}
242 
243 	return lbk_cnt;
244 }
245 
246 static int
247 ftl_wptr_close_band(struct ftl_wptr *wptr)
248 {
249 	struct ftl_band *band = wptr->band;
250 
251 	ftl_band_set_state(band, FTL_BAND_STATE_CLOSING);
252 	band->tail_md_ppa = wptr->ppa;
253 
254 	return ftl_band_write_tail_md(band, band->md.dma_buf, ftl_md_write_cb);
255 }
256 
257 static int
258 ftl_wptr_open_band(struct ftl_wptr *wptr)
259 {
260 	struct ftl_band *band = wptr->band;
261 
262 	assert(ftl_band_chunk_is_first(band, wptr->chunk));
263 	assert(band->md.num_vld == 0);
264 
265 	ftl_band_clear_md(band);
266 
267 	assert(band->state == FTL_BAND_STATE_PREP);
268 	ftl_band_set_state(band, FTL_BAND_STATE_OPENING);
269 
270 	return ftl_band_write_head_md(band, band->md.dma_buf, ftl_md_write_cb);
271 }
272 
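/* Erase the chunks spanned by the IO by issuing a vector reset for each one, */
/* starting from io->ppa and walking the band's chunks. */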
273 static int
274 ftl_submit_erase(struct ftl_io *io)
275 {
276 	struct spdk_ftl_dev *dev = io->dev;
277 	struct ftl_band *band = io->band;
278 	struct ftl_ppa ppa = io->ppa;
279 	struct ftl_chunk *chunk;
280 	uint64_t ppa_packed;
281 	int rc = 0;
282 	size_t i;
283 
284 	for (i = 0; i < io->lbk_cnt; ++i) {
285 		if (i != 0) {
286 			chunk = ftl_band_next_chunk(band, ftl_band_chunk_from_ppa(band, ppa));
287 			assert(chunk->state == FTL_CHUNK_STATE_CLOSED ||
288 			       chunk->state == FTL_CHUNK_STATE_VACANT);
289 			ppa = chunk->start_ppa;
290 		}
291 
292 		assert(ppa.lbk == 0);
293 		ppa_packed = ftl_ppa_addr_pack(dev, ppa);
294 
295 		ftl_trace_submission(dev, io, ppa, 1);
296 		rc = spdk_nvme_ocssd_ns_cmd_vector_reset(dev->ns, ftl_get_write_qpair(dev),
297 				&ppa_packed, 1, NULL, ftl_io_cmpl_cb, io);
298 		if (rc) {
299 			ftl_io_fail(io, rc);
300 			SPDK_ERRLOG("Vector reset failed with status: %d\n", rc);
301 			break;
302 		}
303 
304 		ftl_io_inc_req(io);
305 		ftl_io_advance(io, 1);
306 	}
307 
308 	if (ftl_io_done(io)) {
309 		ftl_io_complete(io);
310 	}
311 
312 	return rc;
313 }
314 
315 static void
316 _ftl_io_erase(void *ctx)
317 {
318 	ftl_io_erase((struct ftl_io *)ctx);
319 }
320 
321 static bool
322 ftl_check_core_thread(const struct spdk_ftl_dev *dev)
323 {
324 	return dev->core_thread.thread == spdk_get_thread();
325 }
326 
327 static bool
328 ftl_check_read_thread(const struct spdk_ftl_dev *dev)
329 {
330 	return dev->read_thread.thread == spdk_get_thread();
331 }
332 
333 int
334 ftl_io_erase(struct ftl_io *io)
335 {
336 	struct spdk_ftl_dev *dev = io->dev;
337 
338 	if (ftl_check_core_thread(dev)) {
339 		return ftl_submit_erase(io);
340 	}
341 
342 	spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_io_erase, io);
343 	return 0;
344 }
345 
346 static struct ftl_band *
347 ftl_next_write_band(struct spdk_ftl_dev *dev)
348 {
349 	struct ftl_band *band;
350 
351 	band = LIST_FIRST(&dev->free_bands);
352 	if (!band) {
353 		return NULL;
354 	}
355 	assert(band->state == FTL_BAND_STATE_FREE);
356 
357 	if (ftl_band_erase(band)) {
358 		/* TODO: handle erase failure */
359 		return NULL;
360 	}
361 
362 	return band;
363 }
364 
365 static struct ftl_band *
366 ftl_next_wptr_band(struct spdk_ftl_dev *dev)
367 {
368 	struct ftl_band *band;
369 
370 	if (!dev->next_band) {
371 		band = ftl_next_write_band(dev);
372 	} else {
373 		assert(dev->next_band->state == FTL_BAND_STATE_PREP);
374 		band = dev->next_band;
375 		dev->next_band = NULL;
376 	}
377 
378 	return band;
379 }
380 
381 static struct ftl_wptr *
382 ftl_wptr_init(struct ftl_band *band)
383 {
384 	struct spdk_ftl_dev *dev = band->dev;
385 	struct ftl_wptr *wptr;
386 
387 	wptr = calloc(1, sizeof(*wptr));
388 	if (!wptr) {
389 		return NULL;
390 	}
391 
392 	wptr->dev = dev;
393 	wptr->band = band;
394 	wptr->chunk = CIRCLEQ_FIRST(&band->chunks);
395 	wptr->ppa = wptr->chunk->start_ppa;
396 
397 	return wptr;
398 }
399 
400 static int
401 ftl_add_wptr(struct spdk_ftl_dev *dev)
402 {
403 	struct ftl_band *band;
404 	struct ftl_wptr *wptr;
405 
406 	band = ftl_next_wptr_band(dev);
407 	if (!band) {
408 		return -1;
409 	}
410 
411 	wptr = ftl_wptr_init(band);
412 	if (!wptr) {
413 		return -1;
414 	}
415 
416 	if (ftl_band_write_prep(band)) {
417 		ftl_wptr_free(wptr);
418 		return -1;
419 	}
420 
421 	LIST_INSERT_HEAD(&dev->wptr_list, wptr, list_entry);
422 
423 	SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: band %u\n", band->id);
424 	ftl_trace_write_band(dev, band);
425 	return 0;
426 }
427 
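/* Advance the write pointer by xfer_size lbks: mark the current chunk busy, */
/* move to the next operational chunk and, once the band threshold is */
/* crossed, prepare the next band for writing. */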
428 static void
429 ftl_wptr_advance(struct ftl_wptr *wptr, size_t xfer_size)
430 {
431 	struct ftl_band *band = wptr->band;
432 	struct spdk_ftl_dev *dev = wptr->dev;
433 	struct spdk_ftl_conf *conf = &dev->conf;
434 	size_t next_thld;
435 
436 	wptr->offset += xfer_size;
437 	next_thld = (ftl_band_num_usable_lbks(band) * conf->band_thld) / 100;
438 
439 	if (ftl_band_full(band, wptr->offset)) {
440 		ftl_band_set_state(band, FTL_BAND_STATE_FULL);
441 	}
442 
443 	wptr->chunk->busy = true;
444 	wptr->ppa = ftl_band_next_xfer_ppa(band, wptr->ppa, xfer_size);
445 	wptr->chunk = ftl_band_next_operational_chunk(band, wptr->chunk);
446 
447 	assert(!ftl_ppa_invalid(wptr->ppa));
448 
449 	SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: grp:%d, pu:%d chunk:%d, lbk:%u\n",
450 		      wptr->ppa.grp, wptr->ppa.pu, wptr->ppa.chk, wptr->ppa.lbk);
451 
452 	if (wptr->offset >= next_thld && !dev->next_band) {
453 		dev->next_band = ftl_next_write_band(dev);
454 	}
455 }
456 
457 static int
458 ftl_wptr_ready(struct ftl_wptr *wptr)
459 {
460 	struct ftl_band *band = wptr->band;
461 
462 	/* TODO: add handling of empty bands */
463 
464 	if (spdk_unlikely(!ftl_chunk_is_writable(wptr->chunk))) {
465 		/* Erasing the band may fail after it has been assigned to the wptr. */
466 		if (spdk_unlikely(wptr->chunk->state == FTL_CHUNK_STATE_BAD)) {
467 			ftl_wptr_advance(wptr, wptr->dev->xfer_size);
468 		}
469 		return 0;
470 	}
471 
472 	/* If we're in the process of writing metadata, wait until it is */
473 	/* completed. */
474 	/* TODO: we should probably change bands once we're writing tail md */
475 	if (ftl_band_state_changing(band)) {
476 		return 0;
477 	}
478 
479 	if (band->state == FTL_BAND_STATE_FULL) {
480 		if (ftl_wptr_close_band(wptr)) {
481 			/* TODO: need recovery here */
482 			assert(false);
483 		}
484 		return 0;
485 	}
486 
487 	if (band->state != FTL_BAND_STATE_OPEN) {
488 		if (ftl_wptr_open_band(wptr)) {
489 			/* TODO: need recovery here */
490 			assert(false);
491 		}
492 		return 0;
493 	}
494 
495 	return 1;
496 }
497 
498 static const struct spdk_ftl_limit *
499 ftl_get_limit(const struct spdk_ftl_dev *dev, int type)
500 {
501 	assert(type < SPDK_FTL_LIMIT_MAX);
502 	return &dev->conf.defrag.limits[type];
503 }
504 
505 static bool
506 ftl_cache_lba_valid(struct spdk_ftl_dev *dev, struct ftl_rwb_entry *entry)
507 {
508 	struct ftl_ppa ppa;
509 
510 	/* If the LBA is invalid, don't bother checking the md and l2p */
511 	if (spdk_unlikely(entry->lba == FTL_LBA_INVALID)) {
512 		return false;
513 	}
514 
515 	ppa = ftl_l2p_get(dev, entry->lba);
516 	if (!(ftl_ppa_cached(ppa) && ppa.offset == entry->pos)) {
517 		return false;
518 	}
519 
520 	return true;
521 }
522 
523 static void
524 ftl_evict_cache_entry(struct spdk_ftl_dev *dev, struct ftl_rwb_entry *entry)
525 {
526 	pthread_spin_lock(&entry->lock);
527 
528 	if (!ftl_rwb_entry_valid(entry)) {
529 		goto unlock;
530 	}
531 
532 	/* If the l2p wasn't updated and still points at the entry, fill it with the */
533 	/* on-disk PPA and clear the cache status bit. Otherwise, skip the l2p update */
534 	/* and just clear the cache status. */
535 	if (!ftl_cache_lba_valid(dev, entry)) {
536 		goto clear;
537 	}
538 
539 	ftl_l2p_set(dev, entry->lba, entry->ppa);
540 clear:
541 	ftl_rwb_entry_invalidate(entry);
542 unlock:
543 	pthread_spin_unlock(&entry->lock);
544 }
545 
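/* Acquire a write buffer entry of the type matching the IO flags and evict */
/* any stale L2P mapping the entry may still hold from a previous write. */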
546 static struct ftl_rwb_entry *
547 ftl_acquire_entry(struct spdk_ftl_dev *dev, int flags)
548 {
549 	struct ftl_rwb_entry *entry;
550 
551 	entry = ftl_rwb_acquire(dev->rwb, ftl_rwb_type_from_flags(flags));
552 	if (!entry) {
553 		return NULL;
554 	}
555 
556 	ftl_evict_cache_entry(dev, entry);
557 
558 	entry->flags = flags;
559 	return entry;
560 }
561 
562 static void
563 ftl_rwb_pad(struct spdk_ftl_dev *dev, size_t size)
564 {
565 	struct ftl_rwb_entry *entry;
566 	int flags = FTL_IO_PAD | FTL_IO_INTERNAL;
567 
568 	for (size_t i = 0; i < size; ++i) {
569 		entry = ftl_acquire_entry(dev, flags);
570 		if (!entry) {
571 			break;
572 		}
573 
574 		entry->lba = FTL_LBA_INVALID;
575 		entry->ppa = ftl_to_ppa(FTL_PPA_INVALID);
576 		memset(entry->data, 0, FTL_BLOCK_SIZE);
577 		ftl_rwb_push(entry);
578 	}
579 }
580 
581 static void
582 ftl_remove_free_bands(struct spdk_ftl_dev *dev)
583 {
584 	while (!LIST_EMPTY(&dev->free_bands)) {
585 		LIST_REMOVE(LIST_FIRST(&dev->free_bands), list_entry);
586 	}
587 
588 	dev->next_band = NULL;
589 }
590 
591 static void
592 ftl_process_shutdown(struct spdk_ftl_dev *dev)
593 {
594 	size_t size = ftl_rwb_num_acquired(dev->rwb, FTL_RWB_TYPE_INTERNAL) +
595 		      ftl_rwb_num_acquired(dev->rwb, FTL_RWB_TYPE_USER);
596 
597 	if (size >= dev->xfer_size) {
598 		return;
599 	}
600 
601 	/* If we reach this point, we need to remove the free bands */
602 	/* and pad the current wptr band to the end */
603 	ftl_remove_free_bands(dev);
604 
605 	/* Pad write buffer until band is full */
606 	ftl_rwb_pad(dev, dev->xfer_size - size);
607 }
608 
609 static int
610 ftl_shutdown_complete(struct spdk_ftl_dev *dev)
611 {
612 	return !__atomic_load_n(&dev->num_inflight, __ATOMIC_SEQ_CST) &&
613 	       LIST_EMPTY(&dev->wptr_list);
614 }
615 
616 void
617 ftl_apply_limits(struct spdk_ftl_dev *dev)
618 {
619 	const struct spdk_ftl_limit *limit;
620 	struct ftl_stats *stats = &dev->stats;
621 	size_t rwb_limit[FTL_RWB_TYPE_MAX];
622 	int i;
623 
624 	ftl_rwb_get_limits(dev->rwb, rwb_limit);
625 
626 	/* Clear existing limit */
627 	dev->limit = SPDK_FTL_LIMIT_MAX;
628 
629 	for (i = SPDK_FTL_LIMIT_CRIT; i < SPDK_FTL_LIMIT_MAX; ++i) {
630 		limit = ftl_get_limit(dev, i);
631 
632 		if (dev->num_free <= limit->thld) {
633 			rwb_limit[FTL_RWB_TYPE_USER] =
634 				(limit->limit * ftl_rwb_entry_cnt(dev->rwb)) / 100;
635 			stats->limits[i]++;
636 			dev->limit = i;
637 			goto apply;
638 		}
639 	}
640 
641 	/* Clear the limits, since we don't need to apply them anymore */
642 	rwb_limit[FTL_RWB_TYPE_USER] = ftl_rwb_entry_cnt(dev->rwb);
643 apply:
644 	ftl_trace_limits(dev, rwb_limit, dev->num_free);
645 	ftl_rwb_set_limits(dev->rwb, rwb_limit);
646 }
647 
648 static int
649 ftl_invalidate_addr_unlocked(struct spdk_ftl_dev *dev, struct ftl_ppa ppa)
650 {
651 	struct ftl_band *band = ftl_band_from_ppa(dev, ppa);
652 	struct ftl_md *md = &band->md;
653 	uint64_t offset;
654 
655 	offset = ftl_band_lbkoff_from_ppa(band, ppa);
656 
657 	/* The bit might already be cleared if two writes are scheduled to the */
658 	/* same LBA at the same time */
659 	if (spdk_bit_array_get(md->vld_map, offset)) {
660 		assert(md->num_vld > 0);
661 		spdk_bit_array_clear(md->vld_map, offset);
662 		md->num_vld--;
663 		return 1;
664 	}
665 
666 	return 0;
667 }
668 
669 int
670 ftl_invalidate_addr(struct spdk_ftl_dev *dev, struct ftl_ppa ppa)
671 {
672 	struct ftl_band *band;
673 	int rc;
674 
675 	assert(!ftl_ppa_cached(ppa));
676 	band = ftl_band_from_ppa(dev, ppa);
677 
678 	pthread_spin_lock(&band->md.lock);
679 	rc = ftl_invalidate_addr_unlocked(dev, ppa);
680 	pthread_spin_unlock(&band->md.lock);
681 
682 	return rc;
683 }
684 
685 static int
686 ftl_read_retry(int rc)
687 {
688 	return rc == -EAGAIN;
689 }
690 
691 static int
692 ftl_read_canceled(int rc)
693 {
694 	return rc == -EFAULT || rc == 0;
695 }
696 
697 static void
698 ftl_add_to_retry_queue(struct ftl_io *io)
699 {
700 	if (!(io->flags & FTL_IO_RETRY)) {
701 		io->flags |= FTL_IO_RETRY;
702 		TAILQ_INSERT_TAIL(&io->dev->retry_queue, io, retry_entry);
703 	}
704 }
705 
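/* Submit reads for the IO's remaining lbks, resolving physical addresses via */
/* the next_ppa callback. Lbks served from the cache or mapped to an invalid */
/* PPA are skipped, -EAGAIN results are retried and -ENOMEM from the read */
/* submission puts the IO on the retry queue. */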
706 static int
707 ftl_submit_read(struct ftl_io *io, ftl_next_ppa_fn next_ppa)
708 {
709 	struct spdk_ftl_dev *dev = io->dev;
710 	struct ftl_ppa ppa;
711 	int rc = 0, lbk_cnt;
712 
713 	while (io->pos < io->lbk_cnt) {
714 		/* We might hit the cache here; if so, skip the read */
715 		lbk_cnt = rc = next_ppa(io, &ppa);
716 
717 		/* We might need to retry the read from scratch (e.g. */
718 		/* because a write was under way and completed before */
719 		/* we could read it from the rwb) */
720 		if (ftl_read_retry(rc)) {
721 			continue;
722 		}
723 
724 		/* We don't have to schedule the read, as it was read from cache */
725 		if (ftl_read_canceled(rc)) {
726 			ftl_io_advance(io, 1);
727 			ftl_trace_completion(io->dev, io, rc ? FTL_TRACE_COMPLETION_INVALID :
728 					     FTL_TRACE_COMPLETION_CACHE);
729 			rc = 0;
730 			continue;
731 		}
732 
733 		assert(lbk_cnt > 0);
734 
735 		ftl_trace_submission(dev, io, ppa, lbk_cnt);
736 		rc = spdk_nvme_ns_cmd_read(dev->ns, ftl_get_read_qpair(dev),
737 					   ftl_io_iovec_addr(io),
738 					   ftl_ppa_addr_pack(io->dev, ppa), lbk_cnt,
739 					   ftl_io_cmpl_cb, io, 0);
740 		if (rc == -ENOMEM) {
741 			ftl_add_to_retry_queue(io);
742 			break;
743 		} else if (rc) {
744 			ftl_io_fail(io, rc);
745 			break;
746 		}
747 
748 		ftl_io_inc_req(io);
749 		ftl_io_advance(io, lbk_cnt);
750 	}
751 
752 	/* If we didn't have to read anything from the device, */
753 	/* complete the request right away */
754 	if (ftl_io_done(io)) {
755 		ftl_io_complete(io);
756 	}
757 
758 	return rc;
759 }
760 
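/* Copy a single lbk from the write buffer cache. Returns -1 if the L2P no */
/* longer points at the expected cached PPA (the mapping changed in the */
/* meantime). */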
761 static int
762 ftl_ppa_cache_read(struct ftl_io *io, uint64_t lba,
763 		   struct ftl_ppa ppa, void *buf)
764 {
765 	struct ftl_rwb *rwb = io->dev->rwb;
766 	struct ftl_rwb_entry *entry;
767 	struct ftl_ppa nppa;
768 	int rc = 0;
769 
770 	entry = ftl_rwb_entry_from_offset(rwb, ppa.offset);
771 	pthread_spin_lock(&entry->lock);
772 
773 	nppa = ftl_l2p_get(io->dev, lba);
774 	if (ppa.ppa != nppa.ppa) {
775 		rc = -1;
776 		goto out;
777 	}
778 
779 	memcpy(buf, entry->data, FTL_BLOCK_SIZE);
780 out:
781 	pthread_spin_unlock(&entry->lock);
782 	return rc;
783 }
784 
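/* Resolve the PPA of the IO's current LBA and count how many of the */
/* following LBAs are physically contiguous so they can be read with a single */
/* command. Returns 0 for data served from the cache, -EFAULT for an invalid */
/* PPA and -EAGAIN when the cached entry changed while it was being read. */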
785 static int
786 ftl_lba_read_next_ppa(struct ftl_io *io, struct ftl_ppa *ppa)
787 {
788 	struct spdk_ftl_dev *dev = io->dev;
789 	struct ftl_ppa next_ppa;
790 	size_t i;
791 
792 	*ppa = ftl_l2p_get(dev, ftl_io_current_lba(io));
793 
794 	SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Read ppa:%lx, lba:%lu\n",
795 		      ppa->ppa, ftl_io_current_lba(io));
796 
797 	/* If the PPA is invalid, skip it (the buffer should already be zeroed) */
798 	if (ftl_ppa_invalid(*ppa)) {
799 		return -EFAULT;
800 	}
801 
802 	if (ftl_ppa_cached(*ppa)) {
803 		if (!ftl_ppa_cache_read(io, ftl_io_current_lba(io), *ppa, ftl_io_iovec_addr(io))) {
804 			return 0;
805 		}
806 
807 		/* If the state changed, we have to re-read the l2p */
808 		return -EAGAIN;
809 	}
810 
811 	for (i = 1; i < ftl_io_iovec_len_left(io); ++i) {
812 		next_ppa = ftl_l2p_get(dev, ftl_io_get_lba(io, io->pos + i));
813 
814 		if (ftl_ppa_invalid(next_ppa) || ftl_ppa_cached(next_ppa)) {
815 			break;
816 		}
817 
818 		if (ftl_ppa_addr_pack(dev, *ppa) + i != ftl_ppa_addr_pack(dev, next_ppa)) {
819 			break;
820 		}
821 	}
822 
823 	return i;
824 }
825 
826 static void
827 ftl_complete_flush(struct ftl_flush *flush)
828 {
829 	assert(flush->num_req == 0);
830 	LIST_REMOVE(flush, list_entry);
831 
832 	flush->cb.fn(flush->cb.ctx, 0);
833 
834 	spdk_bit_array_free(&flush->bmap);
835 	free(flush);
836 }
837 
838 static void
839 ftl_process_flush(struct spdk_ftl_dev *dev, struct ftl_rwb_batch *batch)
840 {
841 	struct ftl_flush *flush, *tflush;
842 	size_t offset;
843 
844 	LIST_FOREACH_SAFE(flush, &dev->flush_list, list_entry, tflush) {
845 		offset = ftl_rwb_batch_get_offset(batch);
846 
847 		if (spdk_bit_array_get(flush->bmap, offset)) {
848 			spdk_bit_array_clear(flush->bmap, offset);
849 			if (!(--flush->num_req)) {
850 				ftl_complete_flush(flush);
851 			}
852 		}
853 	}
854 }
855 
856 static void
857 ftl_write_fail(struct ftl_io *io, int status)
858 {
859 	struct ftl_rwb_batch *batch = io->rwb_batch;
860 	struct spdk_ftl_dev *dev = io->dev;
861 	struct ftl_rwb_entry *entry;
862 	struct ftl_band *band;
863 	char buf[128];
864 
865 	entry = ftl_rwb_batch_first_entry(batch);
866 
867 	band = ftl_band_from_ppa(io->dev, entry->ppa);
868 	SPDK_ERRLOG("Write failed @ppa: %s, status: %d\n",
869 		    ftl_ppa2str(entry->ppa, buf, sizeof(buf)), status);
870 
871 	/* Close the band and halt the wptr and defrag */
872 	ftl_halt_writes(dev, band);
873 
874 	ftl_rwb_foreach(entry, batch) {
875 		/* Invalidate the metadata set by ftl_wptr_process_writes() */
876 		ftl_invalidate_addr(dev, entry->ppa);
877 	}
878 
879 	/* Reset the batch back to the RWB to resend it later */
880 	ftl_rwb_batch_revert(batch);
881 }
882 
883 static void
884 ftl_write_cb(void *arg, int status)
885 {
886 	struct ftl_io *io = arg;
887 	struct spdk_ftl_dev *dev = io->dev;
888 	struct ftl_rwb_batch *batch = io->rwb_batch;
889 	struct ftl_rwb_entry *entry;
890 
891 	if (status) {
892 		ftl_write_fail(io, status);
893 		return;
894 	}
895 
896 	assert(io->lbk_cnt == dev->xfer_size);
897 	ftl_rwb_foreach(entry, batch) {
898 		if (!(io->flags & FTL_IO_MD) && !(entry->flags & FTL_IO_PAD)) {
899 			/* Verify that the LBA is set for user lbks */
900 			assert(entry->lba != FTL_LBA_INVALID);
901 		}
902 
903 		SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Write ppa:%lu, lba:%lu\n",
904 			      entry->ppa.ppa, entry->lba);
905 	}
906 
907 	ftl_process_flush(dev, batch);
908 	ftl_rwb_batch_release(batch);
909 }
910 
911 static void
912 ftl_update_rwb_stats(struct spdk_ftl_dev *dev, const struct ftl_rwb_entry *entry)
913 {
914 	if (!ftl_rwb_entry_internal(entry)) {
915 		dev->stats.write_user++;
916 	}
917 	dev->stats.write_total++;
918 }
919 
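/* Point the L2P entry for the LBA at the new cached PPA, invalidating the */
/* previous location. Handles the cases where there is no previous mapping, */
/* where it points into the write buffer and where it points at a band, the */
/* latter two under the appropriate locks so weak writes stay consistent. */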
920 static void
921 ftl_update_l2p(struct spdk_ftl_dev *dev, const struct ftl_rwb_entry *entry,
922 	       struct ftl_ppa ppa)
923 {
924 	struct ftl_ppa prev_ppa;
925 	struct ftl_rwb_entry *prev;
926 	struct ftl_band *band;
927 	int valid;
928 
929 	prev_ppa = ftl_l2p_get(dev, entry->lba);
930 	if (ftl_ppa_invalid(prev_ppa)) {
931 		ftl_l2p_set(dev, entry->lba, ppa);
932 		return;
933 	}
934 
935 	/* If the L2P's PPA is different from what we expected, we don't need to */
936 	/* do anything (someone's already overwritten our data). */
937 	if (ftl_rwb_entry_weak(entry) && !ftl_ppa_cmp(prev_ppa, entry->ppa)) {
938 		return;
939 	}
940 
941 	if (ftl_ppa_cached(prev_ppa)) {
942 		assert(!ftl_rwb_entry_weak(entry));
943 		prev = ftl_rwb_entry_from_offset(dev->rwb, prev_ppa.offset);
944 		pthread_spin_lock(&prev->lock);
945 
946 		/* Re-read the L2P under the lock to protect against updates */
947 		/* to this LBA from other threads */
948 		prev_ppa = ftl_l2p_get(dev, entry->lba);
949 
950 		/* If the entry is no longer in cache, another write has been */
951 		/* scheduled in the meantime, so we have to invalidate its LBA */
952 		if (!ftl_ppa_cached(prev_ppa)) {
953 			ftl_invalidate_addr(dev, prev_ppa);
954 		}
955 
956 		/* If previous entry is part of cache, remove and invalidate it */
957 		if (ftl_rwb_entry_valid(prev)) {
958 			ftl_invalidate_addr(dev, prev->ppa);
959 			ftl_rwb_entry_invalidate(prev);
960 		}
961 
962 		ftl_l2p_set(dev, entry->lba, ppa);
963 		pthread_spin_unlock(&prev->lock);
964 		return;
965 	}
966 
967 	/* Lock the band containing the previous PPA. This ensures atomic changes to */
968 	/* the L2P as well as the metadata. The valid bits in the metadata are used to */
969 	/* check the validity of weak writes. */
970 	band = ftl_band_from_ppa(dev, prev_ppa);
971 	pthread_spin_lock(&band->md.lock);
972 
973 	valid = ftl_invalidate_addr_unlocked(dev, prev_ppa);
974 
975 	/* If the address has been invalidated already, we don't want to update */
976 	/* the L2P for weak writes, as it means the write is no longer valid. */
977 	if (!ftl_rwb_entry_weak(entry) || valid) {
978 		ftl_l2p_set(dev, entry->lba, ppa);
979 	}
980 
981 	pthread_spin_unlock(&band->md.lock);
982 }
983 
984 static struct ftl_io *
985 ftl_io_init_child_write(struct ftl_io *parent, struct ftl_ppa ppa,
986 			void *data, void *md, spdk_ftl_fn cb)
987 {
988 	struct ftl_io *io;
989 	struct spdk_ftl_dev *dev = parent->dev;
990 	struct ftl_io_init_opts opts = {
991 		.dev		= dev,
992 		.io		= NULL,
993 		.parent		= parent,
994 		.rwb_batch	= NULL,
995 		.band		= parent->band,
996 		.size		= sizeof(struct ftl_io),
997 		.flags		= 0,
998 		.type		= FTL_IO_WRITE,
999 		.iov_cnt	= 1,
1000 		.req_size	= dev->xfer_size,
1001 		.fn		= cb,
1002 		.data		= data,
1003 		.md		= md,
1004 	};
1005 
1006 	io = ftl_io_init_internal(&opts);
1007 	if (!io) {
1008 		return NULL;
1009 	}
1010 
1011 	io->ppa = ppa;
1012 
1013 	return io;
1014 }
1015 
1016 static void
1017 ftl_io_child_write_cb(void *ctx, int status)
1018 {
1019 	struct ftl_chunk *chunk;
1020 	struct ftl_io *io = ctx;
1021 
1022 	chunk = ftl_band_chunk_from_ppa(io->band, io->ppa);
1023 	chunk->busy = false;
1024 }
1025 
1026 static int
1027 ftl_submit_child_write(struct ftl_wptr *wptr, struct ftl_io *io, int lbk_cnt)
1028 {
1029 	struct spdk_ftl_dev	*dev = io->dev;
1030 	struct ftl_io		*child;
1031 	struct iovec		*iov = ftl_io_iovec(io);
1032 	int			rc;
1033 
1034 	/* Split the IO into child requests and release the chunk immediately after the child is completed */
1035 	child = ftl_io_init_child_write(io, wptr->ppa, iov[io->iov_pos].iov_base,
1036 					ftl_io_get_md(io), ftl_io_child_write_cb);
1037 	if (!child) {
1038 		return -EAGAIN;
1039 	}
1040 
1041 	rc = spdk_nvme_ns_cmd_write_with_md(dev->ns, ftl_get_write_qpair(dev),
1042 					    ftl_io_iovec_addr(child), child->md,
1043 					    ftl_ppa_addr_pack(dev, wptr->ppa),
1044 					    lbk_cnt, ftl_io_cmpl_cb, child, 0, 0, 0);
1045 	if (rc) {
1046 		ftl_io_fail(child, rc);
1047 		ftl_io_complete(child);
1048 		SPDK_ERRLOG("spdk_nvme_ns_cmd_write_with_md failed with status:%d, ppa:%lu\n",
1049 			    rc, wptr->ppa.ppa);
1050 
1051 		return -EIO;
1052 	}
1053 
1054 	ftl_io_inc_req(child);
1055 	ftl_io_advance(child, lbk_cnt);
1056 
1057 	return 0;
1058 }
1059 
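/* Write the IO at the write pointer's current position as xfer_size-sized */
/* child requests. When the target chunk is busy or a child IO can't be */
/* allocated, the IO is stashed in wptr->current_io and -EAGAIN is returned. */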
1060 static int
1061 ftl_submit_write(struct ftl_wptr *wptr, struct ftl_io *io)
1062 {
1063 	struct spdk_ftl_dev	*dev = io->dev;
1064 	struct iovec		*iov = ftl_io_iovec(io);
1065 	int			rc = 0;
1066 	size_t			lbk_cnt;
1067 
1068 	while (io->iov_pos < io->iov_cnt) {
1069 		lbk_cnt = iov[io->iov_pos].iov_len / PAGE_SIZE;
1070 		assert(iov[io->iov_pos].iov_len > 0);
1071 		assert(lbk_cnt == dev->xfer_size);
1072 
1073 		/* There is no guarantee of the order of completion on the NVMe IO submission queue, */
1074 		/* so wait until the chunk is no longer busy before submitting another write */
1075 		if (wptr->chunk->busy) {
1076 			wptr->current_io = io;
1077 			rc = -EAGAIN;
1078 			break;
1079 		}
1080 
1081 		rc = ftl_submit_child_write(wptr, io, lbk_cnt);
1082 
1083 		if (rc == -EAGAIN) {
1084 			wptr->current_io = io;
1085 			break;
1086 		} else if (rc) {
1087 			ftl_io_fail(io, rc);
1088 			break;
1089 		}
1090 
1091 		ftl_trace_submission(dev, io, wptr->ppa, lbk_cnt);
1092 
1093 		/* Update parent iovec */
1094 		ftl_io_advance(io, lbk_cnt);
1095 
1096 		ftl_wptr_advance(wptr, lbk_cnt);
1097 	}
1098 
1099 	if (ftl_io_done(io)) {
1100 		/* Parent IO will complete after all children are completed */
1101 		ftl_io_complete(io);
1102 	}
1103 
1104 	return rc;
1105 }
1106 
1107 static void
1108 ftl_flush_pad_batch(struct spdk_ftl_dev *dev)
1109 {
1110 	struct ftl_rwb *rwb = dev->rwb;
1111 	size_t size;
1112 
1113 	size = ftl_rwb_num_acquired(rwb, FTL_RWB_TYPE_INTERNAL) +
1114 	       ftl_rwb_num_acquired(rwb, FTL_RWB_TYPE_USER);
1115 
1116 	/* There must be something in the RWB, otherwise the flush */
1117 	/* wouldn't be waiting for anything */
1118 	assert(size > 0);
1119 
1120 	/* Only add padding when there are fewer than xfer_size */
1121 	/* entries in the buffer. Otherwise we just have to wait */
1122 	/* for the entries to become ready. */
1123 	if (size < dev->xfer_size) {
1124 		ftl_rwb_pad(dev, dev->xfer_size - (size % dev->xfer_size));
1125 	}
1126 }
1127 
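/* Pop a batch of entries from the write buffer and submit it at the write */
/* pointer's position, updating the band's metadata for entries whose L2P */
/* still points at the cache. Returns the number of lbks taken off the */
/* buffer (xfer_size), or 0 if nothing was submitted. */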
1128 static int
1129 ftl_wptr_process_writes(struct ftl_wptr *wptr)
1130 {
1131 	struct spdk_ftl_dev	*dev = wptr->dev;
1132 	struct ftl_rwb_batch	*batch;
1133 	struct ftl_rwb_entry	*entry;
1134 	struct ftl_io		*io;
1135 	struct ftl_ppa		ppa, prev_ppa;
1136 
1137 	if (wptr->current_io) {
1138 		if (ftl_submit_write(wptr, wptr->current_io) == -EAGAIN) {
1139 			return 0;
1140 		}
1141 		wptr->current_io = NULL;
1142 	}
1143 
1144 	/* Make sure the band is prepared for writing */
1145 	if (!ftl_wptr_ready(wptr)) {
1146 		return 0;
1147 	}
1148 
1149 	if (dev->halt) {
1150 		ftl_process_shutdown(dev);
1151 	}
1152 
1153 	batch = ftl_rwb_pop(dev->rwb);
1154 	if (!batch) {
1155 		/* If there are queued flush requests, we need to pad the RWB to */
1156 		/* force out the remaining entries */
1157 		if (!LIST_EMPTY(&dev->flush_list)) {
1158 			ftl_flush_pad_batch(dev);
1159 		}
1160 
1161 		return 0;
1162 	}
1163 
1164 	io = ftl_io_rwb_init(dev, wptr->band, batch, ftl_write_cb);
1165 	if (!io) {
1166 		goto error;
1167 	}
1168 
1169 	ppa = wptr->ppa;
1170 	ftl_rwb_foreach(entry, batch) {
1171 		entry->ppa = ppa;
1172 
1173 		if (entry->lba != FTL_LBA_INVALID) {
1174 			pthread_spin_lock(&entry->lock);
1175 			prev_ppa = ftl_l2p_get(dev, entry->lba);
1176 
1177 			/* If the l2p was updated in the meantime, don't update band's metadata */
1178 			if (ftl_ppa_cached(prev_ppa) && prev_ppa.offset == entry->pos) {
1179 				/* Setting the entry's cache bit needs to be done after the metadata */
1180 				/* within the band is updated to make sure that writes invalidating */
1181 				/* the entry clear the metadata as well */
1182 				ftl_band_set_addr(wptr->band, entry->lba, entry->ppa);
1183 				ftl_rwb_entry_set_valid(entry);
1184 			}
1185 			pthread_spin_unlock(&entry->lock);
1186 		}
1187 
1188 		ftl_trace_rwb_pop(dev, entry);
1189 		ftl_update_rwb_stats(dev, entry);
1190 
1191 		ppa = ftl_band_next_ppa(wptr->band, ppa, 1);
1192 	}
1193 
1194 	SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Write ppa:%lx, %lx\n", wptr->ppa.ppa,
1195 		      ftl_ppa_addr_pack(dev, wptr->ppa));
1196 
1197 	if (ftl_submit_write(wptr, io)) {
1198 		/* TODO: we need some recovery here */
1199 		assert(0 && "Write submit failed");
1200 		if (ftl_io_done(io)) {
1201 			ftl_io_free(io);
1202 		}
1203 	}
1204 
1205 	return dev->xfer_size;
1206 error:
1207 	ftl_rwb_batch_revert(batch);
1208 	return 0;
1209 }
1210 
1211 static int
1212 ftl_process_writes(struct spdk_ftl_dev *dev)
1213 {
1214 	struct ftl_wptr *wptr, *twptr;
1215 	size_t num_active = 0;
1216 	enum ftl_band_state state;
1217 
1218 	LIST_FOREACH_SAFE(wptr, &dev->wptr_list, list_entry, twptr) {
1219 		ftl_wptr_process_writes(wptr);
1220 		state = wptr->band->state;
1221 
1222 		if (state != FTL_BAND_STATE_FULL &&
1223 		    state != FTL_BAND_STATE_CLOSING &&
1224 		    state != FTL_BAND_STATE_CLOSED) {
1225 			num_active++;
1226 		}
1227 	}
1228 
1229 	if (num_active < 1) {
1230 		ftl_add_wptr(dev);
1231 	}
1232 
1233 	return 0;
1234 }
1235 
1236 static void
1237 ftl_rwb_entry_fill(struct ftl_rwb_entry *entry, struct ftl_io *io)
1238 {
1239 	struct ftl_band *band;
1240 
1241 	memcpy(entry->data, ftl_io_iovec_addr(io), FTL_BLOCK_SIZE);
1242 
1243 	if (ftl_rwb_entry_weak(entry)) {
1244 		band = ftl_band_from_ppa(io->dev, io->ppa);
1245 		entry->ppa = ftl_band_next_ppa(band, io->ppa, io->pos);
1246 	}
1247 
1248 	entry->trace = io->trace;
1249 
1250 	if (entry->md) {
1251 		memcpy(entry->md, &entry->lba, sizeof(entry->lba));
1252 	}
1253 }
1254 
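/* Copy the IO's data into write buffer entries one lbk at a time and update */
/* the L2P to point at the cache. Returns -EAGAIN when no entry is available. */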
1255 static int
1256 ftl_rwb_fill(struct ftl_io *io)
1257 {
1258 	struct spdk_ftl_dev *dev = io->dev;
1259 	struct ftl_rwb_entry *entry;
1260 	struct ftl_ppa ppa = { .cached = 1 };
1261 	int flags = ftl_rwb_flags_from_io(io);
1262 	uint64_t lba;
1263 
1264 	while (io->pos < io->lbk_cnt) {
1265 		lba = ftl_io_current_lba(io);
1266 		if (lba == FTL_LBA_INVALID) {
1267 			ftl_io_advance(io, 1);
1268 			continue;
1269 		}
1270 
1271 		entry = ftl_acquire_entry(dev, flags);
1272 		if (!entry) {
1273 			return -EAGAIN;
1274 		}
1275 
1276 		entry->lba = lba;
1277 		ftl_rwb_entry_fill(entry, io);
1278 
1279 		ppa.offset = entry->pos;
1280 
1281 		ftl_trace_rwb_fill(dev, io);
1282 		ftl_io_advance(io, 1);
1283 		ftl_update_l2p(dev, entry, ppa);
1284 
1285 		/* Needs to be done after the L2P is updated to avoid a race with the */
1286 		/* write completion callback in case the write is processed faster than */
1287 		/* the L2P is set in ftl_update_l2p(). */
1288 		ftl_rwb_push(entry);
1289 	}
1290 
1291 	ftl_io_complete(io);
1292 	return 0;
1293 }
1294 
1295 static bool
1296 ftl_dev_needs_defrag(struct spdk_ftl_dev *dev)
1297 {
1298 	const struct spdk_ftl_limit *limit = ftl_get_limit(dev, SPDK_FTL_LIMIT_START);
1299 
1300 	if (ftl_reloc_is_halted(dev->reloc)) {
1301 		return false;
1302 	}
1303 
1304 	if (dev->df_band) {
1305 		return false;
1306 	}
1307 
1308 	if (dev->num_free <= limit->thld) {
1309 		return true;
1310 	}
1311 
1312 	return false;
1313 }
1314 
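/* Calculate the band's defrag merit: the ratio of invalid to valid lbks */
/* weighted by the band's age. A higher merit makes the band a better defrag */
/* candidate. */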
1315 static double
1316 ftl_band_calc_merit(struct ftl_band *band, size_t *threshold_valid)
1317 {
1318 	size_t usable, valid, invalid;
1319 	double vld_ratio;
1320 
1321 	/* If the band doesn't have any usable lbks, it's of no use */
1322 	usable = ftl_band_num_usable_lbks(band);
1323 	if (usable == 0) {
1324 		return 0.0;
1325 	}
1326 
1327 	valid = threshold_valid ? (usable - *threshold_valid) : band->md.num_vld;
1328 	invalid = usable - valid;
1329 
1330 	/* Add one to avoid division by 0 */
1331 	vld_ratio = (double)invalid / (double)(valid + 1);
1332 	return vld_ratio * ftl_band_age(band);
1333 }
1334 
1335 static bool
1336 ftl_band_needs_defrag(struct ftl_band *band, struct spdk_ftl_dev *dev)
1337 {
1338 	struct spdk_ftl_conf *conf = &dev->conf;
1339 	size_t thld_vld;
1340 
1341 	/* If we're in dire need of free bands, every band is worth defragging */
1342 	if (ftl_current_limit(dev) == SPDK_FTL_LIMIT_CRIT) {
1343 		return true;
1344 	}
1345 
1346 	thld_vld = (ftl_band_num_usable_lbks(band) * conf->defrag.invalid_thld) / 100;
1347 
1348 	return band->merit > ftl_band_calc_merit(band, &thld_vld);
1349 }
1350 
1351 static struct ftl_band *
1352 ftl_select_defrag_band(struct spdk_ftl_dev *dev)
1353 {
1354 	struct ftl_band *band, *mband = NULL;
1355 	double merit = 0;
1356 
1357 	LIST_FOREACH(band, &dev->shut_bands, list_entry) {
1358 		assert(band->state == FTL_BAND_STATE_CLOSED);
1359 		band->merit = ftl_band_calc_merit(band, NULL);
1360 		if (band->merit > merit) {
1361 			merit = band->merit;
1362 			mband = band;
1363 		}
1364 	}
1365 
1366 	if (mband && !ftl_band_needs_defrag(mband, dev)) {
1367 		mband = NULL;
1368 	}
1369 
1370 	return mband;
1371 }
1372 
1373 static void
1374 ftl_process_relocs(struct spdk_ftl_dev *dev)
1375 {
1376 	struct ftl_band *band;
1377 
1378 	if (ftl_dev_needs_defrag(dev)) {
1379 		band = dev->df_band = ftl_select_defrag_band(dev);
1380 
1381 		if (band) {
1382 			ftl_reloc_add(dev->reloc, band, 0, ftl_num_band_lbks(dev), 0);
1383 			ftl_trace_defrag_band(dev, band);
1384 		}
1385 	}
1386 
1387 	ftl_reloc(dev->reloc);
1388 }
1389 
1390 int
1391 ftl_current_limit(const struct spdk_ftl_dev *dev)
1392 {
1393 	return dev->limit;
1394 }
1395 
1396 void
1397 spdk_ftl_dev_get_attrs(const struct spdk_ftl_dev *dev, struct spdk_ftl_attrs *attrs)
1398 {
1399 	attrs->uuid = dev->uuid;
1400 	attrs->lbk_cnt = dev->num_lbas;
1401 	attrs->lbk_size = FTL_BLOCK_SIZE;
1402 	attrs->range = dev->range;
1403 	attrs->cache_bdev_desc = dev->cache_bdev_desc;
1404 }
1405 
1406 static void
1407 _ftl_io_write(void *ctx)
1408 {
1409 	ftl_io_write((struct ftl_io *)ctx);
1410 }
1411 
1412 int
1413 ftl_io_write(struct ftl_io *io)
1414 {
1415 	struct spdk_ftl_dev *dev = io->dev;
1416 
1417 	/* For normal IOs we just need to copy the data onto the rwb */
1418 	if (!(io->flags & FTL_IO_MD)) {
1419 		return ftl_rwb_fill(io);
1420 	}
1421 
1422 	/* Metadata has its own buffer, so it doesn't have to be copied; just */
1423 	/* send it to the core thread and schedule the write immediately */
1424 	if (ftl_check_core_thread(dev)) {
1425 		return ftl_submit_write(ftl_wptr_from_band(io->band), io);
1426 	}
1427 
1428 	spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_io_write, io);
1429 
1430 	return 0;
1431 }
1432 
1433 static int
1434 _spdk_ftl_write(struct ftl_io *io)
1435 {
1436 	int rc;
1437 
1438 	rc = ftl_io_write(io);
1439 	if (rc == -EAGAIN) {
1440 		spdk_thread_send_msg(spdk_io_channel_get_thread(io->ioch),
1441 				     _ftl_write, io);
1442 		return 0;
1443 	}
1444 
1445 	if (rc) {
1446 		ftl_io_free(io);
1447 	}
1448 
1449 	return rc;
1450 }
1451 
1452 static void
1453 _ftl_write(void *ctx)
1454 {
1455 	_spdk_ftl_write(ctx);
1456 }
1457 
1458 int
1459 spdk_ftl_write(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch, uint64_t lba, size_t lba_cnt,
1460 	       struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_arg)
1461 {
1462 	struct ftl_io *io;
1463 
1464 	if (iov_cnt == 0 || iov_cnt > FTL_MAX_IOV) {
1465 		return -EINVAL;
1466 	}
1467 
1468 	if (lba_cnt == 0) {
1469 		return -EINVAL;
1470 	}
1471 
1472 	if (lba_cnt != ftl_iovec_num_lbks(iov, iov_cnt)) {
1473 		return -EINVAL;
1474 	}
1475 
1476 	if (!dev->initialized) {
1477 		return -EBUSY;
1478 	}
1479 
1480 	io = ftl_io_alloc(ch);
1481 	if (!io) {
1482 		return -ENOMEM;
1483 	}
1484 
1485 	ftl_io_user_init(dev, io, lba, lba_cnt, iov, iov_cnt, cb_fn, cb_arg, FTL_IO_WRITE);
1486 	return _spdk_ftl_write(io);
1487 }
1488 
1489 int
1490 ftl_io_read(struct ftl_io *io)
1491 {
1492 	struct spdk_ftl_dev *dev = io->dev;
1493 	ftl_next_ppa_fn	next_ppa;
1494 
1495 	if (ftl_check_read_thread(dev)) {
1496 		if (ftl_io_mode_ppa(io)) {
1497 			next_ppa = ftl_ppa_read_next_ppa;
1498 		} else {
1499 			next_ppa = ftl_lba_read_next_ppa;
1500 		}
1501 
1502 		return ftl_submit_read(io, next_ppa);
1503 	}
1504 
1505 	spdk_thread_send_msg(ftl_get_read_thread(dev), _ftl_read, io);
1506 	return 0;
1507 }
1508 
1509 static void
1510 _ftl_read(void *arg)
1511 {
1512 	ftl_io_read((struct ftl_io *)arg);
1513 }
1514 
1515 int
1516 spdk_ftl_read(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch, uint64_t lba, size_t lba_cnt,
1517 	      struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_arg)
1518 {
1519 	struct ftl_io *io;
1520 
1521 	if (iov_cnt == 0 || iov_cnt > FTL_MAX_IOV) {
1522 		return -EINVAL;
1523 	}
1524 
1525 	if (lba_cnt == 0) {
1526 		return -EINVAL;
1527 	}
1528 
1529 	if (lba_cnt != ftl_iovec_num_lbks(iov, iov_cnt)) {
1530 		return -EINVAL;
1531 	}
1532 
1533 	if (!dev->initialized) {
1534 		return -EBUSY;
1535 	}
1536 
1537 	io = ftl_io_alloc(ch);
1538 	if (!io) {
1539 		return -ENOMEM;
1540 	}
1541 
1542 	ftl_io_user_init(dev, io, lba, lba_cnt, iov, iov_cnt, cb_fn, cb_arg, FTL_IO_READ);
1543 	ftl_io_read(io);
1544 	return 0;
1545 }
1546 
1547 static struct ftl_flush *
1548 ftl_flush_init(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg)
1549 {
1550 	struct ftl_flush *flush;
1551 	struct ftl_rwb *rwb = dev->rwb;
1552 
1553 	flush = calloc(1, sizeof(*flush));
1554 	if (!flush) {
1555 		return NULL;
1556 	}
1557 
1558 	flush->bmap = spdk_bit_array_create(ftl_rwb_num_batches(rwb));
1559 	if (!flush->bmap) {
1560 		goto error;
1561 	}
1562 
1563 	flush->dev = dev;
1564 	flush->cb.fn = cb_fn;
1565 	flush->cb.ctx = cb_arg;
1566 
1567 	return flush;
1568 error:
1569 	free(flush);
1570 	return NULL;
1571 }
1572 
1573 static void
1574 _ftl_flush(void *ctx)
1575 {
1576 	struct ftl_flush *flush = ctx;
1577 	struct spdk_ftl_dev *dev = flush->dev;
1578 	struct ftl_rwb *rwb = dev->rwb;
1579 	struct ftl_rwb_batch *batch;
1580 
1581 	/* Attach flush object to all non-empty batches */
1582 	ftl_rwb_foreach_batch(batch, rwb) {
1583 		if (!ftl_rwb_batch_empty(batch)) {
1584 			spdk_bit_array_set(flush->bmap, ftl_rwb_batch_get_offset(batch));
1585 			flush->num_req++;
1586 		}
1587 	}
1588 
1589 	LIST_INSERT_HEAD(&dev->flush_list, flush, list_entry);
1590 
1591 	/* If the RWB was already empty, the flush can be completed right away */
1592 	if (!flush->num_req) {
1593 		ftl_complete_flush(flush);
1594 	}
1595 }
1596 
1597 int
1598 spdk_ftl_flush(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg)
1599 {
1600 	struct ftl_flush *flush;
1601 
1602 	if (!dev->initialized) {
1603 		return -EBUSY;
1604 	}
1605 
1606 	flush = ftl_flush_init(dev, cb_fn, cb_arg);
1607 	if (!flush) {
1608 		return -ENOMEM;
1609 	}
1610 
1611 	spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_flush, flush);
1612 	return 0;
1613 }
1614 
1615 void
1616 ftl_process_anm_event(struct ftl_anm_event *event)
1617 {
1618 	SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Unconsumed ANM received for dev: %p...\n", event->dev);
1619 	ftl_anm_event_complete(event);
1620 }
1621 
1622 static void
1623 ftl_process_retry_queue(struct spdk_ftl_dev *dev)
1624 {
1625 	struct ftl_io *io;
1626 	int rc;
1627 
1628 	while (!TAILQ_EMPTY(&dev->retry_queue)) {
1629 		io = TAILQ_FIRST(&dev->retry_queue);
1630 
1631 		/* Retry only if IO is still healthy */
1632 		if (spdk_likely(io->status == 0)) {
1633 			rc = ftl_io_read(io);
1634 			if (rc == -ENOMEM) {
1635 				break;
1636 			}
1637 		}
1638 
1639 		io->flags &= ~FTL_IO_RETRY;
1640 		TAILQ_REMOVE(&dev->retry_queue, io, retry_entry);
1641 
1642 		if (ftl_io_done(io)) {
1643 			ftl_io_complete(io);
1644 		}
1645 	}
1646 }
1647 
1648 int
1649 ftl_task_read(void *ctx)
1650 {
1651 	struct ftl_thread *thread = ctx;
1652 	struct spdk_ftl_dev *dev = thread->dev;
1653 	struct spdk_nvme_qpair *qpair = ftl_get_read_qpair(dev);
1654 	size_t num_completed;
1655 
1656 	if (dev->halt) {
1657 		if (ftl_shutdown_complete(dev)) {
1658 			spdk_poller_unregister(&thread->poller);
1659 			return 0;
1660 		}
1661 	}
1662 
1663 	num_completed = spdk_nvme_qpair_process_completions(qpair, 0);
1664 
1665 	if (num_completed && !TAILQ_EMPTY(&dev->retry_queue)) {
1666 		ftl_process_retry_queue(dev);
1667 	}
1668 
1669 	return num_completed;
1670 }
1671 
1672 int
1673 ftl_task_core(void *ctx)
1674 {
1675 	struct ftl_thread *thread = ctx;
1676 	struct spdk_ftl_dev *dev = thread->dev;
1677 	struct spdk_nvme_qpair *qpair = ftl_get_write_qpair(dev);
1678 
1679 	if (dev->halt) {
1680 		if (ftl_shutdown_complete(dev)) {
1681 			spdk_poller_unregister(&thread->poller);
1682 			return 0;
1683 		}
1684 	}
1685 
1686 	ftl_process_writes(dev);
1687 	spdk_nvme_qpair_process_completions(qpair, 0);
1688 	ftl_process_relocs(dev);
1689 
1690 	return 0;
1691 }
1692 
1693 SPDK_LOG_REGISTER_COMPONENT("ftl_core", SPDK_LOG_FTL_CORE)
1694