xref: /spdk/lib/ftl/ftl_core.c (revision dd1c38cc680e4e8ca2642e93bf289072bff7fc3d)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/likely.h"
35 #include "spdk/stdinc.h"
36 #include "spdk/nvme.h"
37 #include "spdk/io_channel.h"
38 #include "spdk/bdev_module.h"
39 #include "spdk_internal/log.h"
40 #include "spdk/ftl.h"
41 
42 #include "ftl_core.h"
43 #include "ftl_band.h"
44 #include "ftl_io.h"
45 #include "ftl_anm.h"
46 #include "ftl_rwb.h"
47 #include "ftl_debug.h"
48 #include "ftl_reloc.h"
49 
50 /* Max number of iovecs */
51 #define FTL_MAX_IOV 1024
52 
53 struct ftl_wptr {
54 	/* Owner device */
55 	struct spdk_ftl_dev		*dev;
56 
57 	/* Current PPA */
58 	struct ftl_ppa			ppa;
59 
60 	/* Band currently being written to */
61 	struct ftl_band			*band;
62 
63 	/* Current logical block's offset */
64 	uint64_t			offset;
65 
66 	/* Current erase block */
67 	struct ftl_chunk		*chunk;
68 
69 	/* IO that is currently processed */
70 	struct ftl_io			*current_io;
71 
72 	/* List link */
73 	LIST_ENTRY(ftl_wptr)		list_entry;
74 };
75 
76 struct ftl_flush {
77 	/* Owner device */
78 	struct spdk_ftl_dev		*dev;
79 
80 	/* Number of batches to wait for */
81 	size_t				num_req;
82 
83 	/* Callback */
84 	struct ftl_cb			cb;
85 
86 	/* Batch bitmap */
87 	struct spdk_bit_array		*bmap;
88 
89 	/* List link */
90 	LIST_ENTRY(ftl_flush)		list_entry;
91 };
92 
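/* Callback resolving the next physical address for a read request. Fills *ppa */
/* for the block at the given offset and returns the number of contiguous */
/* logical blocks that can be read starting there, 0 if the block was served */
/* without touching the disk (e.g. from the write buffer), or -EAGAIN if the */
/* address has to be resolved again. */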
93 typedef int (*ftl_next_ppa_fn)(struct ftl_io *, struct ftl_ppa *, size_t, void *);
94 static void _ftl_read(void *);
95 static void _ftl_write(void *);
96 
97 static int
98 ftl_rwb_flags_from_io(const struct ftl_io *io)
99 {
100 	int valid_flags = FTL_IO_INTERNAL | FTL_IO_WEAK | FTL_IO_PAD;
101 	return io->flags & valid_flags;
102 }
103 
104 static int
105 ftl_rwb_entry_weak(const struct ftl_rwb_entry *entry)
106 {
107 	return entry->flags & FTL_IO_WEAK;
108 }
109 
110 static void
111 ftl_wptr_free(struct ftl_wptr *wptr)
112 {
113 	if (!wptr) {
114 		return;
115 	}
116 
117 	free(wptr);
118 }
119 
120 static void
121 ftl_remove_wptr(struct ftl_wptr *wptr)
122 {
123 	LIST_REMOVE(wptr, list_entry);
124 	ftl_wptr_free(wptr);
125 }
126 
127 static void
128 ftl_io_cmpl_cb(void *arg, const struct spdk_nvme_cpl *status)
129 {
130 	struct ftl_io *io = arg;
131 
132 	if (spdk_nvme_cpl_is_error(status)) {
133 		ftl_io_process_error(io, status);
134 	}
135 
136 	ftl_trace_completion(io->dev, io, FTL_TRACE_COMPLETION_DISK);
137 
138 	ftl_io_dec_req(io);
139 
140 	if (ftl_io_done(io)) {
141 		ftl_io_complete(io);
142 	}
143 }
144 
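/* Stop writes to a band after a failure: mark the band as failed and remove */
/* its write pointer. If high_prio is already set, an earlier failed write */
/* has already taken care of it. */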
145 static void
146 ftl_halt_writes(struct spdk_ftl_dev *dev, struct ftl_band *band)
147 {
148 	struct ftl_wptr *wptr = NULL;
149 
150 	LIST_FOREACH(wptr, &dev->wptr_list, list_entry) {
151 		if (wptr->band == band) {
152 			break;
153 		}
154 	}
155 
156 	/* If the band already has the high_prio flag set, other writes must */
157 	/* have failed earlier, so it's already taken care of. */
158 	if (band->high_prio) {
159 		assert(wptr == NULL);
160 		return;
161 	}
162 
163 	ftl_band_write_failed(band);
164 	ftl_remove_wptr(wptr);
165 }
166 
167 static struct ftl_wptr *
168 ftl_wptr_from_band(struct ftl_band *band)
169 {
170 	struct spdk_ftl_dev *dev = band->dev;
171 	struct ftl_wptr *wptr = NULL;
172 
173 	LIST_FOREACH(wptr, &dev->wptr_list, list_entry) {
174 		if (wptr->band == band) {
175 			return wptr;
176 		}
177 	}
178 
179 	return NULL;
180 }
181 
182 static void
183 ftl_md_write_fail(struct ftl_io *io, int status)
184 {
185 	struct ftl_band *band = io->band;
186 	struct ftl_wptr *wptr;
187 	char buf[128];
188 
189 	wptr = ftl_wptr_from_band(band);
190 
191 	SPDK_ERRLOG("Metadata write failed @ppa: %s, status: %d\n",
192 		    ftl_ppa2str(wptr->ppa, buf, sizeof(buf)), status);
193 
194 	ftl_halt_writes(io->dev, band);
195 }
196 
197 static void
198 ftl_md_write_cb(void *arg, int status)
199 {
200 	struct ftl_io *io = arg;
201 	struct ftl_wptr *wptr;
202 
203 	wptr = ftl_wptr_from_band(io->band);
204 
205 	if (status) {
206 		ftl_md_write_fail(io, status);
207 		return;
208 	}
209 
210 	ftl_band_set_next_state(io->band);
211 	if (io->band->state == FTL_BAND_STATE_CLOSED) {
212 		ftl_remove_wptr(wptr);
213 	}
214 }
215 
216 static int
217 ftl_ppa_read_next_ppa(struct ftl_io *io, struct ftl_ppa *ppa,
218 		      size_t lbk, void *ctx)
219 {
220 	struct spdk_ftl_dev *dev = io->dev;
221 	size_t lbk_cnt, max_lbks;
222 
223 	assert(ftl_io_mode_ppa(io));
224 	assert(io->iov_pos < io->iov_cnt);
225 
226 	if (lbk == 0) {
227 		*ppa = io->ppa;
228 	} else {
229 		*ppa = ftl_band_next_xfer_ppa(io->band, io->ppa, lbk);
230 	}
231 
232 	assert(!ftl_ppa_invalid(*ppa));
233 
234 	/* Metadata has to be read in the way it's written (jumping across */
235 	/* the chunks in xfer_size increments) */
236 	if (io->flags & FTL_IO_MD) {
237 		max_lbks = dev->xfer_size - (ppa->lbk % dev->xfer_size);
238 		lbk_cnt = spdk_min(ftl_io_iovec_len_left(io), max_lbks);
239 		assert(ppa->lbk / dev->xfer_size == (ppa->lbk + lbk_cnt - 1) / dev->xfer_size);
240 	} else {
241 		lbk_cnt = ftl_io_iovec_len_left(io);
242 	}
243 
244 	return lbk_cnt;
245 }
246 
247 static int
248 ftl_wptr_close_band(struct ftl_wptr *wptr)
249 {
250 	struct ftl_band *band = wptr->band;
251 
252 	ftl_band_set_state(band, FTL_BAND_STATE_CLOSING);
253 	band->tail_md_ppa = wptr->ppa;
254 
255 	return ftl_band_write_tail_md(band, band->md.dma_buf, ftl_md_write_cb);
256 }
257 
258 static int
259 ftl_wptr_open_band(struct ftl_wptr *wptr)
260 {
261 	struct ftl_band *band = wptr->band;
262 
263 	assert(ftl_band_chunk_is_first(band, wptr->chunk));
264 	assert(band->md.num_vld == 0);
265 
266 	ftl_band_clear_md(band);
267 
268 	assert(band->state == FTL_BAND_STATE_PREP);
269 	ftl_band_set_state(band, FTL_BAND_STATE_OPENING);
270 
271 	return ftl_band_write_head_md(band, band->md.dma_buf, ftl_md_write_cb);
272 }
273 
274 static int
275 ftl_submit_erase(struct ftl_io *io)
276 {
277 	struct spdk_ftl_dev *dev = io->dev;
278 	struct ftl_band *band = io->band;
279 	struct ftl_ppa ppa = io->ppa;
280 	struct ftl_chunk *chunk;
281 	uint64_t ppa_packed;
282 	int rc = 0;
283 	size_t i;
284 
285 	for (i = 0; i < io->lbk_cnt; ++i) {
286 		if (i != 0) {
287 			chunk = ftl_band_next_chunk(band, ftl_band_chunk_from_ppa(band, ppa));
288 			assert(chunk->state == FTL_CHUNK_STATE_CLOSED ||
289 			       chunk->state == FTL_CHUNK_STATE_VACANT);
290 			ppa = chunk->start_ppa;
291 		}
292 
293 		assert(ppa.lbk == 0);
294 		ppa_packed = ftl_ppa_addr_pack(dev, ppa);
295 
296 		ftl_trace_submission(dev, io, ppa, 1);
297 		rc = spdk_nvme_ocssd_ns_cmd_vector_reset(dev->ns, ftl_get_write_qpair(dev),
298 				&ppa_packed, 1, NULL, ftl_io_cmpl_cb, io);
299 		if (rc) {
300 			ftl_io_fail(io, rc);
301 			SPDK_ERRLOG("Vector reset failed with status: %d\n", rc);
302 			break;
303 		}
304 
305 		ftl_io_inc_req(io);
306 		ftl_io_advance(io, 1);
307 	}
308 
309 	if (ftl_io_done(io)) {
310 		ftl_io_complete(io);
311 	}
312 
313 	return rc;
314 }
315 
316 static void
317 _ftl_io_erase(void *ctx)
318 {
319 	ftl_io_erase((struct ftl_io *)ctx);
320 }
321 
322 static bool
323 ftl_check_core_thread(const struct spdk_ftl_dev *dev)
324 {
325 	return dev->core_thread.thread == spdk_get_thread();
326 }
327 
328 static bool
329 ftl_check_read_thread(const struct spdk_ftl_dev *dev)
330 {
331 	return dev->read_thread.thread == spdk_get_thread();
332 }
333 
334 int
335 ftl_io_erase(struct ftl_io *io)
336 {
337 	struct spdk_ftl_dev *dev = io->dev;
338 
339 	if (ftl_check_core_thread(dev)) {
340 		return ftl_submit_erase(io);
341 	}
342 
343 	spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_io_erase, io);
344 	return 0;
345 }
346 
347 static struct ftl_band *
348 ftl_next_write_band(struct spdk_ftl_dev *dev)
349 {
350 	struct ftl_band *band;
351 
352 	band = LIST_FIRST(&dev->free_bands);
353 	if (!band) {
354 		return NULL;
355 	}
356 	assert(band->state == FTL_BAND_STATE_FREE);
357 
358 	if (ftl_band_erase(band)) {
359 		/* TODO: handle erase failure */
360 		return NULL;
361 	}
362 
363 	return band;
364 }
365 
366 static struct ftl_band *
367 ftl_next_wptr_band(struct spdk_ftl_dev *dev)
368 {
369 	struct ftl_band *band;
370 
371 	if (!dev->next_band) {
372 		band = ftl_next_write_band(dev);
373 	} else {
374 		assert(dev->next_band->state == FTL_BAND_STATE_PREP);
375 		band = dev->next_band;
376 		dev->next_band = NULL;
377 	}
378 
379 	return band;
380 }
381 
382 static struct ftl_wptr *
383 ftl_wptr_init(struct ftl_band *band)
384 {
385 	struct spdk_ftl_dev *dev = band->dev;
386 	struct ftl_wptr *wptr;
387 
388 	wptr = calloc(1, sizeof(*wptr));
389 	if (!wptr) {
390 		return NULL;
391 	}
392 
393 	wptr->dev = dev;
394 	wptr->band = band;
395 	wptr->chunk = CIRCLEQ_FIRST(&band->chunks);
396 	wptr->ppa = wptr->chunk->start_ppa;
397 
398 	return wptr;
399 }
400 
401 static int
402 ftl_add_wptr(struct spdk_ftl_dev *dev)
403 {
404 	struct ftl_band *band;
405 	struct ftl_wptr *wptr;
406 
407 	band = ftl_next_wptr_band(dev);
408 	if (!band) {
409 		return -1;
410 	}
411 
412 	wptr = ftl_wptr_init(band);
413 	if (!wptr) {
414 		return -1;
415 	}
416 
417 	if (ftl_band_write_prep(band)) {
418 		ftl_wptr_free(wptr);
419 		return -1;
420 	}
421 
422 	LIST_INSERT_HEAD(&dev->wptr_list, wptr, list_entry);
423 
424 	SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: band %u\n", band->id);
425 	ftl_trace_write_band(dev, band);
426 	return 0;
427 }
428 
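/* Advance the write pointer by xfer_size logical blocks: mark the band FULL */
/* once its usable space is exhausted, mark the current chunk busy, move to */
/* the next operational chunk and prefetch the next band for writing once */
/* the configured band threshold (conf->band_thld) has been crossed. */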
429 static void
430 ftl_wptr_advance(struct ftl_wptr *wptr, size_t xfer_size)
431 {
432 	struct ftl_band *band = wptr->band;
433 	struct spdk_ftl_dev *dev = wptr->dev;
434 	struct spdk_ftl_conf *conf = &dev->conf;
435 	size_t next_thld;
436 
437 	wptr->offset += xfer_size;
438 	next_thld = (ftl_band_num_usable_lbks(band) * conf->band_thld) / 100;
439 
440 	if (ftl_band_full(band, wptr->offset)) {
441 		ftl_band_set_state(band, FTL_BAND_STATE_FULL);
442 	}
443 
444 	wptr->chunk->busy = true;
445 	wptr->ppa = ftl_band_next_xfer_ppa(band, wptr->ppa, xfer_size);
446 	wptr->chunk = ftl_band_next_operational_chunk(band, wptr->chunk);
447 
448 	assert(!ftl_ppa_invalid(wptr->ppa));
449 
450 	SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: grp:%d, pu:%d chunk:%d, lbk:%u\n",
451 		      wptr->ppa.grp, wptr->ppa.pu, wptr->ppa.chk, wptr->ppa.lbk);
452 
453 	if (wptr->offset >= next_thld && !dev->next_band) {
454 		dev->next_band = ftl_next_write_band(dev);
455 	}
456 }
457 
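/* Check whether the write pointer can accept data. Kicks off the head/tail */
/* metadata writes needed to open or close the band and returns 1 only when */
/* the band is open and the current chunk is writable. */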
458 static int
459 ftl_wptr_ready(struct ftl_wptr *wptr)
460 {
461 	struct ftl_band *band = wptr->band;
462 
463 	/* TODO: add handling of empty bands */
464 
465 	if (spdk_unlikely(!ftl_chunk_is_writable(wptr->chunk))) {
466 		/* Erasing the band may fail after it has been assigned to the wptr. */
467 		if (spdk_unlikely(wptr->chunk->state == FTL_CHUNK_STATE_BAD)) {
468 			ftl_wptr_advance(wptr, wptr->dev->xfer_size);
469 		}
470 		return 0;
471 	}
472 
473 	/* If we're in the process of writing metadata, wait till it is */
474 	/* completed. */
475 	/* TODO: we should probably change bands once we're writing tail md */
476 	if (ftl_band_state_changing(band)) {
477 		return 0;
478 	}
479 
480 	if (band->state == FTL_BAND_STATE_FULL) {
481 		if (ftl_wptr_close_band(wptr)) {
482 			/* TODO: need recovery here */
483 			assert(false);
484 		}
485 		return 0;
486 	}
487 
488 	if (band->state != FTL_BAND_STATE_OPEN) {
489 		if (ftl_wptr_open_band(wptr)) {
490 			/* TODO: need recovery here */
491 			assert(false);
492 		}
493 		return 0;
494 	}
495 
496 	return 1;
497 }
498 
499 static const struct spdk_ftl_limit *
500 ftl_get_limit(const struct spdk_ftl_dev *dev, int type)
501 {
502 	assert(type < SPDK_FTL_LIMIT_MAX);
503 	return &dev->conf.defrag.limits[type];
504 }
505 
506 static bool
507 ftl_cache_lba_valid(struct spdk_ftl_dev *dev, struct ftl_rwb_entry *entry)
508 {
509 	struct ftl_ppa ppa;
510 
511 	/* If the LBA is invalid don't bother checking the md and l2p */
512 	if (spdk_unlikely(entry->lba == FTL_LBA_INVALID)) {
513 		return false;
514 	}
515 
516 	ppa = ftl_l2p_get(dev, entry->lba);
517 	if (!(ftl_ppa_cached(ppa) && ppa.offset == entry->pos)) {
518 		return false;
519 	}
520 
521 	return true;
522 }
523 
524 static void
525 ftl_evict_cache_entry(struct spdk_ftl_dev *dev, struct ftl_rwb_entry *entry)
526 {
527 	pthread_spin_lock(&entry->lock);
528 
529 	if (!ftl_rwb_entry_valid(entry)) {
530 		goto unlock;
531 	}
532 
533 	/* If the l2p wasn't updated and still points at the entry, fill it with the */
534 	/* on-disk PPA and clear the cache status bit. Otherwise, skip the l2p update */
535 	/* and just clear the cache status. */
536 	if (!ftl_cache_lba_valid(dev, entry)) {
537 		goto clear;
538 	}
539 
540 	ftl_l2p_set(dev, entry->lba, entry->ppa);
541 clear:
542 	ftl_rwb_entry_invalidate(entry);
543 unlock:
544 	pthread_spin_unlock(&entry->lock);
545 }
546 
547 static struct ftl_rwb_entry *
548 ftl_acquire_entry(struct spdk_ftl_dev *dev, int flags)
549 {
550 	struct ftl_rwb_entry *entry;
551 
552 	entry = ftl_rwb_acquire(dev->rwb, ftl_rwb_type_from_flags(flags));
553 	if (!entry) {
554 		return NULL;
555 	}
556 
557 	ftl_evict_cache_entry(dev, entry);
558 
559 	entry->flags = flags;
560 	return entry;
561 }
562 
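/* Fill the write buffer with padding entries (invalid LBA, zeroed data) so */
/* that a partially filled batch can be pushed out. */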
563 static void
564 ftl_rwb_pad(struct spdk_ftl_dev *dev, size_t size)
565 {
566 	struct ftl_rwb_entry *entry;
567 	int flags = FTL_IO_PAD | FTL_IO_INTERNAL;
568 
569 	for (size_t i = 0; i < size; ++i) {
570 		entry = ftl_acquire_entry(dev, flags);
571 		if (!entry) {
572 			break;
573 		}
574 
575 		entry->lba = FTL_LBA_INVALID;
576 		entry->ppa = ftl_to_ppa(FTL_PPA_INVALID);
577 		memset(entry->data, 0, FTL_BLOCK_SIZE);
578 		ftl_rwb_push(entry);
579 	}
580 }
581 
582 static void
583 ftl_remove_free_bands(struct spdk_ftl_dev *dev)
584 {
585 	while (!LIST_EMPTY(&dev->free_bands)) {
586 		LIST_REMOVE(LIST_FIRST(&dev->free_bands), list_entry);
587 	}
588 
589 	dev->next_band = NULL;
590 }
591 
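/* Called while the device is halting: once fewer than xfer_size entries */
/* remain in the write buffer, drop the free bands and pad the buffer so the */
/* currently open band can be written out and closed. */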
592 static void
593 ftl_process_shutdown(struct spdk_ftl_dev *dev)
594 {
595 	size_t size = ftl_rwb_num_acquired(dev->rwb, FTL_RWB_TYPE_INTERNAL) +
596 		      ftl_rwb_num_acquired(dev->rwb, FTL_RWB_TYPE_USER);
597 
598 	if (size >= dev->xfer_size) {
599 		return;
600 	}
601 
602 	/* If we reach this point we need to remove free bands */
603 	/* and pad current wptr band to the end */
604 	ftl_remove_free_bands(dev);
605 
606 	/* Pad write buffer until band is full */
607 	ftl_rwb_pad(dev, dev->xfer_size - size);
608 }
609 
610 static int
611 ftl_shutdown_complete(struct spdk_ftl_dev *dev)
612 {
613 	return !__atomic_load_n(&dev->num_inflight, __ATOMIC_SEQ_CST) &&
614 	       LIST_EMPTY(&dev->wptr_list);
615 }
616 
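/* Apply write throttling based on the number of free bands: pick the most */
/* severe limit whose threshold has been crossed and cap the number of write */
/* buffer entries available to user writes accordingly. */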
617 void
618 ftl_apply_limits(struct spdk_ftl_dev *dev)
619 {
620 	const struct spdk_ftl_limit *limit;
621 	struct ftl_stats *stats = &dev->stats;
622 	size_t rwb_limit[FTL_RWB_TYPE_MAX];
623 	int i;
624 
625 	ftl_rwb_get_limits(dev->rwb, rwb_limit);
626 
627 	/* Clear existing limit */
628 	dev->limit = SPDK_FTL_LIMIT_MAX;
629 
630 	for (i = SPDK_FTL_LIMIT_CRIT; i < SPDK_FTL_LIMIT_MAX; ++i) {
631 		limit = ftl_get_limit(dev, i);
632 
633 		if (dev->num_free <= limit->thld) {
634 			rwb_limit[FTL_RWB_TYPE_USER] =
635 				(limit->limit * ftl_rwb_entry_cnt(dev->rwb)) / 100;
636 			stats->limits[i]++;
637 			dev->limit = i;
638 			goto apply;
639 		}
640 	}
641 
642 	/* Clear the limits, since we don't need to apply them anymore */
643 	rwb_limit[FTL_RWB_TYPE_USER] = ftl_rwb_entry_cnt(dev->rwb);
644 apply:
645 	ftl_trace_limits(dev, rwb_limit, dev->num_free);
646 	ftl_rwb_set_limits(dev->rwb, rwb_limit);
647 }
648 
649 static int
650 ftl_invalidate_addr_unlocked(struct spdk_ftl_dev *dev, struct ftl_ppa ppa)
651 {
652 	struct ftl_band *band = ftl_band_from_ppa(dev, ppa);
653 	struct ftl_md *md = &band->md;
654 	uint64_t offset;
655 
656 	offset = ftl_band_lbkoff_from_ppa(band, ppa);
657 
658 	/* The bit might already be cleared if two writes are scheduled to the */
659 	/* same LBA at the same time */
660 	if (spdk_bit_array_get(md->vld_map, offset)) {
661 		assert(md->num_vld > 0);
662 		spdk_bit_array_clear(md->vld_map, offset);
663 		md->num_vld--;
664 		return 1;
665 	}
666 
667 	return 0;
668 }
669 
670 int
671 ftl_invalidate_addr(struct spdk_ftl_dev *dev, struct ftl_ppa ppa)
672 {
673 	struct ftl_band *band;
674 	int rc;
675 
676 	assert(!ftl_ppa_cached(ppa));
677 	band = ftl_band_from_ppa(dev, ppa);
678 
679 	pthread_spin_lock(&band->md.lock);
680 	rc = ftl_invalidate_addr_unlocked(dev, ppa);
681 	pthread_spin_unlock(&band->md.lock);
682 
683 	return rc;
684 }
685 
686 static int
687 ftl_read_retry(int rc)
688 {
689 	return rc == -EAGAIN;
690 }
691 
692 static int
693 ftl_read_canceled(int rc)
694 {
695 	return rc == 0;
696 }
697 
698 static void
699 ftl_add_to_retry_queue(struct ftl_io *io)
700 {
701 	if (!(io->flags & FTL_IO_RETRY)) {
702 		io->flags |= FTL_IO_RETRY;
703 		TAILQ_INSERT_TAIL(&io->dev->retry_queue, io, retry_entry);
704 	}
705 }
706 
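/* Issue the read one physical extent at a time, using next_ppa to resolve */
/* the PPA and extent length. A return of 0 means the block was served from */
/* the cache (skip it), -EAGAIN means the address has to be resolved again */
/* and -ENOMEM from the NVMe layer puts the IO on the retry queue. */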
707 static int
708 ftl_submit_read(struct ftl_io *io, ftl_next_ppa_fn next_ppa,
709 		void *ctx)
710 {
711 	struct spdk_ftl_dev *dev = io->dev;
712 	struct ftl_ppa ppa;
713 	int rc = 0, lbk_cnt;
714 
715 	while (io->pos < io->lbk_cnt) {
716 		/* We might hit the cache here; if so, skip the read */
717 		lbk_cnt = rc = next_ppa(io, &ppa, io->pos, ctx);
718 
719 	/* We might need to retry the read from scratch (e.g. because a write */
720 	/* was under way and completed before we could read it from */
721 	/* the rwb) */
722 		if (ftl_read_retry(rc)) {
723 			continue;
724 		}
725 
726 		/* We don't have to schedule the read, as it was read from cache */
727 		if (ftl_read_canceled(rc)) {
728 			ftl_io_advance(io, 1);
729 			continue;
730 		}
731 
732 		assert(lbk_cnt > 0);
733 
734 		ftl_trace_submission(dev, io, ppa, lbk_cnt);
735 		rc = spdk_nvme_ns_cmd_read(dev->ns, ftl_get_read_qpair(dev),
736 					   ftl_io_iovec_addr(io),
737 					   ftl_ppa_addr_pack(io->dev, ppa), lbk_cnt,
738 					   ftl_io_cmpl_cb, io, 0);
739 		if (rc == -ENOMEM) {
740 			ftl_add_to_retry_queue(io);
741 			break;
742 		} else if (rc) {
743 			ftl_io_fail(io, rc);
744 			break;
745 		}
746 
747 		ftl_io_inc_req(io);
748 		ftl_io_advance(io, lbk_cnt);
749 	}
750 
751 	/* If we didn't have to read anything from the device, */
752 	/* complete the request right away */
753 	if (ftl_io_done(io)) {
754 		ftl_io_complete(io);
755 	}
756 
757 	return rc;
758 }
759 
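/* Try to serve a read from the write buffer. The L2P is re-checked under the */
/* entry lock; if the LBA has been remapped in the meantime, a non-zero value */
/* is returned and the caller has to resolve the address again. */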
760 static int
761 ftl_ppa_cache_read(struct ftl_io *io, uint64_t lba,
762 		   struct ftl_ppa ppa, void *buf)
763 {
764 	struct ftl_rwb *rwb = io->dev->rwb;
765 	struct ftl_rwb_entry *entry;
766 	struct ftl_ppa nppa;
767 	int rc = 0;
768 
769 	entry = ftl_rwb_entry_from_offset(rwb, ppa.offset);
770 	pthread_spin_lock(&entry->lock);
771 
772 	nppa = ftl_l2p_get(io->dev, lba);
773 	if (ppa.ppa != nppa.ppa) {
774 		rc = -1;
775 		goto out;
776 	}
777 
778 	memcpy(buf, entry->data, FTL_BLOCK_SIZE);
779 out:
780 	pthread_spin_unlock(&entry->lock);
781 	return rc;
782 }
783 
784 static int
785 ftl_lba_read_next_ppa(struct ftl_io *io, struct ftl_ppa *ppa,
786 		      size_t lbk, void *ctx)
787 {
788 	struct spdk_ftl_dev *dev = io->dev;
789 	struct ftl_ppa next_ppa;
790 	size_t i;
791 
792 	*ppa = ftl_l2p_get(dev, io->lba + lbk);
793 
794 	SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Read ppa:%lx, lba:%lu\n", ppa->ppa, io->lba);
795 
796 	/* If the PPA is invalid, skip it (the buffer should already be zeroed) */
797 	if (ftl_ppa_invalid(*ppa)) {
798 		ftl_trace_completion(io->dev, io, FTL_TRACE_COMPLETION_INVALID);
799 		return 0;
800 	}
801 
802 	if (ftl_ppa_cached(*ppa)) {
803 		if (!ftl_ppa_cache_read(io, io->lba + lbk, *ppa, ftl_io_iovec_addr(io))) {
804 			ftl_trace_completion(io->dev, io, FTL_TRACE_COMPLETION_CACHE);
805 			return 0;
806 		}
807 
808 		/* If the state changed, we have to re-read the l2p */
809 		return -EAGAIN;
810 	}
811 
812 	for (i = 1; i < ftl_io_iovec_len_left(io); ++i) {
813 		next_ppa = ftl_l2p_get(dev, io->lba + lbk + i);
814 
815 		if (ftl_ppa_invalid(next_ppa) || ftl_ppa_cached(next_ppa)) {
816 			break;
817 		}
818 
819 		if (ftl_ppa_addr_pack(dev, *ppa) + i != ftl_ppa_addr_pack(dev, next_ppa)) {
820 			break;
821 		}
822 	}
823 
824 	return i;
825 }
826 
827 static void
828 ftl_complete_flush(struct ftl_flush *flush)
829 {
830 	assert(flush->num_req == 0);
831 	LIST_REMOVE(flush, list_entry);
832 
833 	flush->cb.fn(flush->cb.ctx, 0);
834 
835 	spdk_bit_array_free(&flush->bmap);
836 	free(flush);
837 }
838 
839 static void
840 ftl_process_flush(struct spdk_ftl_dev *dev, struct ftl_rwb_batch *batch)
841 {
842 	struct ftl_flush *flush, *tflush;
843 	size_t offset;
844 
845 	LIST_FOREACH_SAFE(flush, &dev->flush_list, list_entry, tflush) {
846 		offset = ftl_rwb_batch_get_offset(batch);
847 
848 		if (spdk_bit_array_get(flush->bmap, offset)) {
849 			spdk_bit_array_clear(flush->bmap, offset);
850 			if (!(--flush->num_req)) {
851 				ftl_complete_flush(flush);
852 			}
853 		}
854 	}
855 }
856 
857 static void
858 ftl_write_fail(struct ftl_io *io, int status)
859 {
860 	struct ftl_rwb_batch *batch = io->rwb_batch;
861 	struct spdk_ftl_dev *dev = io->dev;
862 	struct ftl_rwb_entry *entry;
863 	struct ftl_band *band;
864 	char buf[128];
865 
866 	entry = ftl_rwb_batch_first_entry(batch);
867 
868 	band = ftl_band_from_ppa(io->dev, entry->ppa);
869 	SPDK_ERRLOG("Write failed @ppa: %s, status: %d\n",
870 		    ftl_ppa2str(entry->ppa, buf, sizeof(buf)), status);
871 
872 	/* Close the band, halt wptr and defrag */
873 	ftl_halt_writes(dev, band);
874 
875 	ftl_rwb_foreach(entry, batch) {
876 		/* Invalidate metadata set by ftl_wptr_process_writes() */
877 		ftl_invalidate_addr(dev, entry->ppa);
878 	}
879 
880 	/* Reset the batch back to the RWB to resend it later */
881 	ftl_rwb_batch_revert(batch);
882 }
883 
884 static void
885 ftl_write_cb(void *arg, int status)
886 {
887 	struct ftl_io *io = arg;
888 	struct spdk_ftl_dev *dev = io->dev;
889 	struct ftl_rwb_batch *batch = io->rwb_batch;
890 	struct ftl_rwb_entry *entry;
891 
892 	if (status) {
893 		ftl_write_fail(io, status);
894 		return;
895 	}
896 
897 	assert(io->lbk_cnt == dev->xfer_size);
898 	ftl_rwb_foreach(entry, batch) {
899 		if (!(io->flags & FTL_IO_MD) && !(entry->flags & FTL_IO_PAD)) {
900 			/* Verify that the LBA is set for user lbks */
901 			assert(entry->lba != FTL_LBA_INVALID);
902 		}
903 
904 		SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Write ppa:%lu, lba:%lu\n",
905 			      entry->ppa.ppa, entry->lba);
906 	}
907 
908 	ftl_process_flush(dev, batch);
909 	ftl_rwb_batch_release(batch);
910 }
911 
912 static void
913 ftl_update_rwb_stats(struct spdk_ftl_dev *dev, const struct ftl_rwb_entry *entry)
914 {
915 	if (!ftl_rwb_entry_internal(entry)) {
916 		dev->stats.write_user++;
917 	}
918 	dev->stats.write_total++;
919 }
920 
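/* Update the L2P after an entry has been filled in the write buffer. Weak */
/* writes only update the L2P if their data is still the current copy. */
/* Overwriting a cached entry invalidates the previous write buffer entry, */
/* while overwriting an on-disk block clears its valid bit in the band's */
/* metadata. */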
921 static void
922 ftl_update_l2p(struct spdk_ftl_dev *dev, const struct ftl_rwb_entry *entry,
923 	       struct ftl_ppa ppa)
924 {
925 	struct ftl_ppa prev_ppa;
926 	struct ftl_rwb_entry *prev;
927 	struct ftl_band *band;
928 	int valid;
929 
930 	prev_ppa = ftl_l2p_get(dev, entry->lba);
931 	if (ftl_ppa_invalid(prev_ppa)) {
932 		ftl_l2p_set(dev, entry->lba, ppa);
933 		return;
934 	}
935 
936 	/* If the L2P's PPA is different from what we expected, we don't need to */
937 	/* do anything (someone's already overwritten our data). */
938 	if (ftl_rwb_entry_weak(entry) && !ftl_ppa_cmp(prev_ppa, entry->ppa)) {
939 		return;
940 	}
941 
942 	if (ftl_ppa_cached(prev_ppa)) {
943 		assert(!ftl_rwb_entry_weak(entry));
944 		prev = ftl_rwb_entry_from_offset(dev->rwb, prev_ppa.offset);
945 		pthread_spin_lock(&prev->lock);
946 
947 		/* Re-read the L2P under the lock to protect against updates */
948 		/* to this LBA from other threads */
949 		prev_ppa = ftl_l2p_get(dev, entry->lba);
950 
951 		/* If the entry is no longer in cache, another write has been */
952 		/* scheduled in the meantime, so we have to invalidate its LBA */
953 		if (!ftl_ppa_cached(prev_ppa)) {
954 			ftl_invalidate_addr(dev, prev_ppa);
955 		}
956 
957 		/* If previous entry is part of cache, remove and invalidate it */
958 		if (ftl_rwb_entry_valid(prev)) {
959 			ftl_invalidate_addr(dev, prev->ppa);
960 			ftl_rwb_entry_invalidate(prev);
961 		}
962 
963 		ftl_l2p_set(dev, entry->lba, ppa);
964 		pthread_spin_unlock(&prev->lock);
965 		return;
966 	}
967 
968 	/* Lock the band containing the previous PPA. This ensures atomic changes */
969 	/* to the L2P as well as to the metadata. The valid bits in the metadata */
970 	/* are used to check the validity of weak writes. */
971 	band = ftl_band_from_ppa(dev, prev_ppa);
972 	pthread_spin_lock(&band->md.lock);
973 
974 	valid = ftl_invalidate_addr_unlocked(dev, prev_ppa);
975 
976 	/* If the address has been invalidated already, we don't want to update */
977 	/* the L2P for weak writes, as it means the write is no longer valid. */
978 	if (!ftl_rwb_entry_weak(entry) || valid) {
979 		ftl_l2p_set(dev, entry->lba, ppa);
980 	}
981 
982 	pthread_spin_unlock(&band->md.lock);
983 }
984 
985 static struct ftl_io *
986 ftl_io_init_child_write(struct ftl_io *parent, struct ftl_ppa ppa,
987 			void *data, void *md, spdk_ftl_fn cb)
988 {
989 	struct ftl_io *io;
990 	struct spdk_ftl_dev *dev = parent->dev;
991 	struct ftl_io_init_opts opts = {
992 		.dev		= dev,
993 		.io		= NULL,
994 		.parent		= parent,
995 		.rwb_batch	= NULL,
996 		.band		= parent->band,
997 		.size		= sizeof(struct ftl_io),
998 		.flags		= 0,
999 		.type		= FTL_IO_WRITE,
1000 		.iov_cnt	= 1,
1001 		.req_size	= dev->xfer_size,
1002 		.fn		= cb,
1003 		.data		= data,
1004 		.md		= md,
1005 	};
1006 
1007 	io = ftl_io_init_internal(&opts);
1008 	if (!io) {
1009 		return NULL;
1010 	}
1011 
1012 	io->ppa = ppa;
1013 
1014 	return io;
1015 }
1016 
1017 static void
1018 ftl_io_child_write_cb(void *ctx, int status)
1019 {
1020 	struct ftl_chunk *chunk;
1021 	struct ftl_io *io = ctx;
1022 
1023 	chunk = ftl_band_chunk_from_ppa(io->band, io->ppa);
1024 	chunk->busy = false;
1025 }
1026 
1027 static int
1028 ftl_submit_child_write(struct ftl_wptr *wptr, struct ftl_io *io, int lbk_cnt)
1029 {
1030 	struct spdk_ftl_dev	*dev = io->dev;
1031 	struct ftl_io		*child;
1032 	struct iovec		*iov = ftl_io_iovec(io);
1033 	int			rc;
1034 
1035 	/* Split the IO into child requests and release the chunk immediately after the child is completed */
1036 	child = ftl_io_init_child_write(io, wptr->ppa, iov[io->iov_pos].iov_base,
1037 					ftl_io_get_md(io), ftl_io_child_write_cb);
1038 	if (!child) {
1039 		return -EAGAIN;
1040 	}
1041 
1042 	rc = spdk_nvme_ns_cmd_write_with_md(dev->ns, ftl_get_write_qpair(dev),
1043 					    child->iov.iov_base, child->md,
1044 					    ftl_ppa_addr_pack(dev, wptr->ppa),
1045 					    lbk_cnt, ftl_io_cmpl_cb, child, 0, 0, 0);
1046 	if (rc) {
1047 		ftl_io_fail(child, rc);
1048 		ftl_io_complete(child);
1049 		SPDK_ERRLOG("spdk_nvme_ns_cmd_write failed with status:%d, ppa:%lu\n",
1050 			    rc, wptr->ppa.ppa);
1051 
1052 		return -EIO;
1053 	}
1054 
1055 	ftl_io_inc_req(child);
1056 	ftl_io_advance(child, lbk_cnt);
1057 
1058 	return 0;
1059 }
1060 
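/* Submit the parent write one xfer_size child IO at a time. If the target */
/* chunk is busy or a child IO cannot be allocated, the parent is stashed in */
/* wptr->current_io and resumed on the next poller iteration. */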
1061 static int
1062 ftl_submit_write(struct ftl_wptr *wptr, struct ftl_io *io)
1063 {
1064 	struct spdk_ftl_dev	*dev = io->dev;
1065 	struct iovec		*iov = ftl_io_iovec(io);
1066 	int			rc = 0;
1067 	size_t			lbk_cnt;
1068 
1069 	while (io->iov_pos < io->iov_cnt) {
1070 		lbk_cnt = iov[io->iov_pos].iov_len / PAGE_SIZE;
1071 		assert(iov[io->iov_pos].iov_len > 0);
1072 		assert(lbk_cnt == dev->xfer_size);
1073 
1074 		/* There are no guarantees on the completion order within the NVMe IO submission */
1075 		/* queue, so wait until the chunk is no longer busy before submitting another write */
1076 		if (wptr->chunk->busy) {
1077 			wptr->current_io = io;
1078 			rc = -EAGAIN;
1079 			break;
1080 		}
1081 
1082 		rc = ftl_submit_child_write(wptr, io, lbk_cnt);
1083 
1084 		if (rc == -EAGAIN) {
1085 			wptr->current_io = io;
1086 			break;
1087 		} else if (rc) {
1088 			ftl_io_fail(io, rc);
1089 			break;
1090 		}
1091 
1092 		ftl_trace_submission(dev, io, wptr->ppa, lbk_cnt);
1093 
1094 		/* Update parent iovec */
1095 		ftl_io_advance(io, lbk_cnt);
1096 
1097 		ftl_wptr_advance(wptr, lbk_cnt);
1098 	}
1099 
1100 	if (ftl_io_done(io)) {
1101 		/* Parent IO will complete after all children are completed */
1102 		ftl_io_complete(io);
1103 	}
1104 
1105 	return rc;
1106 }
1107 
1108 static void
1109 ftl_flush_pad_batch(struct spdk_ftl_dev *dev)
1110 {
1111 	struct ftl_rwb *rwb = dev->rwb;
1112 	size_t size;
1113 
1114 	size = ftl_rwb_num_acquired(rwb, FTL_RWB_TYPE_INTERNAL) +
1115 	       ftl_rwb_num_acquired(rwb, FTL_RWB_TYPE_USER);
1116 
1117 	/* There must be something in the RWB, otherwise the flush */
1118 	/* wouldn't be waiting for anything */
1119 	assert(size > 0);
1120 
1121 	/* Only add padding when there are fewer than xfer_size */
1122 	/* entries in the buffer. Otherwise we just have to wait */
1123 	/* for the entries to become ready. */
1124 	if (size < dev->xfer_size) {
1125 		ftl_rwb_pad(dev, dev->xfer_size - (size % dev->xfer_size));
1126 	}
1127 }
1128 
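/* Main write path of the core poller: pop a full batch from the write */
/* buffer, assign PPAs to its entries, update the band metadata for entries */
/* that are still current in the L2P and submit the batch as a single write. */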
1129 static int
1130 ftl_wptr_process_writes(struct ftl_wptr *wptr)
1131 {
1132 	struct spdk_ftl_dev	*dev = wptr->dev;
1133 	struct ftl_rwb_batch	*batch;
1134 	struct ftl_rwb_entry	*entry;
1135 	struct ftl_io		*io;
1136 	struct ftl_ppa		ppa, prev_ppa;
1137 
1138 	if (wptr->current_io) {
1139 		if (ftl_submit_write(wptr, wptr->current_io) == -EAGAIN) {
1140 			return 0;
1141 		}
1142 		wptr->current_io = NULL;
1143 	}
1144 
1145 	/* Make sure the band is prepared for writing */
1146 	if (!ftl_wptr_ready(wptr)) {
1147 		return 0;
1148 	}
1149 
1150 	if (dev->halt) {
1151 		ftl_process_shutdown(dev);
1152 	}
1153 
1154 	batch = ftl_rwb_pop(dev->rwb);
1155 	if (!batch) {
1156 		/* If there are queued flush requests we need to pad the RWB to */
1157 		/* force out remaining entries */
1158 		if (!LIST_EMPTY(&dev->flush_list)) {
1159 			ftl_flush_pad_batch(dev);
1160 		}
1161 
1162 		return 0;
1163 	}
1164 
1165 	io = ftl_io_rwb_init(dev, wptr->band, batch, ftl_write_cb);
1166 	if (!io) {
1167 		goto error;
1168 	}
1169 
1170 	ppa = wptr->ppa;
1171 	ftl_rwb_foreach(entry, batch) {
1172 		entry->ppa = ppa;
1173 
1174 		if (entry->lba != FTL_LBA_INVALID) {
1175 			pthread_spin_lock(&entry->lock);
1176 			prev_ppa = ftl_l2p_get(dev, entry->lba);
1177 
1178 			/* If the l2p was updated in the meantime, don't update band's metadata */
1179 			if (ftl_ppa_cached(prev_ppa) && prev_ppa.offset == entry->pos) {
1180 				/* Setting entry's cache bit needs to be done after metadata */
1181 				/* within the band is updated to make sure that writes */
1182 				/* invalidating the entry clear the metadata as well */
1183 				ftl_band_set_addr(wptr->band, entry->lba, entry->ppa);
1184 				ftl_rwb_entry_set_valid(entry);
1185 			}
1186 			pthread_spin_unlock(&entry->lock);
1187 		}
1188 
1189 		ftl_trace_rwb_pop(dev, entry);
1190 		ftl_update_rwb_stats(dev, entry);
1191 
1192 		ppa = ftl_band_next_ppa(wptr->band, ppa, 1);
1193 	}
1194 
1195 	SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Write ppa:%lx, %lx\n", wptr->ppa.ppa,
1196 		      ftl_ppa_addr_pack(dev, wptr->ppa));
1197 
1198 	if (ftl_submit_write(wptr, io)) {
1199 		/* TODO: we need some recovery here */
1200 		assert(0 && "Write submit failed");
1201 		if (ftl_io_done(io)) {
1202 			ftl_io_free(io);
1203 		}
1204 	}
1205 
1206 	return dev->xfer_size;
1207 error:
1208 	ftl_rwb_batch_revert(batch);
1209 	return 0;
1210 }
1211 
1212 static int
1213 ftl_process_writes(struct spdk_ftl_dev *dev)
1214 {
1215 	struct ftl_wptr *wptr, *twptr;
1216 	size_t num_active = 0;
1217 	enum ftl_band_state state;
1218 
1219 	LIST_FOREACH_SAFE(wptr, &dev->wptr_list, list_entry, twptr) {
1220 		ftl_wptr_process_writes(wptr);
1221 		state = wptr->band->state;
1222 
1223 		if (state != FTL_BAND_STATE_FULL &&
1224 		    state != FTL_BAND_STATE_CLOSING &&
1225 		    state != FTL_BAND_STATE_CLOSED) {
1226 			num_active++;
1227 		}
1228 	}
1229 
1230 	if (num_active < 1) {
1231 		ftl_add_wptr(dev);
1232 	}
1233 
1234 	return 0;
1235 }
1236 
1237 static void
1238 ftl_rwb_entry_fill(struct ftl_rwb_entry *entry, struct ftl_io *io)
1239 {
1240 	struct ftl_band *band;
1241 
1242 	memcpy(entry->data, ftl_io_iovec_addr(io), FTL_BLOCK_SIZE);
1243 
1244 	if (ftl_rwb_entry_weak(entry)) {
1245 		band = ftl_band_from_ppa(io->dev, io->ppa);
1246 		entry->ppa = ftl_band_next_ppa(band, io->ppa, io->pos);
1247 	}
1248 
1249 	entry->trace = io->trace;
1250 
1251 	if (entry->md) {
1252 		memcpy(entry->md, &entry->lba, sizeof(io->lba));
1253 	}
1254 }
1255 
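/* Copy a user write into the write buffer block by block. The L2P is pointed */
/* at the cached position before the entry is pushed, so that the write */
/* completion path sees a consistent mapping. */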
1256 static int
1257 ftl_rwb_fill(struct ftl_io *io)
1258 {
1259 	struct spdk_ftl_dev *dev = io->dev;
1260 	struct ftl_rwb_entry *entry;
1261 	struct ftl_ppa ppa = { .cached = 1 };
1262 	int flags = ftl_rwb_flags_from_io(io);
1263 	uint64_t lba;
1264 
1265 	while (io->pos < io->lbk_cnt) {
1266 		lba = ftl_io_current_lba(io);
1267 		if (lba == FTL_LBA_INVALID) {
1268 			ftl_io_advance(io, 1);
1269 			continue;
1270 		}
1271 
1272 		entry = ftl_acquire_entry(dev, flags);
1273 		if (!entry) {
1274 			return -EAGAIN;
1275 		}
1276 
1277 		entry->lba = lba;
1278 		ftl_rwb_entry_fill(entry, io);
1279 
1280 		ppa.offset = entry->pos;
1281 
1282 		ftl_io_advance(io, 1);
1283 		ftl_update_l2p(dev, entry, ppa);
1284 
1285 		/* Needs to be done after L2P is updated to avoid race with */
1286 		/* write completion callback when it's processed faster than */
1287 		/* L2P is set in update_l2p(). */
1288 		ftl_rwb_push(entry);
1289 		ftl_trace_rwb_fill(dev, io);
1290 	}
1291 
1292 	ftl_io_complete(io);
1293 	return 0;
1294 }
1295 
1296 static bool
1297 ftl_dev_needs_defrag(struct spdk_ftl_dev *dev)
1298 {
1299 	const struct spdk_ftl_limit *limit = ftl_get_limit(dev, SPDK_FTL_LIMIT_START);
1300 
1301 	if (ftl_reloc_is_halted(dev->reloc)) {
1302 		return false;
1303 	}
1304 
1305 	if (dev->df_band) {
1306 		return false;
1307 	}
1308 
1309 	if (dev->num_free <= limit->thld) {
1310 		return true;
1311 	}
1312 
1313 	return false;
1314 }
1315 
1316 static double
1317 ftl_band_calc_merit(struct ftl_band *band, size_t *threshold_valid)
1318 {
1319 	size_t usable, valid, invalid;
1320 	double vld_ratio;
1321 
1322 	/* If the band doesn't have any usable lbks it's of no use */
1323 	usable = ftl_band_num_usable_lbks(band);
1324 	if (usable == 0) {
1325 		return 0.0;
1326 	}
1327 
1328 	valid = threshold_valid ? (usable - *threshold_valid) : band->md.num_vld;
1329 	invalid = usable - valid;
1330 
1331 	/* Add one to avoid division by 0 */
1332 	vld_ratio = (double)invalid / (double)(valid + 1);
1333 	return vld_ratio * ftl_band_age(band);
1334 }
1335 
1336 static bool
1337 ftl_band_needs_defrag(struct ftl_band *band, struct spdk_ftl_dev *dev)
1338 {
1339 	struct spdk_ftl_conf *conf = &dev->conf;
1340 	size_t thld_vld;
1341 
1342 	/* If we're in dire need of free bands, every band is worth defragging */
1343 	if (ftl_current_limit(dev) == SPDK_FTL_LIMIT_CRIT) {
1344 		return true;
1345 	}
1346 
1347 	thld_vld = (ftl_band_num_usable_lbks(band) * conf->defrag.invalid_thld) / 100;
1348 
1349 	return band->merit > ftl_band_calc_merit(band, &thld_vld);
1350 }
1351 
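/* Pick the band to be defragged: the closed band with the highest merit */
/* (ratio of invalid to valid blocks weighted by band age), provided it */
/* crosses the configured invalid threshold or the device is critically */
/* low on free bands. */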
1352 static struct ftl_band *
1353 ftl_select_defrag_band(struct spdk_ftl_dev *dev)
1354 {
1355 	struct ftl_band *band, *mband = NULL;
1356 	double merit = 0;
1357 
1358 	LIST_FOREACH(band, &dev->shut_bands, list_entry) {
1359 		assert(band->state == FTL_BAND_STATE_CLOSED);
1360 		band->merit = ftl_band_calc_merit(band, NULL);
1361 		if (band->merit > merit) {
1362 			merit = band->merit;
1363 			mband = band;
1364 		}
1365 	}
1366 
1367 	if (mband && !ftl_band_needs_defrag(mband, dev)) {
1368 		mband = NULL;
1369 	}
1370 
1371 	return mband;
1372 }
1373 
1374 static void
1375 ftl_process_relocs(struct spdk_ftl_dev *dev)
1376 {
1377 	struct ftl_band *band;
1378 
1379 	if (ftl_dev_needs_defrag(dev)) {
1380 		band = dev->df_band = ftl_select_defrag_band(dev);
1381 
1382 		if (band) {
1383 			ftl_reloc_add(dev->reloc, band, 0, ftl_num_band_lbks(dev), 0);
1384 			ftl_trace_defrag_band(dev, band);
1385 		}
1386 	}
1387 
1388 	ftl_reloc(dev->reloc);
1389 }
1390 
1391 int
1392 ftl_current_limit(const struct spdk_ftl_dev *dev)
1393 {
1394 	return dev->limit;
1395 }
1396 
1397 void
1398 spdk_ftl_dev_get_attrs(const struct spdk_ftl_dev *dev, struct spdk_ftl_attrs *attrs)
1399 {
1400 	attrs->uuid = dev->uuid;
1401 	attrs->lbk_cnt = dev->num_lbas;
1402 	attrs->lbk_size = FTL_BLOCK_SIZE;
1403 	attrs->range = dev->range;
1404 	attrs->cache_bdev_desc = dev->cache_bdev_desc;
1405 }
1406 
1407 static void
1408 _ftl_io_write(void *ctx)
1409 {
1410 	ftl_io_write((struct ftl_io *)ctx);
1411 }
1412 
1413 int
1414 ftl_io_write(struct ftl_io *io)
1415 {
1416 	struct spdk_ftl_dev *dev = io->dev;
1417 
1418 	/* For normal IOs we just need to copy the data onto the rwb */
1419 	if (!(io->flags & FTL_IO_MD)) {
1420 		return ftl_rwb_fill(io);
1421 	}
1422 
1423 	/* Metadata has its own buffer, so it doesn't have to be copied; just */
1424 	/* send it to the core thread and schedule the write immediately */
1425 	if (ftl_check_core_thread(dev)) {
1426 		return ftl_submit_write(ftl_wptr_from_band(io->band), io);
1427 	}
1428 
1429 	spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_io_write, io);
1430 
1431 	return 0;
1432 }
1433 
1434 static int
1435 _spdk_ftl_write(struct ftl_io *io)
1436 {
1437 	int rc;
1438 
1439 	rc = ftl_io_write(io);
1440 	if (rc == -EAGAIN) {
1441 		spdk_thread_send_msg(spdk_io_channel_get_thread(io->ioch),
1442 				     _ftl_write, io);
1443 		return 0;
1444 	}
1445 
1446 	if (rc) {
1447 		ftl_io_free(io);
1448 	}
1449 
1450 	return rc;
1451 }
1452 
1453 static void
1454 _ftl_write(void *ctx)
1455 {
1456 	_spdk_ftl_write(ctx);
1457 }
1458 
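/* A minimal usage sketch (illustrative only; how the device handle and the
 * io_channel are obtained is outside the scope of this file). The iovecs
 * must span exactly lba_cnt * FTL_BLOCK_SIZE bytes:
 *
 *	static void write_cb(void *ctx, int status) { ... }
 *
 *	struct iovec iov = {
 *		.iov_base = buf,
 *		.iov_len  = lba_cnt * FTL_BLOCK_SIZE,
 *	};
 *	rc = spdk_ftl_write(dev, ioch, lba, lba_cnt, &iov, 1, write_cb, NULL);
 */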
1459 int
1460 spdk_ftl_write(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch, uint64_t lba, size_t lba_cnt,
1461 	       struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_arg)
1462 {
1463 	struct ftl_io *io;
1464 
1465 	if (iov_cnt == 0 || iov_cnt > FTL_MAX_IOV) {
1466 		return -EINVAL;
1467 	}
1468 
1469 	if (lba_cnt == 0) {
1470 		return -EINVAL;
1471 	}
1472 
1473 	if (lba_cnt != ftl_iovec_num_lbks(iov, iov_cnt)) {
1474 		return -EINVAL;
1475 	}
1476 
1477 	if (!dev->initialized) {
1478 		return -EBUSY;
1479 	}
1480 
1481 	io = ftl_io_alloc(ch);
1482 	if (!io) {
1483 		return -ENOMEM;
1484 	}
1485 
1486 	ftl_io_user_init(dev, io, lba, lba_cnt, iov, iov_cnt, cb_fn, cb_arg, FTL_IO_WRITE);
1487 	return _spdk_ftl_write(io);
1488 }
1489 
1490 int
1491 ftl_io_read(struct ftl_io *io)
1492 {
1493 	struct spdk_ftl_dev *dev = io->dev;
1494 	ftl_next_ppa_fn	next_ppa;
1495 
1496 	if (ftl_check_read_thread(dev)) {
1497 		if (ftl_io_mode_ppa(io)) {
1498 			next_ppa = ftl_ppa_read_next_ppa;
1499 		} else {
1500 			next_ppa = ftl_lba_read_next_ppa;
1501 		}
1502 
1503 		return ftl_submit_read(io, next_ppa, NULL);
1504 	}
1505 
1506 	spdk_thread_send_msg(ftl_get_read_thread(dev), _ftl_read, io);
1507 	return 0;
1508 }
1509 
1510 static void
1511 _ftl_read(void *arg)
1512 {
1513 	ftl_io_read((struct ftl_io *)arg);
1514 }
1515 
1516 int
1517 spdk_ftl_read(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch, uint64_t lba, size_t lba_cnt,
1518 	      struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_arg)
1519 {
1520 	struct ftl_io *io;
1521 
1522 	if (iov_cnt == 0 || iov_cnt > FTL_MAX_IOV) {
1523 		return -EINVAL;
1524 	}
1525 
1526 	if (lba_cnt == 0) {
1527 		return -EINVAL;
1528 	}
1529 
1530 	if (lba_cnt != ftl_iovec_num_lbks(iov, iov_cnt)) {
1531 		return -EINVAL;
1532 	}
1533 
1534 	if (!dev->initialized) {
1535 		return -EBUSY;
1536 	}
1537 
1538 	io = ftl_io_alloc(ch);
1539 	if (!io) {
1540 		return -ENOMEM;
1541 	}
1542 
1543 	ftl_io_user_init(dev, io, lba, lba_cnt, iov, iov_cnt, cb_fn, cb_arg, FTL_IO_READ);
1544 	ftl_io_read(io);
1545 	return 0;
1546 }
1547 
1548 static struct ftl_flush *
1549 ftl_flush_init(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg)
1550 {
1551 	struct ftl_flush *flush;
1552 	struct ftl_rwb *rwb = dev->rwb;
1553 
1554 	flush = calloc(1, sizeof(*flush));
1555 	if (!flush) {
1556 		return NULL;
1557 	}
1558 
1559 	flush->bmap = spdk_bit_array_create(ftl_rwb_num_batches(rwb));
1560 	if (!flush->bmap) {
1561 		goto error;
1562 	}
1563 
1564 	flush->dev = dev;
1565 	flush->cb.fn = cb_fn;
1566 	flush->cb.ctx = cb_arg;
1567 
1568 	return flush;
1569 error:
1570 	free(flush);
1571 	return NULL;
1572 }
1573 
1574 static void
1575 _ftl_flush(void *ctx)
1576 {
1577 	struct ftl_flush *flush = ctx;
1578 	struct spdk_ftl_dev *dev = flush->dev;
1579 	struct ftl_rwb *rwb = dev->rwb;
1580 	struct ftl_rwb_batch *batch;
1581 
1582 	/* Attach flush object to all non-empty batches */
1583 	ftl_rwb_foreach_batch(batch, rwb) {
1584 		if (!ftl_rwb_batch_empty(batch)) {
1585 			spdk_bit_array_set(flush->bmap, ftl_rwb_batch_get_offset(batch));
1586 			flush->num_req++;
1587 		}
1588 	}
1589 
1590 	LIST_INSERT_HEAD(&dev->flush_list, flush, list_entry);
1591 
1592 	/* If the RWB was already empty, the flush can be completed right away */
1593 	if (!flush->num_req) {
1594 		ftl_complete_flush(flush);
1595 	}
1596 }
1597 
1598 int
1599 spdk_ftl_flush(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg)
1600 {
1601 	struct ftl_flush *flush;
1602 
1603 	if (!dev->initialized) {
1604 		return -EBUSY;
1605 	}
1606 
1607 	flush = ftl_flush_init(dev, cb_fn, cb_arg);
1608 	if (!flush) {
1609 		return -ENOMEM;
1610 	}
1611 
1612 	spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_flush, flush);
1613 	return 0;
1614 }
1615 
1616 void
1617 ftl_process_anm_event(struct ftl_anm_event *event)
1618 {
1619 	SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Unconsumed ANM received for dev: %p...\n", event->dev);
1620 	ftl_anm_event_complete(event);
1621 }
1622 
1623 static void
1624 ftl_process_retry_queue(struct spdk_ftl_dev *dev)
1625 {
1626 	struct ftl_io *io;
1627 	int rc;
1628 
1629 	while (!TAILQ_EMPTY(&dev->retry_queue)) {
1630 		io = TAILQ_FIRST(&dev->retry_queue);
1631 
1632 		/* Retry only if IO is still healthy */
1633 		if (spdk_likely(io->status == 0)) {
1634 			rc = ftl_io_read(io);
1635 			if (rc == -ENOMEM) {
1636 				break;
1637 			}
1638 		}
1639 
1640 		io->flags &= ~FTL_IO_RETRY;
1641 		TAILQ_REMOVE(&dev->retry_queue, io, retry_entry);
1642 
1643 		if (ftl_io_done(io)) {
1644 			ftl_io_complete(io);
1645 		}
1646 	}
1647 }
1648 
1649 int
1650 ftl_task_read(void *ctx)
1651 {
1652 	struct ftl_thread *thread = ctx;
1653 	struct spdk_ftl_dev *dev = thread->dev;
1654 	struct spdk_nvme_qpair *qpair = ftl_get_read_qpair(dev);
1655 	size_t num_completed;
1656 
1657 	if (dev->halt) {
1658 		if (ftl_shutdown_complete(dev)) {
1659 			spdk_poller_unregister(&thread->poller);
1660 			return 0;
1661 		}
1662 	}
1663 
1664 	num_completed = spdk_nvme_qpair_process_completions(qpair, 0);
1665 
1666 	if (num_completed && !TAILQ_EMPTY(&dev->retry_queue)) {
1667 		ftl_process_retry_queue(dev);
1668 	}
1669 
1670 	return num_completed;
1671 }
1672 
1673 int
1674 ftl_task_core(void *ctx)
1675 {
1676 	struct ftl_thread *thread = ctx;
1677 	struct spdk_ftl_dev *dev = thread->dev;
1678 	struct spdk_nvme_qpair *qpair = ftl_get_write_qpair(dev);
1679 
1680 	if (dev->halt) {
1681 		if (ftl_shutdown_complete(dev)) {
1682 			spdk_poller_unregister(&thread->poller);
1683 			return 0;
1684 		}
1685 	}
1686 
1687 	ftl_process_writes(dev);
1688 	spdk_nvme_qpair_process_completions(qpair, 0);
1689 	ftl_process_relocs(dev);
1690 
1691 	return 0;
1692 }
1693 
1694 SPDK_LOG_REGISTER_COMPONENT("ftl_core", SPDK_LOG_FTL_CORE)
1695