xref: /freebsd-src/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c (revision 17aab35a77a1b1bf02fc85bb8ffadccb0ca5006d)
1eda14cbcSMatt Macy /*
2eda14cbcSMatt Macy  * CDDL HEADER START
3eda14cbcSMatt Macy  *
4eda14cbcSMatt Macy  * The contents of this file are subject to the terms of the
5eda14cbcSMatt Macy  * Common Development and Distribution License (the "License").
6eda14cbcSMatt Macy  * You may not use this file except in compliance with the License.
7eda14cbcSMatt Macy  *
8eda14cbcSMatt Macy  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9271171e0SMartin Matuska  * or https://opensource.org/licenses/CDDL-1.0.
10eda14cbcSMatt Macy  * See the License for the specific language governing permissions
11eda14cbcSMatt Macy  * and limitations under the License.
12eda14cbcSMatt Macy  *
13eda14cbcSMatt Macy  * When distributing Covered Code, include this CDDL HEADER in each
14eda14cbcSMatt Macy  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15eda14cbcSMatt Macy  * If applicable, add the following below this CDDL HEADER, with the
16eda14cbcSMatt Macy  * fields enclosed by brackets "[]" replaced with your own identifying
17eda14cbcSMatt Macy  * information: Portions Copyright [yyyy] [name of copyright owner]
18eda14cbcSMatt Macy  *
19eda14cbcSMatt Macy  * CDDL HEADER END
20eda14cbcSMatt Macy  */
21eda14cbcSMatt Macy /*
22eda14cbcSMatt Macy  * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
23eda14cbcSMatt Macy  * Copyright (c) 2019 by Delphix. All rights reserved.
24783d3ff6SMartin Matuska  * Copyright (c) 2023, 2024, Klara Inc.
25eda14cbcSMatt Macy  */
26eda14cbcSMatt Macy 
27eda14cbcSMatt Macy /*
28eda14cbcSMatt Macy  * See abd.c for a general overview of the ARC buffered data (ABD).
29eda14cbcSMatt Macy  *
30eda14cbcSMatt Macy  * Linear buffers act exactly like normal buffers and are always mapped into the
31eda14cbcSMatt Macy  * kernel's virtual memory space, while scattered ABD data chunks are allocated
32eda14cbcSMatt Macy  * as physical pages and then mapped in only while they are actually being
33eda14cbcSMatt Macy  * accessed through one of the abd_* library functions. Using scattered ABDs
34eda14cbcSMatt Macy  * provides several benefits:
35eda14cbcSMatt Macy  *
36eda14cbcSMatt Macy  *  (1) They avoid use of kmem_*, preventing performance problems where running
37eda14cbcSMatt Macy  *      kmem_reap on very large memory systems never finishes and causes
38eda14cbcSMatt Macy  *      constant TLB shootdowns.
39eda14cbcSMatt Macy  *
40eda14cbcSMatt Macy  *  (2) Fragmentation is less of an issue since when we are at the limit of
41eda14cbcSMatt Macy  *      allocatable space, we won't have to search around for a long free
42eda14cbcSMatt Macy  *      hole in the VA space for large ARC allocations. Each chunk is mapped in
43eda14cbcSMatt Macy  *      individually, so even if we are using HIGHMEM (see next point) we
44eda14cbcSMatt Macy  *      wouldn't need to worry about finding a contiguous address range.
45eda14cbcSMatt Macy  *
46eda14cbcSMatt Macy  *  (3) If we are not using HIGHMEM, then all physical memory is always
47eda14cbcSMatt Macy  *      mapped into the kernel's address space, so we also avoid the map /
48eda14cbcSMatt Macy  *      unmap costs on each ABD access.
49eda14cbcSMatt Macy  *
50eda14cbcSMatt Macy  * If we are not using HIGHMEM, scattered buffers which have only one chunk
51eda14cbcSMatt Macy  * can be treated as linear buffers, because they are contiguous in the
52eda14cbcSMatt Macy  * kernel's virtual address space.  See abd_alloc_chunks() for details.
53eda14cbcSMatt Macy  */
54eda14cbcSMatt Macy 
55eda14cbcSMatt Macy #include <sys/abd_impl.h>
56eda14cbcSMatt Macy #include <sys/param.h>
57eda14cbcSMatt Macy #include <sys/zio.h>
58eda14cbcSMatt Macy #include <sys/arc.h>
59eda14cbcSMatt Macy #include <sys/zfs_context.h>
60eda14cbcSMatt Macy #include <sys/zfs_znode.h>
61eda14cbcSMatt Macy #include <linux/kmap_compat.h>
62783d3ff6SMartin Matuska #include <linux/mm_compat.h>
63eda14cbcSMatt Macy #include <linux/scatterlist.h>
64783d3ff6SMartin Matuska #include <linux/version.h>
65fd45b686SMartin Matuska 
66fd45b686SMartin Matuska #if defined(MAX_ORDER)
67fd45b686SMartin Matuska #define	ABD_MAX_ORDER	(MAX_ORDER)
68fd45b686SMartin Matuska #elif defined(MAX_PAGE_ORDER)
69fd45b686SMartin Matuska #define	ABD_MAX_ORDER	(MAX_PAGE_ORDER)
70fd45b686SMartin Matuska #endif
71eda14cbcSMatt Macy 
72eda14cbcSMatt Macy typedef struct abd_stats {
73eda14cbcSMatt Macy 	kstat_named_t abdstat_struct_size;
74eda14cbcSMatt Macy 	kstat_named_t abdstat_linear_cnt;
75eda14cbcSMatt Macy 	kstat_named_t abdstat_linear_data_size;
76eda14cbcSMatt Macy 	kstat_named_t abdstat_scatter_cnt;
77eda14cbcSMatt Macy 	kstat_named_t abdstat_scatter_data_size;
78eda14cbcSMatt Macy 	kstat_named_t abdstat_scatter_chunk_waste;
79fd45b686SMartin Matuska 	kstat_named_t abdstat_scatter_orders[ABD_MAX_ORDER];
80eda14cbcSMatt Macy 	kstat_named_t abdstat_scatter_page_multi_chunk;
81eda14cbcSMatt Macy 	kstat_named_t abdstat_scatter_page_multi_zone;
82eda14cbcSMatt Macy 	kstat_named_t abdstat_scatter_page_alloc_retry;
83eda14cbcSMatt Macy 	kstat_named_t abdstat_scatter_sg_table_retry;
84eda14cbcSMatt Macy } abd_stats_t;
85eda14cbcSMatt Macy 
86eda14cbcSMatt Macy static abd_stats_t abd_stats = {
87eda14cbcSMatt Macy 	/* Amount of memory occupied by all of the abd_t struct allocations */
88eda14cbcSMatt Macy 	{ "struct_size",			KSTAT_DATA_UINT64 },
89eda14cbcSMatt Macy 	/*
90eda14cbcSMatt Macy 	 * The number of linear ABDs which are currently allocated, excluding
91eda14cbcSMatt Macy 	 * ABDs which don't own their data (for instance the ones which were
92eda14cbcSMatt Macy 	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
93eda14cbcSMatt Macy 	 * ABD takes ownership of its buf then it will become tracked.
94eda14cbcSMatt Macy 	 */
95eda14cbcSMatt Macy 	{ "linear_cnt",				KSTAT_DATA_UINT64 },
96eda14cbcSMatt Macy 	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
97eda14cbcSMatt Macy 	{ "linear_data_size",			KSTAT_DATA_UINT64 },
98eda14cbcSMatt Macy 	/*
99eda14cbcSMatt Macy 	 * The number of scatter ABDs which are currently allocated, excluding
100eda14cbcSMatt Macy 	 * ABDs which don't own their data (for instance the ones which were
101eda14cbcSMatt Macy 	 * allocated through abd_get_offset()).
102eda14cbcSMatt Macy 	 */
103eda14cbcSMatt Macy 	{ "scatter_cnt",			KSTAT_DATA_UINT64 },
104eda14cbcSMatt Macy 	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
105eda14cbcSMatt Macy 	{ "scatter_data_size",			KSTAT_DATA_UINT64 },
106eda14cbcSMatt Macy 	/*
107eda14cbcSMatt Macy 	 * The amount of space wasted at the end of the last chunk across all
108eda14cbcSMatt Macy 	 * scatter ABDs tracked by scatter_cnt.
109eda14cbcSMatt Macy 	 */
110eda14cbcSMatt Macy 	{ "scatter_chunk_waste",		KSTAT_DATA_UINT64 },
111eda14cbcSMatt Macy 	/*
112eda14cbcSMatt Macy 	 * The number of compound allocations of a given order.  These
113eda14cbcSMatt Macy 	 * allocations are spread over all currently allocated ABDs, and
114eda14cbcSMatt Macy 	 * act as a measure of memory fragmentation.
115eda14cbcSMatt Macy 	 */
116eda14cbcSMatt Macy 	{ { "scatter_order_N",			KSTAT_DATA_UINT64 } },
117eda14cbcSMatt Macy 	/*
118eda14cbcSMatt Macy 	 * The number of scatter ABDs which contain multiple chunks.
119eda14cbcSMatt Macy 	 * ABDs are preferentially allocated from the minimum number of
120eda14cbcSMatt Macy 	 * contiguous multi-page chunks; a single chunk is optimal.
121eda14cbcSMatt Macy 	 */
122eda14cbcSMatt Macy 	{ "scatter_page_multi_chunk",		KSTAT_DATA_UINT64 },
123eda14cbcSMatt Macy 	/*
124eda14cbcSMatt Macy 	 * The number of scatter ABDs which are split across memory zones.
125eda14cbcSMatt Macy 	 * ABDs are preferentially allocated using pages from a single zone.
126eda14cbcSMatt Macy 	 */
127eda14cbcSMatt Macy 	{ "scatter_page_multi_zone",		KSTAT_DATA_UINT64 },
128eda14cbcSMatt Macy 	/*
129eda14cbcSMatt Macy 	 *  The total number of retries encountered when attempting to
130eda14cbcSMatt Macy 	 *  allocate the pages to populate the scatter ABD.
131eda14cbcSMatt Macy 	 */
132eda14cbcSMatt Macy 	{ "scatter_page_alloc_retry",		KSTAT_DATA_UINT64 },
133eda14cbcSMatt Macy 	/*
134eda14cbcSMatt Macy 	 *  The total number of retries encountered when attempting to
135eda14cbcSMatt Macy 	 *  allocate the sg table for an ABD.
136eda14cbcSMatt Macy 	 */
137eda14cbcSMatt Macy 	{ "scatter_sg_table_retry",		KSTAT_DATA_UINT64 },
138eda14cbcSMatt Macy };
139eda14cbcSMatt Macy 
140dbd5678dSMartin Matuska static struct {
1410d8fe237SMartin Matuska 	wmsum_t abdstat_struct_size;
1420d8fe237SMartin Matuska 	wmsum_t abdstat_linear_cnt;
1430d8fe237SMartin Matuska 	wmsum_t abdstat_linear_data_size;
1440d8fe237SMartin Matuska 	wmsum_t abdstat_scatter_cnt;
1450d8fe237SMartin Matuska 	wmsum_t abdstat_scatter_data_size;
1460d8fe237SMartin Matuska 	wmsum_t abdstat_scatter_chunk_waste;
147fd45b686SMartin Matuska 	wmsum_t abdstat_scatter_orders[ABD_MAX_ORDER];
1480d8fe237SMartin Matuska 	wmsum_t abdstat_scatter_page_multi_chunk;
1490d8fe237SMartin Matuska 	wmsum_t abdstat_scatter_page_multi_zone;
1500d8fe237SMartin Matuska 	wmsum_t abdstat_scatter_page_alloc_retry;
1510d8fe237SMartin Matuska 	wmsum_t abdstat_scatter_sg_table_retry;
1520d8fe237SMartin Matuska } abd_sums;
1530d8fe237SMartin Matuska 
154eda14cbcSMatt Macy #define	abd_for_each_sg(abd, sg, n, i)	\
155eda14cbcSMatt Macy 	for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)
156eda14cbcSMatt Macy 
157eda14cbcSMatt Macy /*
158eda14cbcSMatt Macy  * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
159eda14cbcSMatt Macy  * ABD's.  Smaller allocations will use linear ABD's, which use
160eda14cbcSMatt Macy  * zio_[data_]buf_alloc().
161eda14cbcSMatt Macy  *
162eda14cbcSMatt Macy  * Scatter ABD's use at least one page each, so sub-page allocations waste
163eda14cbcSMatt Macy  * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
164eda14cbcSMatt Macy  * half of each page).  Using linear ABD's for small allocations means that
165eda14cbcSMatt Macy  * they will be put on slabs which contain many allocations.  This can
166eda14cbcSMatt Macy  * improve memory efficiency, but it also makes it much harder for ARC
167eda14cbcSMatt Macy  * evictions to actually free pages, because all the buffers on one slab need
168eda14cbcSMatt Macy  * to be freed in order for the slab (and underlying pages) to be freed.
169eda14cbcSMatt Macy  * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
170eda14cbcSMatt Macy  * possible for them to actually waste more memory than scatter (one page per
171eda14cbcSMatt Macy  * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
172eda14cbcSMatt Macy  *
173eda14cbcSMatt Macy  * Spill blocks are typically 512B and are heavily used on systems running
174eda14cbcSMatt Macy  * selinux with the default dnode size and the `xattr=sa` property set.
175eda14cbcSMatt Macy  *
176eda14cbcSMatt Macy  * By default we use linear allocations for 512B and 1KB, and scatter
177eda14cbcSMatt Macy  * allocations for larger (1.5KB and up).
178eda14cbcSMatt Macy  */
179e92ffd9bSMartin Matuska static int zfs_abd_scatter_min_size = 512 * 3;
180eda14cbcSMatt Macy 
181eda14cbcSMatt Macy /*
182eda14cbcSMatt Macy  * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are
183eda14cbcSMatt Macy  * just a single zero'd page. This allows us to conserve memory by
184eda14cbcSMatt Macy  * only using a single zero page for the scatterlist.
185eda14cbcSMatt Macy  */
186eda14cbcSMatt Macy abd_t *abd_zero_scatter = NULL;
187eda14cbcSMatt Macy 
188eda14cbcSMatt Macy struct page;
1897a7741afSMartin Matuska 
190eda14cbcSMatt Macy /*
191e2df9bb4SMartin Matuska  * abd_zero_page is assigned to each of the pages of abd_zero_scatter. It will
192e2df9bb4SMartin Matuska  * point to ZERO_PAGE if it is available or it will be an allocated zero'd
193e2df9bb4SMartin Matuska  * PAGESIZE buffer.
194eda14cbcSMatt Macy  */
195eda14cbcSMatt Macy static struct page *abd_zero_page = NULL;
196eda14cbcSMatt Macy 
197eda14cbcSMatt Macy static kmem_cache_t *abd_cache = NULL;
198eda14cbcSMatt Macy static kstat_t *abd_ksp;
199eda14cbcSMatt Macy 
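/*
 * Return the number of PAGESIZE chunks required to hold size bytes,
 * rounding any partial page up to a whole page.
 */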
2007877fdebSMatt Macy static uint_t
201eda14cbcSMatt Macy abd_chunkcnt_for_bytes(size_t size)
202eda14cbcSMatt Macy {
203eda14cbcSMatt Macy 	return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
204eda14cbcSMatt Macy }
205eda14cbcSMatt Macy 
206eda14cbcSMatt Macy abd_t *
207184c1b94SMartin Matuska abd_alloc_struct_impl(size_t size)
208eda14cbcSMatt Macy {
209eda14cbcSMatt Macy 	/*
210eda14cbcSMatt Macy 	 * In Linux we do not use the size passed in during ABD
211eda14cbcSMatt Macy 	 * allocation, so we just ignore it.
212eda14cbcSMatt Macy 	 */
213e92ffd9bSMartin Matuska 	(void) size;
214eda14cbcSMatt Macy 	abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
215eda14cbcSMatt Macy 	ASSERT3P(abd, !=, NULL);
216eda14cbcSMatt Macy 	ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));
217eda14cbcSMatt Macy 
218eda14cbcSMatt Macy 	return (abd);
219eda14cbcSMatt Macy }
220eda14cbcSMatt Macy 
221eda14cbcSMatt Macy void
222184c1b94SMartin Matuska abd_free_struct_impl(abd_t *abd)
223eda14cbcSMatt Macy {
224eda14cbcSMatt Macy 	kmem_cache_free(abd_cache, abd);
225eda14cbcSMatt Macy 	ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
226eda14cbcSMatt Macy }
227eda14cbcSMatt Macy 
228fd45b686SMartin Matuska static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1;
229e92ffd9bSMartin Matuska 
230eda14cbcSMatt Macy /*
231eda14cbcSMatt Macy  * Mark zfs data pages so they can be excluded from kernel crash dumps
232eda14cbcSMatt Macy  */
233eda14cbcSMatt Macy #ifdef _LP64
234eda14cbcSMatt Macy #define	ABD_FILE_CACHE_PAGE	0x2F5ABDF11ECAC4E
235eda14cbcSMatt Macy 
236eda14cbcSMatt Macy static inline void
237eda14cbcSMatt Macy abd_mark_zfs_page(struct page *page)
238eda14cbcSMatt Macy {
239eda14cbcSMatt Macy 	get_page(page);
240eda14cbcSMatt Macy 	SetPagePrivate(page);
241eda14cbcSMatt Macy 	set_page_private(page, ABD_FILE_CACHE_PAGE);
242eda14cbcSMatt Macy }
243eda14cbcSMatt Macy 
244eda14cbcSMatt Macy static inline void
245eda14cbcSMatt Macy abd_unmark_zfs_page(struct page *page)
246eda14cbcSMatt Macy {
247eda14cbcSMatt Macy 	set_page_private(page, 0UL);
248eda14cbcSMatt Macy 	ClearPagePrivate(page);
249eda14cbcSMatt Macy 	put_page(page);
250eda14cbcSMatt Macy }
251eda14cbcSMatt Macy #else
252eda14cbcSMatt Macy #define	abd_mark_zfs_page(page)
253eda14cbcSMatt Macy #define	abd_unmark_zfs_page(page)
254eda14cbcSMatt Macy #endif /* _LP64 */
255eda14cbcSMatt Macy 
256eda14cbcSMatt Macy #ifndef CONFIG_HIGHMEM
257eda14cbcSMatt Macy 
258eda14cbcSMatt Macy #ifndef __GFP_RECLAIM
259eda14cbcSMatt Macy #define	__GFP_RECLAIM		__GFP_WAIT
260eda14cbcSMatt Macy #endif
261eda14cbcSMatt Macy 
262eda14cbcSMatt Macy /*
263eda14cbcSMatt Macy  * The goal is to minimize fragmentation by preferentially populating ABDs
264eda14cbcSMatt Macy  * with higher order compound pages from a single zone.  Allocation size is
265eda14cbcSMatt Macy  * progressively decreased until it can be satisfied without performing
266eda14cbcSMatt Macy  * reclaim or compaction.  When necessary this function will degenerate to
267eda14cbcSMatt Macy  * allocating individual pages and allowing reclaim to satisfy allocations.
268eda14cbcSMatt Macy  */
269eda14cbcSMatt Macy void
270eda14cbcSMatt Macy abd_alloc_chunks(abd_t *abd, size_t size)
271eda14cbcSMatt Macy {
272eda14cbcSMatt Macy 	struct list_head pages;
273eda14cbcSMatt Macy 	struct sg_table table;
274eda14cbcSMatt Macy 	struct scatterlist *sg;
275eda14cbcSMatt Macy 	struct page *page, *tmp_page = NULL;
276ce4dcb97SMartin Matuska 	gfp_t gfp = __GFP_RECLAIMABLE | __GFP_NOWARN | GFP_NOIO;
277eda14cbcSMatt Macy 	gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
278fd45b686SMartin Matuska 	unsigned int max_order = MIN(zfs_abd_scatter_max_order,
279fd45b686SMartin Matuska 	    ABD_MAX_ORDER - 1);
280c9539b89SMartin Matuska 	unsigned int nr_pages = abd_chunkcnt_for_bytes(size);
281c9539b89SMartin Matuska 	unsigned int chunks = 0, zones = 0;
282eda14cbcSMatt Macy 	size_t remaining_size;
283eda14cbcSMatt Macy 	int nid = NUMA_NO_NODE;
284c9539b89SMartin Matuska 	unsigned int alloc_pages = 0;
285eda14cbcSMatt Macy 
286eda14cbcSMatt Macy 	INIT_LIST_HEAD(&pages);
287eda14cbcSMatt Macy 
288c9539b89SMartin Matuska 	ASSERT3U(alloc_pages, <, nr_pages);
289c9539b89SMartin Matuska 
290eda14cbcSMatt Macy 	while (alloc_pages < nr_pages) {
291c9539b89SMartin Matuska 		unsigned int chunk_pages;
292c9539b89SMartin Matuska 		unsigned int order;
293eda14cbcSMatt Macy 
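		/*
		 * Request the largest power-of-2 chunk that does not exceed
		 * the number of pages still needed, capped at max_order.
		 */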
294eda14cbcSMatt Macy 		order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
295eda14cbcSMatt Macy 		chunk_pages = (1U << order);
296eda14cbcSMatt Macy 
297eda14cbcSMatt Macy 		page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
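		/*
		 * If the allocation failed, fall back to a smaller order, or
		 * briefly sleep and retry once we are down to single pages.
		 */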
298eda14cbcSMatt Macy 		if (page == NULL) {
299eda14cbcSMatt Macy 			if (order == 0) {
300eda14cbcSMatt Macy 				ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
301eda14cbcSMatt Macy 				schedule_timeout_interruptible(1);
302eda14cbcSMatt Macy 			} else {
303eda14cbcSMatt Macy 				max_order = MAX(0, order - 1);
304eda14cbcSMatt Macy 			}
305eda14cbcSMatt Macy 			continue;
306eda14cbcSMatt Macy 		}
307eda14cbcSMatt Macy 
308eda14cbcSMatt Macy 		list_add_tail(&page->lru, &pages);
309eda14cbcSMatt Macy 
310eda14cbcSMatt Macy 		if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
311eda14cbcSMatt Macy 			zones++;
312eda14cbcSMatt Macy 
313eda14cbcSMatt Macy 		nid = page_to_nid(page);
314eda14cbcSMatt Macy 		ABDSTAT_BUMP(abdstat_scatter_orders[order]);
315eda14cbcSMatt Macy 		chunks++;
316eda14cbcSMatt Macy 		alloc_pages += chunk_pages;
317eda14cbcSMatt Macy 	}
318eda14cbcSMatt Macy 
319eda14cbcSMatt Macy 	ASSERT3S(alloc_pages, ==, nr_pages);
320eda14cbcSMatt Macy 
321eda14cbcSMatt Macy 	while (sg_alloc_table(&table, chunks, gfp)) {
322eda14cbcSMatt Macy 		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
323eda14cbcSMatt Macy 		schedule_timeout_interruptible(1);
324eda14cbcSMatt Macy 	}
325eda14cbcSMatt Macy 
326eda14cbcSMatt Macy 	sg = table.sgl;
327eda14cbcSMatt Macy 	remaining_size = size;
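	/*
	 * Populate the scatterlist with the allocated chunks, trimming the
	 * final entry to the requested size.
	 */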
328eda14cbcSMatt Macy 	list_for_each_entry_safe(page, tmp_page, &pages, lru) {
329eda14cbcSMatt Macy 		size_t sg_size = MIN(PAGESIZE << compound_order(page),
330eda14cbcSMatt Macy 		    remaining_size);
331eda14cbcSMatt Macy 		sg_set_page(sg, page, sg_size, 0);
332eda14cbcSMatt Macy 		abd_mark_zfs_page(page);
333eda14cbcSMatt Macy 		remaining_size -= sg_size;
334eda14cbcSMatt Macy 
335eda14cbcSMatt Macy 		sg = sg_next(sg);
336eda14cbcSMatt Macy 		list_del(&page->lru);
337eda14cbcSMatt Macy 	}
338eda14cbcSMatt Macy 
339eda14cbcSMatt Macy 	/*
340eda14cbcSMatt Macy 	 * These conditions ensure that a possible transformation to a linear
341eda14cbcSMatt Macy 	 * ABD would be valid.
342eda14cbcSMatt Macy 	 */
343eda14cbcSMatt Macy 	ASSERT(!PageHighMem(sg_page(table.sgl)));
344eda14cbcSMatt Macy 	ASSERT0(ABD_SCATTER(abd).abd_offset);
345eda14cbcSMatt Macy 
346eda14cbcSMatt Macy 	if (table.nents == 1) {
347eda14cbcSMatt Macy 		/*
348eda14cbcSMatt Macy 		 * Since there is only one entry, this ABD can be represented
349eda14cbcSMatt Macy 		 * as a linear buffer.  All single-page (4K) ABD's can be
350eda14cbcSMatt Macy 		 * represented this way.  Some multi-page ABD's can also be
351eda14cbcSMatt Macy 		 * represented this way, if we were able to allocate a single
352eda14cbcSMatt Macy 		 * "chunk" (higher-order "page" which represents a power-of-2
353eda14cbcSMatt Macy 		 * series of physically-contiguous pages).  This is often the
354eda14cbcSMatt Macy 		 * case for 2-page (8K) ABD's.
355eda14cbcSMatt Macy 		 *
356eda14cbcSMatt Macy 		 * Representing a single-entry scatter ABD as a linear ABD
357eda14cbcSMatt Macy 		 * has the performance advantage of avoiding the copy (and
358eda14cbcSMatt Macy 		 * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
359eda14cbcSMatt Macy 		 * A performance increase of around 5% has been observed for
360eda14cbcSMatt Macy 		 * ARC-cached reads (of small blocks which can take advantage
361eda14cbcSMatt Macy 		 * of this).
362eda14cbcSMatt Macy 		 *
363eda14cbcSMatt Macy 		 * Note that this optimization is only possible because the
364eda14cbcSMatt Macy 		 * pages are always mapped into the kernel's address space.
365eda14cbcSMatt Macy 		 * This is not the case for highmem pages, so the
366eda14cbcSMatt Macy 		 * optimization can not be made there.
367eda14cbcSMatt Macy 		 */
368eda14cbcSMatt Macy 		abd->abd_flags |= ABD_FLAG_LINEAR;
369eda14cbcSMatt Macy 		abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
370eda14cbcSMatt Macy 		abd->abd_u.abd_linear.abd_sgl = table.sgl;
371eda14cbcSMatt Macy 		ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl));
372eda14cbcSMatt Macy 	} else if (table.nents > 1) {
373eda14cbcSMatt Macy 		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
374eda14cbcSMatt Macy 		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
375eda14cbcSMatt Macy 
376eda14cbcSMatt Macy 		if (zones) {
377eda14cbcSMatt Macy 			ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
378eda14cbcSMatt Macy 			abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
379eda14cbcSMatt Macy 		}
380eda14cbcSMatt Macy 
381eda14cbcSMatt Macy 		ABD_SCATTER(abd).abd_sgl = table.sgl;
382eda14cbcSMatt Macy 		ABD_SCATTER(abd).abd_nents = table.nents;
383eda14cbcSMatt Macy 	}
384eda14cbcSMatt Macy }
385eda14cbcSMatt Macy #else
386eda14cbcSMatt Macy 
387eda14cbcSMatt Macy /*
388eda14cbcSMatt Macy  * Allocate N individual pages to construct a scatter ABD.  This function
389eda14cbcSMatt Macy  * makes no attempt to request contiguous pages and requires the minimal
390eda14cbcSMatt Macy  * number of kernel interfaces.  It's designed for maximum compatibility.
391eda14cbcSMatt Macy  */
392eda14cbcSMatt Macy void
393eda14cbcSMatt Macy abd_alloc_chunks(abd_t *abd, size_t size)
394eda14cbcSMatt Macy {
395eda14cbcSMatt Macy 	struct scatterlist *sg = NULL;
396eda14cbcSMatt Macy 	struct sg_table table;
397eda14cbcSMatt Macy 	struct page *page;
398ce4dcb97SMartin Matuska 	gfp_t gfp = __GFP_RECLAIMABLE | __GFP_NOWARN | GFP_NOIO;
399eda14cbcSMatt Macy 	int nr_pages = abd_chunkcnt_for_bytes(size);
400eda14cbcSMatt Macy 	int i = 0;
401eda14cbcSMatt Macy 
402eda14cbcSMatt Macy 	while (sg_alloc_table(&table, nr_pages, gfp)) {
403eda14cbcSMatt Macy 		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
404eda14cbcSMatt Macy 		schedule_timeout_interruptible(1);
405eda14cbcSMatt Macy 	}
406eda14cbcSMatt Macy 
407eda14cbcSMatt Macy 	ASSERT3U(table.nents, ==, nr_pages);
408eda14cbcSMatt Macy 	ABD_SCATTER(abd).abd_sgl = table.sgl;
409eda14cbcSMatt Macy 	ABD_SCATTER(abd).abd_nents = nr_pages;
410eda14cbcSMatt Macy 
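	/*
	 * Allocate one page for each scatterlist entry, sleeping briefly and
	 * retrying until every allocation succeeds.
	 */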
411eda14cbcSMatt Macy 	abd_for_each_sg(abd, sg, nr_pages, i) {
412eda14cbcSMatt Macy 		while ((page = __page_cache_alloc(gfp)) == NULL) {
413eda14cbcSMatt Macy 			ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
414eda14cbcSMatt Macy 			schedule_timeout_interruptible(1);
415eda14cbcSMatt Macy 		}
416eda14cbcSMatt Macy 
417eda14cbcSMatt Macy 		ABDSTAT_BUMP(abdstat_scatter_orders[0]);
418eda14cbcSMatt Macy 		sg_set_page(sg, page, PAGESIZE, 0);
419eda14cbcSMatt Macy 		abd_mark_zfs_page(page);
420eda14cbcSMatt Macy 	}
421eda14cbcSMatt Macy 
422eda14cbcSMatt Macy 	if (nr_pages > 1) {
423eda14cbcSMatt Macy 		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
424eda14cbcSMatt Macy 		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
425eda14cbcSMatt Macy 	}
426eda14cbcSMatt Macy }
427eda14cbcSMatt Macy #endif /* !CONFIG_HIGHMEM */
428eda14cbcSMatt Macy 
429eda14cbcSMatt Macy /*
430eda14cbcSMatt Macy  * This must be called to free the sg_table whenever any of the sg_table
431eda14cbcSMatt Macy  * allocation functions have been called.
432eda14cbcSMatt Macy  */
433eda14cbcSMatt Macy static void
434eda14cbcSMatt Macy abd_free_sg_table(abd_t *abd)
435eda14cbcSMatt Macy {
436eda14cbcSMatt Macy 	struct sg_table table;
437eda14cbcSMatt Macy 
438eda14cbcSMatt Macy 	table.sgl = ABD_SCATTER(abd).abd_sgl;
439eda14cbcSMatt Macy 	table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents;
440eda14cbcSMatt Macy 	sg_free_table(&table);
441eda14cbcSMatt Macy }
442eda14cbcSMatt Macy 
443eda14cbcSMatt Macy void
444eda14cbcSMatt Macy abd_free_chunks(abd_t *abd)
445eda14cbcSMatt Macy {
446eda14cbcSMatt Macy 	struct scatterlist *sg = NULL;
447eda14cbcSMatt Macy 	struct page *page;
448eda14cbcSMatt Macy 	int nr_pages = ABD_SCATTER(abd).abd_nents;
449eda14cbcSMatt Macy 	int order, i = 0;
450eda14cbcSMatt Macy 
451eda14cbcSMatt Macy 	if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
452eda14cbcSMatt Macy 		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);
453eda14cbcSMatt Macy 
454eda14cbcSMatt Macy 	if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
455eda14cbcSMatt Macy 		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
456eda14cbcSMatt Macy 
4577a7741afSMartin Matuska 	/*
4587a7741afSMartin Matuska 	 * Scatter ABDs may be constructed by abd_alloc_from_pages() from
4597a7741afSMartin Matuska 	 * an array of pages, in which case the pages should not be freed here.
4607a7741afSMartin Matuska 	 */
4617a7741afSMartin Matuska 	if (!abd_is_from_pages(abd)) {
462eda14cbcSMatt Macy 		abd_for_each_sg(abd, sg, nr_pages, i) {
463eda14cbcSMatt Macy 			page = sg_page(sg);
464eda14cbcSMatt Macy 			abd_unmark_zfs_page(page);
465eda14cbcSMatt Macy 			order = compound_order(page);
466eda14cbcSMatt Macy 			__free_pages(page, order);
467eda14cbcSMatt Macy 			ASSERT3U(sg->length, <=, PAGE_SIZE << order);
468eda14cbcSMatt Macy 			ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
469eda14cbcSMatt Macy 		}
4707a7741afSMartin Matuska 	}
4717a7741afSMartin Matuska 
472eda14cbcSMatt Macy 	abd_free_sg_table(abd);
473eda14cbcSMatt Macy }
474eda14cbcSMatt Macy 
475eda14cbcSMatt Macy /*
476eda14cbcSMatt Macy  * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where each page in
477eda14cbcSMatt Macy  * the scatterlist will be set to the zero'd out buffer abd_zero_page.
478eda14cbcSMatt Macy  */
479eda14cbcSMatt Macy static void
480eda14cbcSMatt Macy abd_alloc_zero_scatter(void)
481eda14cbcSMatt Macy {
482eda14cbcSMatt Macy 	struct scatterlist *sg = NULL;
483eda14cbcSMatt Macy 	struct sg_table table;
484eda14cbcSMatt Macy 	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
485eda14cbcSMatt Macy 	int nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
486eda14cbcSMatt Macy 	int i = 0;
487eda14cbcSMatt Macy 
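	/*
	 * Obtain the zero page: allocate our own zero'd page when the
	 * kernel's ZERO_PAGE is GPL-only, otherwise reuse ZERO_PAGE(0).
	 */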
488da5137abSMartin Matuska #if defined(HAVE_ZERO_PAGE_GPL_ONLY)
489da5137abSMartin Matuska 	gfp_t gfp_zero_page = gfp | __GFP_ZERO;
490eda14cbcSMatt Macy 	while ((abd_zero_page = __page_cache_alloc(gfp_zero_page)) == NULL) {
491eda14cbcSMatt Macy 		ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
492eda14cbcSMatt Macy 		schedule_timeout_interruptible(1);
493eda14cbcSMatt Macy 	}
494eda14cbcSMatt Macy 	abd_mark_zfs_page(abd_zero_page);
495da5137abSMartin Matuska #else
496da5137abSMartin Matuska 	abd_zero_page = ZERO_PAGE(0);
497da5137abSMartin Matuska #endif /* HAVE_ZERO_PAGE_GPL_ONLY */
498eda14cbcSMatt Macy 
499eda14cbcSMatt Macy 	while (sg_alloc_table(&table, nr_pages, gfp)) {
500eda14cbcSMatt Macy 		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
501eda14cbcSMatt Macy 		schedule_timeout_interruptible(1);
502eda14cbcSMatt Macy 	}
503eda14cbcSMatt Macy 	ASSERT3U(table.nents, ==, nr_pages);
504eda14cbcSMatt Macy 
505eda14cbcSMatt Macy 	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
506184c1b94SMartin Matuska 	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
507eda14cbcSMatt Macy 	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
508eda14cbcSMatt Macy 	ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
509eda14cbcSMatt Macy 	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
510eda14cbcSMatt Macy 	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
511e2df9bb4SMartin Matuska 	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK;
512eda14cbcSMatt Macy 
513eda14cbcSMatt Macy 	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
514eda14cbcSMatt Macy 		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
515eda14cbcSMatt Macy 	}
516eda14cbcSMatt Macy 
517eda14cbcSMatt Macy 	ABDSTAT_BUMP(abdstat_scatter_cnt);
518eda14cbcSMatt Macy 	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
519eda14cbcSMatt Macy 	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
520eda14cbcSMatt Macy }
521eda14cbcSMatt Macy 
522eda14cbcSMatt Macy boolean_t
523eda14cbcSMatt Macy abd_size_alloc_linear(size_t size)
524eda14cbcSMatt Macy {
5251f88aa09SMartin Matuska 	return (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size);
526eda14cbcSMatt Macy }
527eda14cbcSMatt Macy 
528eda14cbcSMatt Macy void
529eda14cbcSMatt Macy abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
530eda14cbcSMatt Macy {
531eda14cbcSMatt Macy 	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
532eda14cbcSMatt Macy 	int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size;
533eda14cbcSMatt Macy 	if (op == ABDSTAT_INCR) {
534eda14cbcSMatt Macy 		ABDSTAT_BUMP(abdstat_scatter_cnt);
535eda14cbcSMatt Macy 		ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
536eda14cbcSMatt Macy 		ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste);
537eda14cbcSMatt Macy 		arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
538eda14cbcSMatt Macy 	} else {
539eda14cbcSMatt Macy 		ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
540eda14cbcSMatt Macy 		ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
541eda14cbcSMatt Macy 		ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste);
542eda14cbcSMatt Macy 		arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
543eda14cbcSMatt Macy 	}
544eda14cbcSMatt Macy }
545eda14cbcSMatt Macy 
546eda14cbcSMatt Macy void
547eda14cbcSMatt Macy abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
548eda14cbcSMatt Macy {
549eda14cbcSMatt Macy 	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
550eda14cbcSMatt Macy 	if (op == ABDSTAT_INCR) {
551eda14cbcSMatt Macy 		ABDSTAT_BUMP(abdstat_linear_cnt);
552eda14cbcSMatt Macy 		ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
553eda14cbcSMatt Macy 	} else {
554eda14cbcSMatt Macy 		ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
555eda14cbcSMatt Macy 		ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
556eda14cbcSMatt Macy 	}
557eda14cbcSMatt Macy }
558eda14cbcSMatt Macy 
559eda14cbcSMatt Macy void
560eda14cbcSMatt Macy abd_verify_scatter(abd_t *abd)
561eda14cbcSMatt Macy {
562eda14cbcSMatt Macy 	ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
563eda14cbcSMatt Macy 	ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
564eda14cbcSMatt Macy 	    ABD_SCATTER(abd).abd_sgl->length);
5657a7741afSMartin Matuska 
5667a7741afSMartin Matuska #ifdef ZFS_DEBUG
5677a7741afSMartin Matuska 	struct scatterlist *sg = NULL;
5687a7741afSMartin Matuska 	size_t n = ABD_SCATTER(abd).abd_nents;
5697a7741afSMartin Matuska 	int i = 0;
5707a7741afSMartin Matuska 
571eda14cbcSMatt Macy 	abd_for_each_sg(abd, sg, n, i) {
572eda14cbcSMatt Macy 		ASSERT3P(sg_page(sg), !=, NULL);
573eda14cbcSMatt Macy 	}
5747a7741afSMartin Matuska #endif
575eda14cbcSMatt Macy }
576eda14cbcSMatt Macy 
577eda14cbcSMatt Macy static void
578eda14cbcSMatt Macy abd_free_zero_scatter(void)
579eda14cbcSMatt Macy {
580eda14cbcSMatt Macy 	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
581eda14cbcSMatt Macy 	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE);
582eda14cbcSMatt Macy 	ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
583eda14cbcSMatt Macy 
584eda14cbcSMatt Macy 	abd_free_sg_table(abd_zero_scatter);
585eda14cbcSMatt Macy 	abd_free_struct(abd_zero_scatter);
586eda14cbcSMatt Macy 	abd_zero_scatter = NULL;
587eda14cbcSMatt Macy 	ASSERT3P(abd_zero_page, !=, NULL);
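	/*
	 * Only free the zero page if we allocated it ourselves; the kernel's
	 * shared ZERO_PAGE must not be freed.
	 */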
588da5137abSMartin Matuska #if defined(HAVE_ZERO_PAGE_GPL_ONLY)
589eda14cbcSMatt Macy 	abd_unmark_zfs_page(abd_zero_page);
590eda14cbcSMatt Macy 	__free_page(abd_zero_page);
591da5137abSMartin Matuska #endif /* HAVE_ZERO_PAGE_GPL_ONLY */
592eda14cbcSMatt Macy }
593eda14cbcSMatt Macy 
5940d8fe237SMartin Matuska static int
5950d8fe237SMartin Matuska abd_kstats_update(kstat_t *ksp, int rw)
5960d8fe237SMartin Matuska {
5970d8fe237SMartin Matuska 	abd_stats_t *as = ksp->ks_data;
5980d8fe237SMartin Matuska 
5990d8fe237SMartin Matuska 	if (rw == KSTAT_WRITE)
6000d8fe237SMartin Matuska 		return (EACCES);
6010d8fe237SMartin Matuska 	as->abdstat_struct_size.value.ui64 =
6020d8fe237SMartin Matuska 	    wmsum_value(&abd_sums.abdstat_struct_size);
6030d8fe237SMartin Matuska 	as->abdstat_linear_cnt.value.ui64 =
6040d8fe237SMartin Matuska 	    wmsum_value(&abd_sums.abdstat_linear_cnt);
6050d8fe237SMartin Matuska 	as->abdstat_linear_data_size.value.ui64 =
6060d8fe237SMartin Matuska 	    wmsum_value(&abd_sums.abdstat_linear_data_size);
6070d8fe237SMartin Matuska 	as->abdstat_scatter_cnt.value.ui64 =
6080d8fe237SMartin Matuska 	    wmsum_value(&abd_sums.abdstat_scatter_cnt);
6090d8fe237SMartin Matuska 	as->abdstat_scatter_data_size.value.ui64 =
6100d8fe237SMartin Matuska 	    wmsum_value(&abd_sums.abdstat_scatter_data_size);
6110d8fe237SMartin Matuska 	as->abdstat_scatter_chunk_waste.value.ui64 =
6120d8fe237SMartin Matuska 	    wmsum_value(&abd_sums.abdstat_scatter_chunk_waste);
613fd45b686SMartin Matuska 	for (int i = 0; i < ABD_MAX_ORDER; i++) {
6140d8fe237SMartin Matuska 		as->abdstat_scatter_orders[i].value.ui64 =
6150d8fe237SMartin Matuska 		    wmsum_value(&abd_sums.abdstat_scatter_orders[i]);
6160d8fe237SMartin Matuska 	}
6170d8fe237SMartin Matuska 	as->abdstat_scatter_page_multi_chunk.value.ui64 =
6180d8fe237SMartin Matuska 	    wmsum_value(&abd_sums.abdstat_scatter_page_multi_chunk);
6190d8fe237SMartin Matuska 	as->abdstat_scatter_page_multi_zone.value.ui64 =
6200d8fe237SMartin Matuska 	    wmsum_value(&abd_sums.abdstat_scatter_page_multi_zone);
6210d8fe237SMartin Matuska 	as->abdstat_scatter_page_alloc_retry.value.ui64 =
6220d8fe237SMartin Matuska 	    wmsum_value(&abd_sums.abdstat_scatter_page_alloc_retry);
6230d8fe237SMartin Matuska 	as->abdstat_scatter_sg_table_retry.value.ui64 =
6240d8fe237SMartin Matuska 	    wmsum_value(&abd_sums.abdstat_scatter_sg_table_retry);
6250d8fe237SMartin Matuska 	return (0);
6260d8fe237SMartin Matuska }
6270d8fe237SMartin Matuska 
628eda14cbcSMatt Macy void
629eda14cbcSMatt Macy abd_init(void)
630eda14cbcSMatt Macy {
631eda14cbcSMatt Macy 	int i;
632eda14cbcSMatt Macy 
633eda14cbcSMatt Macy 	abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
634ce4dcb97SMartin Matuska 	    0, NULL, NULL, NULL, NULL, NULL, KMC_RECLAIMABLE);
635eda14cbcSMatt Macy 
6360d8fe237SMartin Matuska 	wmsum_init(&abd_sums.abdstat_struct_size, 0);
6370d8fe237SMartin Matuska 	wmsum_init(&abd_sums.abdstat_linear_cnt, 0);
6380d8fe237SMartin Matuska 	wmsum_init(&abd_sums.abdstat_linear_data_size, 0);
6390d8fe237SMartin Matuska 	wmsum_init(&abd_sums.abdstat_scatter_cnt, 0);
6400d8fe237SMartin Matuska 	wmsum_init(&abd_sums.abdstat_scatter_data_size, 0);
6410d8fe237SMartin Matuska 	wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0);
642fd45b686SMartin Matuska 	for (i = 0; i < ABD_MAX_ORDER; i++)
6430d8fe237SMartin Matuska 		wmsum_init(&abd_sums.abdstat_scatter_orders[i], 0);
6440d8fe237SMartin Matuska 	wmsum_init(&abd_sums.abdstat_scatter_page_multi_chunk, 0);
6450d8fe237SMartin Matuska 	wmsum_init(&abd_sums.abdstat_scatter_page_multi_zone, 0);
6460d8fe237SMartin Matuska 	wmsum_init(&abd_sums.abdstat_scatter_page_alloc_retry, 0);
6470d8fe237SMartin Matuska 	wmsum_init(&abd_sums.abdstat_scatter_sg_table_retry, 0);
6480d8fe237SMartin Matuska 
649eda14cbcSMatt Macy 	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
650eda14cbcSMatt Macy 	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
651eda14cbcSMatt Macy 	if (abd_ksp != NULL) {
652fd45b686SMartin Matuska 		for (i = 0; i < ABD_MAX_ORDER; i++) {
653eda14cbcSMatt Macy 			snprintf(abd_stats.abdstat_scatter_orders[i].name,
654eda14cbcSMatt Macy 			    KSTAT_STRLEN, "scatter_order_%d", i);
655eda14cbcSMatt Macy 			abd_stats.abdstat_scatter_orders[i].data_type =
656eda14cbcSMatt Macy 			    KSTAT_DATA_UINT64;
657eda14cbcSMatt Macy 		}
658eda14cbcSMatt Macy 		abd_ksp->ks_data = &abd_stats;
6590d8fe237SMartin Matuska 		abd_ksp->ks_update = abd_kstats_update;
660eda14cbcSMatt Macy 		kstat_install(abd_ksp);
661eda14cbcSMatt Macy 	}
662eda14cbcSMatt Macy 
663eda14cbcSMatt Macy 	abd_alloc_zero_scatter();
664eda14cbcSMatt Macy }
665eda14cbcSMatt Macy 
666eda14cbcSMatt Macy void
667eda14cbcSMatt Macy abd_fini(void)
668eda14cbcSMatt Macy {
669eda14cbcSMatt Macy 	abd_free_zero_scatter();
670eda14cbcSMatt Macy 
671eda14cbcSMatt Macy 	if (abd_ksp != NULL) {
672eda14cbcSMatt Macy 		kstat_delete(abd_ksp);
673eda14cbcSMatt Macy 		abd_ksp = NULL;
674eda14cbcSMatt Macy 	}
675eda14cbcSMatt Macy 
6760d8fe237SMartin Matuska 	wmsum_fini(&abd_sums.abdstat_struct_size);
6770d8fe237SMartin Matuska 	wmsum_fini(&abd_sums.abdstat_linear_cnt);
6780d8fe237SMartin Matuska 	wmsum_fini(&abd_sums.abdstat_linear_data_size);
6790d8fe237SMartin Matuska 	wmsum_fini(&abd_sums.abdstat_scatter_cnt);
6800d8fe237SMartin Matuska 	wmsum_fini(&abd_sums.abdstat_scatter_data_size);
6810d8fe237SMartin Matuska 	wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste);
682fd45b686SMartin Matuska 	for (int i = 0; i < ABD_MAX_ORDER; i++)
6830d8fe237SMartin Matuska 		wmsum_fini(&abd_sums.abdstat_scatter_orders[i]);
6840d8fe237SMartin Matuska 	wmsum_fini(&abd_sums.abdstat_scatter_page_multi_chunk);
6850d8fe237SMartin Matuska 	wmsum_fini(&abd_sums.abdstat_scatter_page_multi_zone);
6860d8fe237SMartin Matuska 	wmsum_fini(&abd_sums.abdstat_scatter_page_alloc_retry);
6870d8fe237SMartin Matuska 	wmsum_fini(&abd_sums.abdstat_scatter_sg_table_retry);
6880d8fe237SMartin Matuska 
689eda14cbcSMatt Macy 	if (abd_cache) {
690eda14cbcSMatt Macy 		kmem_cache_destroy(abd_cache);
691eda14cbcSMatt Macy 		abd_cache = NULL;
692eda14cbcSMatt Macy 	}
693eda14cbcSMatt Macy }
694eda14cbcSMatt Macy 
695eda14cbcSMatt Macy void
696eda14cbcSMatt Macy abd_free_linear_page(abd_t *abd)
697eda14cbcSMatt Macy {
698eda14cbcSMatt Macy 	/* Transform it back into a scatter ABD for freeing */
699eda14cbcSMatt Macy 	struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
7007a7741afSMartin Matuska 
7017a7741afSMartin Matuska 	/* When backed by a user page, unmap it */
7027a7741afSMartin Matuska 	if (abd_is_from_pages(abd))
7037a7741afSMartin Matuska 		zfs_kunmap(sg_page(sg));
704*5c65a0a9SMartin Matuska 	else
705*5c65a0a9SMartin Matuska 		abd_update_scatter_stats(abd, ABDSTAT_DECR);
7067a7741afSMartin Matuska 
707eda14cbcSMatt Macy 	abd->abd_flags &= ~ABD_FLAG_LINEAR;
708eda14cbcSMatt Macy 	abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
709eda14cbcSMatt Macy 	ABD_SCATTER(abd).abd_nents = 1;
710eda14cbcSMatt Macy 	ABD_SCATTER(abd).abd_offset = 0;
711eda14cbcSMatt Macy 	ABD_SCATTER(abd).abd_sgl = sg;
712eda14cbcSMatt Macy 	abd_free_chunks(abd);
7137a7741afSMartin Matuska }
714eda14cbcSMatt Macy 
7157a7741afSMartin Matuska /*
7167a7741afSMartin Matuska  * Allocate a scatter ABD structure from user pages. The pages must be
7177a7741afSMartin Matuska  * pinned with get_user_pages, or similar, but need not be mapped via
7187a7741afSMartin Matuska  * the kmap interfaces.
7197a7741afSMartin Matuska  */
7207a7741afSMartin Matuska abd_t *
7217a7741afSMartin Matuska abd_alloc_from_pages(struct page **pages, unsigned long offset, uint64_t size)
7227a7741afSMartin Matuska {
7237a7741afSMartin Matuska 	uint_t npages = DIV_ROUND_UP(size, PAGE_SIZE);
7247a7741afSMartin Matuska 	struct sg_table table;
7257a7741afSMartin Matuska 
7267a7741afSMartin Matuska 	VERIFY3U(size, <=, DMU_MAX_ACCESS);
7277a7741afSMartin Matuska 	ASSERT3U(offset, <, PAGE_SIZE);
7287a7741afSMartin Matuska 	ASSERT3P(pages, !=, NULL);
7297a7741afSMartin Matuska 
7307a7741afSMartin Matuska 	/*
7317a7741afSMartin Matuska 	 * Even if this buf is filesystem metadata, we only track that if we
7327a7741afSMartin Matuska 	 * own the underlying data buffer, which is not true in this case.
7337a7741afSMartin Matuska 	 * Therefore, we don't ever use ABD_FLAG_META here.
7347a7741afSMartin Matuska 	 */
7357a7741afSMartin Matuska 	abd_t *abd = abd_alloc_struct(0);
7367a7741afSMartin Matuska 	abd->abd_flags |= ABD_FLAG_FROM_PAGES | ABD_FLAG_OWNER;
7377a7741afSMartin Matuska 	abd->abd_size = size;
7387a7741afSMartin Matuska 
7397a7741afSMartin Matuska 	while (sg_alloc_table_from_pages(&table, pages, npages, offset,
7407a7741afSMartin Matuska 	    size, __GFP_NOWARN | GFP_NOIO) != 0) {
7417a7741afSMartin Matuska 		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
7427a7741afSMartin Matuska 		schedule_timeout_interruptible(1);
7437a7741afSMartin Matuska 	}
7447a7741afSMartin Matuska 
7457a7741afSMartin Matuska 	if ((offset + size) <= PAGE_SIZE) {
7467a7741afSMartin Matuska 		/*
7477a7741afSMartin Matuska 		 * Since there is only one entry, this ABD can be represented
7487a7741afSMartin Matuska 		 * as a linear buffer. All single-page (4K) ABD's constructed
7497a7741afSMartin Matuska 		 * from a user page can be represented this way as long as the
7507a7741afSMartin Matuska 		 * page is mapped to a virtual address. This allows us to
7517a7741afSMartin Matuska 		 * apply an offset into the mapped page.
7527a7741afSMartin Matuska 		 *
7537a7741afSMartin Matuska 		 * Note that kmap() must be used, not kmap_atomic(), because
7547a7741afSMartin Matuska 		 * the mapping needs to be set up on all CPUs. Using kmap()
7557a7741afSMartin Matuska 		 * also enables the use of highmem pages when required.
7567a7741afSMartin Matuska 		 */
7577a7741afSMartin Matuska 		abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE;
7587a7741afSMartin Matuska 		abd->abd_u.abd_linear.abd_sgl = table.sgl;
7597a7741afSMartin Matuska 		zfs_kmap(sg_page(table.sgl));
7607a7741afSMartin Matuska 		ABD_LINEAR_BUF(abd) = sg_virt(table.sgl);
7617a7741afSMartin Matuska 	} else {
7627a7741afSMartin Matuska 		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
7637a7741afSMartin Matuska 		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
7647a7741afSMartin Matuska 
7657a7741afSMartin Matuska 		ABD_SCATTER(abd).abd_offset = offset;
7667a7741afSMartin Matuska 		ABD_SCATTER(abd).abd_sgl = table.sgl;
7677a7741afSMartin Matuska 		ABD_SCATTER(abd).abd_nents = table.nents;
7687a7741afSMartin Matuska 
7697a7741afSMartin Matuska 		ASSERT0(ABD_SCATTER(abd).abd_offset);
7707a7741afSMartin Matuska 	}
7717a7741afSMartin Matuska 
7727a7741afSMartin Matuska 	return (abd);
773eda14cbcSMatt Macy }
774eda14cbcSMatt Macy 
775eda14cbcSMatt Macy /*
776eda14cbcSMatt Macy  * If we're going to use this ABD for doing I/O using the block layer, the
777eda14cbcSMatt Macy  * consumer of the ABD data doesn't care if it's scattered or not, and we don't
778eda14cbcSMatt Macy  * plan to store this ABD in memory for a long period of time, then we should
779eda14cbcSMatt Macy  * allocate the ABD type that requires the least data copying to do the I/O.
780eda14cbcSMatt Macy  *
781eda14cbcSMatt Macy  * On Linux the optimal thing to do would be to use abd_get_offset() and
782eda14cbcSMatt Macy  * construct a new ABD which shares the original pages thereby eliminating
783eda14cbcSMatt Macy  * the copy.  But for the moment a new linear ABD is allocated until this
784eda14cbcSMatt Macy  * performance optimization can be implemented.
785eda14cbcSMatt Macy  */
786eda14cbcSMatt Macy abd_t *
787eda14cbcSMatt Macy abd_alloc_for_io(size_t size, boolean_t is_metadata)
788eda14cbcSMatt Macy {
789eda14cbcSMatt Macy 	return (abd_alloc(size, is_metadata));
790eda14cbcSMatt Macy }
791eda14cbcSMatt Macy 
792eda14cbcSMatt Macy abd_t *
7937cd22ac4SMartin Matuska abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
7947cd22ac4SMartin Matuska     size_t size)
795eda14cbcSMatt Macy {
796e92ffd9bSMartin Matuska 	(void) size;
797eda14cbcSMatt Macy 	int i = 0;
798eda14cbcSMatt Macy 	struct scatterlist *sg = NULL;
799eda14cbcSMatt Macy 
800eda14cbcSMatt Macy 	abd_verify(sabd);
801eda14cbcSMatt Macy 	ASSERT3U(off, <=, sabd->abd_size);
802eda14cbcSMatt Macy 
803eda14cbcSMatt Macy 	size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
804eda14cbcSMatt Macy 
805184c1b94SMartin Matuska 	if (abd == NULL)
806eda14cbcSMatt Macy 		abd = abd_alloc_struct(0);
807eda14cbcSMatt Macy 
808eda14cbcSMatt Macy 	/*
809eda14cbcSMatt Macy 	 * Even if this buf is filesystem metadata, we only track that
810eda14cbcSMatt Macy 	 * if we own the underlying data buffer, which is not true in
811eda14cbcSMatt Macy 	 * this case. Therefore, we don't ever use ABD_FLAG_META here.
812eda14cbcSMatt Macy 	 */
813eda14cbcSMatt Macy 
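	/*
	 * Skip scatterlist entries that lie entirely before the requested
	 * offset, leaving new_offset relative to the entry containing it.
	 */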
814eda14cbcSMatt Macy 	abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
815eda14cbcSMatt Macy 		if (new_offset < sg->length)
816eda14cbcSMatt Macy 			break;
817eda14cbcSMatt Macy 		new_offset -= sg->length;
818eda14cbcSMatt Macy 	}
819eda14cbcSMatt Macy 
820eda14cbcSMatt Macy 	ABD_SCATTER(abd).abd_sgl = sg;
821eda14cbcSMatt Macy 	ABD_SCATTER(abd).abd_offset = new_offset;
822eda14cbcSMatt Macy 	ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;
823eda14cbcSMatt Macy 
8247a7741afSMartin Matuska 	if (abd_is_from_pages(sabd))
8257a7741afSMartin Matuska 		abd->abd_flags |= ABD_FLAG_FROM_PAGES;
8267a7741afSMartin Matuska 
827eda14cbcSMatt Macy 	return (abd);
828eda14cbcSMatt Macy }
829eda14cbcSMatt Macy 
830eda14cbcSMatt Macy /*
831eda14cbcSMatt Macy  * Initialize the abd_iter.
832eda14cbcSMatt Macy  */
833eda14cbcSMatt Macy void
834eda14cbcSMatt Macy abd_iter_init(struct abd_iter *aiter, abd_t *abd)
835eda14cbcSMatt Macy {
836eda14cbcSMatt Macy 	ASSERT(!abd_is_gang(abd));
837eda14cbcSMatt Macy 	abd_verify(abd);
838783d3ff6SMartin Matuska 	memset(aiter, 0, sizeof (struct abd_iter));
839eda14cbcSMatt Macy 	aiter->iter_abd = abd;
840783d3ff6SMartin Matuska 	if (!abd_is_linear(abd)) {
841eda14cbcSMatt Macy 		aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
842eda14cbcSMatt Macy 		aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
843eda14cbcSMatt Macy 	}
844eda14cbcSMatt Macy }
845eda14cbcSMatt Macy 
846eda14cbcSMatt Macy /*
847eda14cbcSMatt Macy  * This is just a helper function to see if we have exhausted the
848eda14cbcSMatt Macy  * abd_iter and reached the end.
849eda14cbcSMatt Macy  */
850eda14cbcSMatt Macy boolean_t
851eda14cbcSMatt Macy abd_iter_at_end(struct abd_iter *aiter)
852eda14cbcSMatt Macy {
853783d3ff6SMartin Matuska 	ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
854eda14cbcSMatt Macy 	return (aiter->iter_pos == aiter->iter_abd->abd_size);
855eda14cbcSMatt Macy }
856eda14cbcSMatt Macy 
857eda14cbcSMatt Macy /*
858eda14cbcSMatt Macy  * Advance the iterator by a certain amount. Cannot be called when a chunk is
859eda14cbcSMatt Macy  * in use. This can be safely called when the aiter has already exhausted, in
859eda14cbcSMatt Macy  * in use. This can be safely called when the aiter is already exhausted, in
861eda14cbcSMatt Macy  */
862eda14cbcSMatt Macy void
863eda14cbcSMatt Macy abd_iter_advance(struct abd_iter *aiter, size_t amount)
864eda14cbcSMatt Macy {
865783d3ff6SMartin Matuska 	/*
866783d3ff6SMartin Matuska 	 * Ensure that last chunk is not in use. abd_iterate_*() must clear
867783d3ff6SMartin Matuska 	 * this state (directly or abd_iter_unmap()) before advancing.
868783d3ff6SMartin Matuska 	 */
869eda14cbcSMatt Macy 	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
870eda14cbcSMatt Macy 	ASSERT0(aiter->iter_mapsize);
871783d3ff6SMartin Matuska 	ASSERT3P(aiter->iter_page, ==, NULL);
872783d3ff6SMartin Matuska 	ASSERT0(aiter->iter_page_doff);
873783d3ff6SMartin Matuska 	ASSERT0(aiter->iter_page_dsize);
874eda14cbcSMatt Macy 
875eda14cbcSMatt Macy 	/* There's nothing left to advance to, so do nothing */
876eda14cbcSMatt Macy 	if (abd_iter_at_end(aiter))
877eda14cbcSMatt Macy 		return;
878eda14cbcSMatt Macy 
879eda14cbcSMatt Macy 	aiter->iter_pos += amount;
880eda14cbcSMatt Macy 	aiter->iter_offset += amount;
881eda14cbcSMatt Macy 	if (!abd_is_linear(aiter->iter_abd)) {
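		/*
		 * Step through scatterlist entries until the offset falls
		 * within the current entry (or the end of the list is hit).
		 */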
882eda14cbcSMatt Macy 		while (aiter->iter_offset >= aiter->iter_sg->length) {
883eda14cbcSMatt Macy 			aiter->iter_offset -= aiter->iter_sg->length;
884eda14cbcSMatt Macy 			aiter->iter_sg = sg_next(aiter->iter_sg);
885eda14cbcSMatt Macy 			if (aiter->iter_sg == NULL) {
886eda14cbcSMatt Macy 				ASSERT0(aiter->iter_offset);
887eda14cbcSMatt Macy 				break;
888eda14cbcSMatt Macy 			}
889eda14cbcSMatt Macy 		}
890eda14cbcSMatt Macy 	}
891eda14cbcSMatt Macy }
892eda14cbcSMatt Macy 
893eda14cbcSMatt Macy /*
894eda14cbcSMatt Macy  * Map the current chunk into aiter. This can be safely called when the aiter
895eda14cbcSMatt Macy  * is already exhausted, in which case this does nothing.
896eda14cbcSMatt Macy  */
897eda14cbcSMatt Macy void
898eda14cbcSMatt Macy abd_iter_map(struct abd_iter *aiter)
899eda14cbcSMatt Macy {
900eda14cbcSMatt Macy 	void *paddr;
901eda14cbcSMatt Macy 	size_t offset = 0;
902eda14cbcSMatt Macy 
903eda14cbcSMatt Macy 	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
904eda14cbcSMatt Macy 	ASSERT0(aiter->iter_mapsize);
905eda14cbcSMatt Macy 
906eda14cbcSMatt Macy 	/* There's nothing left to iterate over, so do nothing */
907eda14cbcSMatt Macy 	if (abd_iter_at_end(aiter))
908eda14cbcSMatt Macy 		return;
909eda14cbcSMatt Macy 
910eda14cbcSMatt Macy 	if (abd_is_linear(aiter->iter_abd)) {
911eda14cbcSMatt Macy 		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
912eda14cbcSMatt Macy 		offset = aiter->iter_offset;
913eda14cbcSMatt Macy 		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
914eda14cbcSMatt Macy 		paddr = ABD_LINEAR_BUF(aiter->iter_abd);
915eda14cbcSMatt Macy 	} else {
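		/*
		 * Map the current scatterlist page, clamping the mapping to
		 * both the entry length and the remaining ABD size.
		 */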
916eda14cbcSMatt Macy 		offset = aiter->iter_offset;
917eda14cbcSMatt Macy 		aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
918eda14cbcSMatt Macy 		    aiter->iter_abd->abd_size - aiter->iter_pos);
919eda14cbcSMatt Macy 
92075e1fea6SMartin Matuska 		paddr = zfs_kmap_local(sg_page(aiter->iter_sg));
921eda14cbcSMatt Macy 	}
922eda14cbcSMatt Macy 
923eda14cbcSMatt Macy 	aiter->iter_mapaddr = (char *)paddr + offset;
924eda14cbcSMatt Macy }
925eda14cbcSMatt Macy 
926eda14cbcSMatt Macy /*
927eda14cbcSMatt Macy  * Unmap the current chunk from aiter. This can be safely called when the aiter
928eda14cbcSMatt Macy  * is already exhausted, in which case this does nothing.
929eda14cbcSMatt Macy  */
930eda14cbcSMatt Macy void
931eda14cbcSMatt Macy abd_iter_unmap(struct abd_iter *aiter)
932eda14cbcSMatt Macy {
933eda14cbcSMatt Macy 	/* There's nothing left to unmap, so do nothing */
934eda14cbcSMatt Macy 	if (abd_iter_at_end(aiter))
935eda14cbcSMatt Macy 		return;
936eda14cbcSMatt Macy 
937eda14cbcSMatt Macy 	if (!abd_is_linear(aiter->iter_abd)) {
938eda14cbcSMatt Macy 		/* LINTED E_FUNC_SET_NOT_USED */
93975e1fea6SMartin Matuska 		zfs_kunmap_local(aiter->iter_mapaddr - aiter->iter_offset);
940eda14cbcSMatt Macy 	}
941eda14cbcSMatt Macy 
942eda14cbcSMatt Macy 	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
943eda14cbcSMatt Macy 	ASSERT3U(aiter->iter_mapsize, >, 0);
944eda14cbcSMatt Macy 
945eda14cbcSMatt Macy 	aiter->iter_mapaddr = NULL;
946eda14cbcSMatt Macy 	aiter->iter_mapsize = 0;
947eda14cbcSMatt Macy }
948eda14cbcSMatt Macy 
949eda14cbcSMatt Macy void
950eda14cbcSMatt Macy abd_cache_reap_now(void)
951eda14cbcSMatt Macy {
952eda14cbcSMatt Macy }
953eda14cbcSMatt Macy 
954eda14cbcSMatt Macy /*
9557a7741afSMartin Matuska  * Borrow a raw buffer from an ABD without copying the contents of the ABD
9567a7741afSMartin Matuska  * into the buffer. If the ABD is scattered, this will allocate a raw buffer
9577a7741afSMartin Matuska  * whose contents are undefined. To copy over the existing data in the ABD, use
9587a7741afSMartin Matuska  * abd_borrow_buf_copy() instead.
9597a7741afSMartin Matuska  */
9607a7741afSMartin Matuska void *
9617a7741afSMartin Matuska abd_borrow_buf(abd_t *abd, size_t n)
9627a7741afSMartin Matuska {
9637a7741afSMartin Matuska 	void *buf;
9647a7741afSMartin Matuska 	abd_verify(abd);
9657a7741afSMartin Matuska 	ASSERT3U(abd->abd_size, >=, 0);
9667a7741afSMartin Matuska 	/*
9677a7741afSMartin Matuska 	 * In the event the ABD is composed of a single user page from Direct
9687a7741afSMartin Matuska 	 * I/O, we cannot directly return the raw buffer. This is a consequence
9697a7741afSMartin Matuska 	 * of not being able to write protect the page: its contents can be
9707a7741afSMartin Matuska 	 * changed at any time by the user.
9717a7741afSMartin Matuska 	 */
9727a7741afSMartin Matuska 	if (abd_is_from_pages(abd)) {
9737a7741afSMartin Matuska 		buf = zio_buf_alloc(n);
9747a7741afSMartin Matuska 	} else if (abd_is_linear(abd)) {
9757a7741afSMartin Matuska 		buf = abd_to_buf(abd);
9767a7741afSMartin Matuska 	} else {
9777a7741afSMartin Matuska 		buf = zio_buf_alloc(n);
9787a7741afSMartin Matuska 	}
9797a7741afSMartin Matuska 
9807a7741afSMartin Matuska #ifdef ZFS_DEBUG
9817a7741afSMartin Matuska 	(void) zfs_refcount_add_many(&abd->abd_children, n, buf);
9827a7741afSMartin Matuska #endif
9837a7741afSMartin Matuska 	return (buf);
9847a7741afSMartin Matuska }
9857a7741afSMartin Matuska 
9867a7741afSMartin Matuska void *
9877a7741afSMartin Matuska abd_borrow_buf_copy(abd_t *abd, size_t n)
9887a7741afSMartin Matuska {
9897a7741afSMartin Matuska 	void *buf = abd_borrow_buf(abd, n);
9907a7741afSMartin Matuska 
9917a7741afSMartin Matuska 	/*
9927a7741afSMartin Matuska 	 * In the event the ABD is composed of a single user page from Direct
9937a7741afSMartin Matuska 	 * I/O, we must make sure to copy the data over into the newly allocated
9947a7741afSMartin Matuska 	 * buffer. This is a consequence of the fact that we can not write
9957a7741afSMartin Matuska 	 * protect the user page and there is a risk the contents of the page
9967a7741afSMartin Matuska 	 * could be changed by the user at any moment.
9977a7741afSMartin Matuska 	 */
9987a7741afSMartin Matuska 	if (!abd_is_linear(abd) || abd_is_from_pages(abd)) {
9997a7741afSMartin Matuska 		abd_copy_to_buf(buf, abd, n);
10007a7741afSMartin Matuska 	}
10017a7741afSMartin Matuska 	return (buf);
10027a7741afSMartin Matuska }
10037a7741afSMartin Matuska 
10047a7741afSMartin Matuska /*
10057a7741afSMartin Matuska  * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
10067a7741afSMartin Matuska  * not change the contents of the ABD. If you want any changes you made to
10077a7741afSMartin Matuska  * buf to be copied back to abd, use abd_return_buf_copy() instead. If the
10087a7741afSMartin Matuska  * ABD is not constructed from user pages for Direct I/O then an ASSERT
10097a7741afSMartin Matuska  * checks to make sure the contents of the buffer have not changed since it was
10107a7741afSMartin Matuska  * borrowed. We can not ASSERT that the contents of the buffer have not changed
10117a7741afSMartin Matuska  * if it is composed of user pages because the pages can not be placed under
10127a7741afSMartin Matuska  * write protection and the user could have possibly changed the contents in
101387bf66d4SMartin Matuska  * the pages at any time. This is also an issue for Direct I/O reads. Checksum
101487bf66d4SMartin Matuska  * verifications in the ZIO pipeline check for this issue and handle it by
101587bf66d4SMartin Matuska  * returning an error on checksum verification failure.
10167a7741afSMartin Matuska  */
10177a7741afSMartin Matuska void
10187a7741afSMartin Matuska abd_return_buf(abd_t *abd, void *buf, size_t n)
10197a7741afSMartin Matuska {
10207a7741afSMartin Matuska 	abd_verify(abd);
10217a7741afSMartin Matuska 	ASSERT3U(abd->abd_size, >=, n);
10227a7741afSMartin Matuska #ifdef ZFS_DEBUG
10237a7741afSMartin Matuska 	(void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
10247a7741afSMartin Matuska #endif
10257a7741afSMartin Matuska 	if (abd_is_from_pages(abd)) {
10267a7741afSMartin Matuska 		zio_buf_free(buf, n);
10277a7741afSMartin Matuska 	} else if (abd_is_linear(abd)) {
10287a7741afSMartin Matuska 		ASSERT3P(buf, ==, abd_to_buf(abd));
10297a7741afSMartin Matuska 	} else if (abd_is_gang(abd)) {
10307a7741afSMartin Matuska #ifdef ZFS_DEBUG
10317a7741afSMartin Matuska 		/*
10327a7741afSMartin Matuska 		 * We have to be careful with gang ABD's that we do not ASSERT0
10337a7741afSMartin Matuska 		 * for any ABD's that contain user pages from Direct I/O. In
10347a7741afSMartin Matuska 		 * order to handle this, we just iterate through the gang ABD
10357a7741afSMartin Matuska 		 * and only verify ABDs that are not from user pages.
10367a7741afSMartin Matuska 		 */
10377a7741afSMartin Matuska 		void *cmp_buf = buf;
10387a7741afSMartin Matuska 
10397a7741afSMartin Matuska 		for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain);
10407a7741afSMartin Matuska 		    cabd != NULL;
10417a7741afSMartin Matuska 		    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
10427a7741afSMartin Matuska 			if (!abd_is_from_pages(cabd)) {
10437a7741afSMartin Matuska 				ASSERT0(abd_cmp_buf(cabd, cmp_buf,
10447a7741afSMartin Matuska 				    cabd->abd_size));
10457a7741afSMartin Matuska 			}
10467a7741afSMartin Matuska 			cmp_buf = (char *)cmp_buf + cabd->abd_size;
10477a7741afSMartin Matuska 		}
10487a7741afSMartin Matuska #endif
10497a7741afSMartin Matuska 		zio_buf_free(buf, n);
10507a7741afSMartin Matuska 	} else {
10517a7741afSMartin Matuska 		ASSERT0(abd_cmp_buf(abd, buf, n));
10527a7741afSMartin Matuska 		zio_buf_free(buf, n);
10537a7741afSMartin Matuska 	}
10547a7741afSMartin Matuska }
10557a7741afSMartin Matuska 
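/*
 * Return a borrowed buffer to an ABD, first copying any changes made to buf
 * back into the ABD. For a linear ABD that is not backed by user pages, buf
 * is the ABD's own buffer, so no copy is needed.
 */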
10567a7741afSMartin Matuska void
10577a7741afSMartin Matuska abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
10587a7741afSMartin Matuska {
10597a7741afSMartin Matuska 	if (!abd_is_linear(abd) || abd_is_from_pages(abd)) {
10607a7741afSMartin Matuska 		abd_copy_from_buf(abd, buf, n);
10617a7741afSMartin Matuska 	}
10627a7741afSMartin Matuska 	abd_return_buf(abd, buf, n);
10637a7741afSMartin Matuska }
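
/*
 * Illustrative sketch (not part of this file): how a caller typically pairs
 * the borrow/return functions above. my_transform() is a hypothetical
 * placeholder for whatever work the caller does on the flat copy.
 *
 *	void *tmp = abd_borrow_buf_copy(abd, size);
 *	my_transform(tmp, size);
 *	abd_return_buf_copy(abd, tmp, size);
 *
 * A read-only caller would instead pair abd_borrow_buf_copy() with
 * abd_return_buf(), which skips the copy-back and (for ABDs not backed by
 * user pages) asserts that the buffer was not modified.
 */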
10647a7741afSMartin Matuska 
10657a7741afSMartin Matuska /*
10660d4ad640SMartin Matuska  * This is abd_iter_page(), the function underneath abd_iterate_page_func().
10670d4ad640SMartin Matuska  * It yields the next struct page, plus the offset and size of the data within
1068783d3ff6SMartin Matuska  * it, without mapping it into the address space.
1069783d3ff6SMartin Matuska  */
10700d4ad640SMartin Matuska 
10710d4ad640SMartin Matuska /*
10720d4ad640SMartin Matuska  * "Compound pages" are a group of pages that can be referenced from a single
10730d4ad640SMartin Matuska  * struct page *. Each is organised as a "head" page, followed by a series of
10740d4ad640SMartin Matuska  * "tail" pages.
10750d4ad640SMartin Matuska  *
10760d4ad640SMartin Matuska  * In OpenZFS, compound pages are allocated using the __GFP_COMP flag, which we
10770d4ad640SMartin Matuska  * get from scatter ABDs and SPL vmalloc slabs (i.e. >16K allocations). So a
10780d4ad640SMartin Matuska  * great many of the IO buffers we get are going to be of this type.
10790d4ad640SMartin Matuska  *
10800d4ad640SMartin Matuska  * The tail pages are just regular PAGESIZE pages, and can be safely used
10810d4ad640SMartin Matuska  * as-is. However, the head page has length covering itself and all the tail
10820d4ad640SMartin Matuska  * pages. If the ABD chunk spans multiple pages, then we can use the head page
10830d4ad640SMartin Matuska  * and a >PAGESIZE length, which is far more efficient.
10840d4ad640SMartin Matuska  *
10850d4ad640SMartin Matuska  * Before kernel 4.5, however, compound page heads were refcounted separately
10860d4ad640SMartin Matuska  * from tail pages, such that moving back to the head page would require us to
10870d4ad640SMartin Matuska  * take a reference to it and release it only once we were completely finished
10887a7741afSMartin Matuska  * with it. In practice, that meant holding it until our caller was done with
10890d4ad640SMartin Matuska  * the ABD, which is something we have no insight into from here. Rather than
10907a7741afSMartin Matuska  * contort this API to track head page references on such ancient kernels, we
10917a7741afSMartin Matuska  * disabled this special compound page handling on kernels before 4.5, instead
10927a7741afSMartin Matuska  * just treating each page within it as a regular PAGESIZE page (which it is).
10937a7741afSMartin Matuska  * This is slightly less efficient, but makes everything far simpler.
10940d4ad640SMartin Matuska  *
10957a7741afSMartin Matuska  * We no longer support kernels before 4.5, so in theory none of this is
10967a7741afSMartin Matuska  * necessary. However, this code is still relatively new in the grand scheme of
10977a7741afSMartin Matuska  * things, so I'm leaving the ability to compile this out for the moment.
10987a7741afSMartin Matuska  *
10997a7741afSMartin Matuska  * Setting/clearing ABD_ITER_COMPOUND_PAGES below enables/disables the special
11007a7741afSMartin Matuska  * handling, by defining the ABD_ITER_PAGE_SIZE(page) macro to understand
11017a7741afSMartin Matuska  * compound pages, or not, and compiling in/out the support to detect compound
11027a7741afSMartin Matuska  * tail pages and move back to the start.
11030d4ad640SMartin Matuska  */
11047a7741afSMartin Matuska 
11057a7741afSMartin Matuska /* On by default */
11067a7741afSMartin Matuska #define	ABD_ITER_COMPOUND_PAGES
11077a7741afSMartin Matuska 
11087a7741afSMartin Matuska #ifdef ABD_ITER_COMPOUND_PAGES
11090d4ad640SMartin Matuska #define	ABD_ITER_PAGE_SIZE(page)	\
11100d4ad640SMartin Matuska 	(PageCompound(page) ? page_size(page) : PAGESIZE)
11110d4ad640SMartin Matuska #else
11120d4ad640SMartin Matuska #define	ABD_ITER_PAGE_SIZE(page)	(PAGESIZE)
11130d4ad640SMartin Matuska #endif
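
/*
 * Rough illustration, using assumed sizes rather than anything taken from a
 * particular system: for a 64K (order-4) compound allocation with 4K pages,
 * ABD_ITER_PAGE_SIZE(head) is 65536, so a single abd_iter_page() step below
 * can cover the whole chunk. With ABD_ITER_COMPOUND_PAGES undefined it is
 * always PAGESIZE, and the same chunk is yielded as sixteen separate pages.
 */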
11140d4ad640SMartin Matuska 
1115783d3ff6SMartin Matuska void
1116783d3ff6SMartin Matuska abd_iter_page(struct abd_iter *aiter)
1117783d3ff6SMartin Matuska {
1118783d3ff6SMartin Matuska 	if (abd_iter_at_end(aiter)) {
1119783d3ff6SMartin Matuska 		aiter->iter_page = NULL;
1120783d3ff6SMartin Matuska 		aiter->iter_page_doff = 0;
1121783d3ff6SMartin Matuska 		aiter->iter_page_dsize = 0;
1122783d3ff6SMartin Matuska 		return;
1123783d3ff6SMartin Matuska 	}
1124783d3ff6SMartin Matuska 
1125783d3ff6SMartin Matuska 	struct page *page;
1126783d3ff6SMartin Matuska 	size_t doff, dsize;
1127783d3ff6SMartin Matuska 
11280d4ad640SMartin Matuska 	/*
11290d4ad640SMartin Matuska 	 * Find the page, and the start of the data within it. This is computed
11300d4ad640SMartin Matuska 	 * differently for linear and scatter ABDs; linear is referenced by
11310d4ad640SMartin Matuska 	 * virtual memory location, while scatter is referenced by page
11320d4ad640SMartin Matuska 	 * pointer.
11330d4ad640SMartin Matuska 	 */
1134783d3ff6SMartin Matuska 	if (abd_is_linear(aiter->iter_abd)) {
1135783d3ff6SMartin Matuska 		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
1136783d3ff6SMartin Matuska 
1137783d3ff6SMartin Matuska 		/* memory address at iter_pos */
1138783d3ff6SMartin Matuska 		void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
1139783d3ff6SMartin Matuska 
1140783d3ff6SMartin Matuska 		/* struct page for address */
1141783d3ff6SMartin Matuska 		page = is_vmalloc_addr(paddr) ?
1142783d3ff6SMartin Matuska 		    vmalloc_to_page(paddr) : virt_to_page(paddr);
1143783d3ff6SMartin Matuska 
1144783d3ff6SMartin Matuska 		/* offset of address within the page */
1145783d3ff6SMartin Matuska 		doff = offset_in_page(paddr);
1146783d3ff6SMartin Matuska 	} else {
1147783d3ff6SMartin Matuska 		ASSERT(!abd_is_gang(aiter->iter_abd));
1148783d3ff6SMartin Matuska 
1149783d3ff6SMartin Matuska 		/* current scatter page */
11500d4ad640SMartin Matuska 		page = nth_page(sg_page(aiter->iter_sg),
11510d4ad640SMartin Matuska 		    aiter->iter_offset >> PAGE_SHIFT);
1152783d3ff6SMartin Matuska 
1153783d3ff6SMartin Matuska 		/* position within page */
11540d4ad640SMartin Matuska 		doff = aiter->iter_offset & (PAGESIZE - 1);
1155783d3ff6SMartin Matuska 	}
1156783d3ff6SMartin Matuska 
11570d4ad640SMartin Matuska #ifdef ABD_ITER_COMPOUND_PAGES
1158783d3ff6SMartin Matuska 	if (PageTail(page)) {
1159783d3ff6SMartin Matuska 		/*
11600d4ad640SMartin Matuska 		 * If this is a compound tail page, move back to the head, and
11610d4ad640SMartin Matuska 		 * adjust the offset to match. This may let us yield a much
11620d4ad640SMartin Matuska 		 * larger amount of data from a single logical page, and so
11630d4ad640SMartin Matuska 		 * leave our caller with fewer pages to process.
1164783d3ff6SMartin Matuska 		 */
1165783d3ff6SMartin Matuska 		struct page *head = compound_head(page);
1166783d3ff6SMartin Matuska 		doff += ((page - head) * PAGESIZE);
1167783d3ff6SMartin Matuska 		page = head;
1168783d3ff6SMartin Matuska 	}
1169783d3ff6SMartin Matuska #endif
1170783d3ff6SMartin Matuska 
11710d4ad640SMartin Matuska 	ASSERT(page);
11720d4ad640SMartin Matuska 
11730d4ad640SMartin Matuska 	/*
11740d4ad640SMartin Matuska 	 * Compute the maximum amount of data we can take from this page. This
11750d4ad640SMartin Matuska 	 * is the smaller of:
11760d4ad640SMartin Matuska 	 * - the remaining space in the page
11770d4ad640SMartin Matuska 	 * - the remaining space in this scatterlist entry (which may not cover
11780d4ad640SMartin Matuska 	 *   the entire page)
11790d4ad640SMartin Matuska 	 * - the remaining space in the abd (which may not cover the entire
11800d4ad640SMartin Matuska 	 *   scatterlist entry)
11810d4ad640SMartin Matuska 	 */
11820d4ad640SMartin Matuska 	dsize = MIN(ABD_ITER_PAGE_SIZE(page) - doff,
11830d4ad640SMartin Matuska 	    aiter->iter_abd->abd_size - aiter->iter_pos);
11840d4ad640SMartin Matuska 	if (!abd_is_linear(aiter->iter_abd))
11850d4ad640SMartin Matuska 		dsize = MIN(dsize, aiter->iter_sg->length - aiter->iter_offset);
11860d4ad640SMartin Matuska 	ASSERT3U(dsize, >, 0);
11870d4ad640SMartin Matuska 
11880d4ad640SMartin Matuska 	/* final iterator outputs */
1189783d3ff6SMartin Matuska 	aiter->iter_page = page;
1190783d3ff6SMartin Matuska 	aiter->iter_page_doff = doff;
11910d4ad640SMartin Matuska 	aiter->iter_page_dsize = dsize;
1192783d3ff6SMartin Matuska }
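
/*
 * Hedged sketch of how a consumer (such as abd_iterate_page_func() in abd.c)
 * might drive this iterator; func() and its signature are placeholders, not
 * the real callback type:
 *
 *	struct abd_iter aiter;
 *	abd_iter_init(&aiter, abd);
 *	abd_iter_advance(&aiter, off);
 *	while (size > 0) {
 *		abd_iter_page(&aiter);
 *		size_t len = MIN(size, aiter.iter_page_dsize);
 *		func(aiter.iter_page, aiter.iter_page_doff, len, private);
 *		abd_iter_advance(&aiter, len);
 *		size -= len;
 *	}
 */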
1193783d3ff6SMartin Matuska 
1194783d3ff6SMartin Matuska /*
1195783d3ff6SMartin Matuska  * Note: the ABD BIO functions below are only needed to support vdev_classic.
1196783d3ff6SMartin Matuska  * See comments in vdev_disk.c.
1197783d3ff6SMartin Matuska  */
1198783d3ff6SMartin Matuska 
1199783d3ff6SMartin Matuska /*
1200eda14cbcSMatt Macy  * bio_nr_pages for ABD.
1201eda14cbcSMatt Macy  * @off is the offset in @abd
1202eda14cbcSMatt Macy  */
1203eda14cbcSMatt Macy unsigned long
1204eda14cbcSMatt Macy abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
1205eda14cbcSMatt Macy {
1206eda14cbcSMatt Macy 	unsigned long pos;
1207eda14cbcSMatt Macy 
1208184c1b94SMartin Matuska 	if (abd_is_gang(abd)) {
1209184c1b94SMartin Matuska 		unsigned long count = 0;
1210eda14cbcSMatt Macy 
1211184c1b94SMartin Matuska 		for (abd_t *cabd = abd_gang_get_offset(abd, &off);
1212184c1b94SMartin Matuska 		    cabd != NULL && size != 0;
1213184c1b94SMartin Matuska 		    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
1214184c1b94SMartin Matuska 			ASSERT3U(off, <, cabd->abd_size);
1215184c1b94SMartin Matuska 			int mysize = MIN(size, cabd->abd_size - off);
1216184c1b94SMartin Matuska 			count += abd_nr_pages_off(cabd, mysize, off);
1217184c1b94SMartin Matuska 			size -= mysize;
1218184c1b94SMartin Matuska 			off = 0;
1219184c1b94SMartin Matuska 		}
1220184c1b94SMartin Matuska 		return (count);
1221184c1b94SMartin Matuska 	}
1222184c1b94SMartin Matuska 
1223eda14cbcSMatt Macy 	if (abd_is_linear(abd))
1224eda14cbcSMatt Macy 		pos = (unsigned long)abd_to_buf(abd) + off;
1225eda14cbcSMatt Macy 	else
1226eda14cbcSMatt Macy 		pos = ABD_SCATTER(abd).abd_offset + off;
1227eda14cbcSMatt Macy 
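	/*
	 * Worked example with assumed values: for 4K pages, pos = 6K and
	 * size = 4K span bytes [6K, 10K), touching pages 1 and 2, so the
	 * expression below gives ((6K + 4K + 4K - 1) >> 12) - (6K >> 12) =
	 * 3 - 1 = 2.
	 */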
1228184c1b94SMartin Matuska 	return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
1229184c1b94SMartin Matuska 	    (pos >> PAGE_SHIFT));
1230eda14cbcSMatt Macy }
1231eda14cbcSMatt Macy 
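/*
 * Add the virtually contiguous buffer at buf_ptr to @bio, one page at a time.
 * Returns the number of bytes of bio_size that could not be added, either
 * because the bio ran out of vectors or because bio_add_page() refused a
 * page; 0 means the entire request was mapped.
 */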
1232eda14cbcSMatt Macy static unsigned int
1233eda14cbcSMatt Macy bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size)
1234eda14cbcSMatt Macy {
1235eda14cbcSMatt Macy 	unsigned int offset, size, i;
1236eda14cbcSMatt Macy 	struct page *page;
1237eda14cbcSMatt Macy 
1238eda14cbcSMatt Macy 	offset = offset_in_page(buf_ptr);
1239eda14cbcSMatt Macy 	for (i = 0; i < bio->bi_max_vecs; i++) {
1240eda14cbcSMatt Macy 		size = PAGE_SIZE - offset;
1241eda14cbcSMatt Macy 
1242eda14cbcSMatt Macy 		if (bio_size <= 0)
1243eda14cbcSMatt Macy 			break;
1244eda14cbcSMatt Macy 
1245eda14cbcSMatt Macy 		if (size > bio_size)
1246eda14cbcSMatt Macy 			size = bio_size;
1247eda14cbcSMatt Macy 
1248eda14cbcSMatt Macy 		if (is_vmalloc_addr(buf_ptr))
1249eda14cbcSMatt Macy 			page = vmalloc_to_page(buf_ptr);
1250eda14cbcSMatt Macy 		else
1251eda14cbcSMatt Macy 			page = virt_to_page(buf_ptr);
1252eda14cbcSMatt Macy 
1253eda14cbcSMatt Macy 		/*
1254eda14cbcSMatt Macy 		 * Some network-related block devices use tcp_sendpage, which
1255eda14cbcSMatt Macy 		 * doesn't behave well when given a 0-count page; this is a
1256eda14cbcSMatt Macy 		 * safety net to catch them.
1257eda14cbcSMatt Macy 		 */
1258eda14cbcSMatt Macy 		ASSERT3S(page_count(page), >, 0);
1259eda14cbcSMatt Macy 
1260eda14cbcSMatt Macy 		if (bio_add_page(bio, page, size, offset) != size)
1261eda14cbcSMatt Macy 			break;
1262eda14cbcSMatt Macy 
1263eda14cbcSMatt Macy 		buf_ptr += size;
1264eda14cbcSMatt Macy 		bio_size -= size;
1265eda14cbcSMatt Macy 		offset = 0;
1266eda14cbcSMatt Macy 	}
1267eda14cbcSMatt Macy 
1268eda14cbcSMatt Macy 	return (bio_size);
1269eda14cbcSMatt Macy }
1270eda14cbcSMatt Macy 
1271eda14cbcSMatt Macy /*
1272eda14cbcSMatt Macy  * bio_map for gang ABD.
1273eda14cbcSMatt Macy  */
1274eda14cbcSMatt Macy static unsigned int
1275eda14cbcSMatt Macy abd_gang_bio_map_off(struct bio *bio, abd_t *abd,
1276eda14cbcSMatt Macy     unsigned int io_size, size_t off)
1277eda14cbcSMatt Macy {
1278eda14cbcSMatt Macy 	ASSERT(abd_is_gang(abd));
1279eda14cbcSMatt Macy 
1280eda14cbcSMatt Macy 	for (abd_t *cabd = abd_gang_get_offset(abd, &off);
1281eda14cbcSMatt Macy 	    cabd != NULL;
1282eda14cbcSMatt Macy 	    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
1283eda14cbcSMatt Macy 		ASSERT3U(off, <, cabd->abd_size);
1284eda14cbcSMatt Macy 		int size = MIN(io_size, cabd->abd_size - off);
1285eda14cbcSMatt Macy 		int remainder = abd_bio_map_off(bio, cabd, size, off);
1286eda14cbcSMatt Macy 		io_size -= (size - remainder);
1287eda14cbcSMatt Macy 		if (io_size == 0 || remainder > 0)
1288eda14cbcSMatt Macy 			return (io_size);
1289eda14cbcSMatt Macy 		off = 0;
1290eda14cbcSMatt Macy 	}
1291eda14cbcSMatt Macy 	ASSERT0(io_size);
1292eda14cbcSMatt Macy 	return (io_size);
1293eda14cbcSMatt Macy }
1294eda14cbcSMatt Macy 
1295eda14cbcSMatt Macy /*
1296eda14cbcSMatt Macy  * bio_map for ABD.
1297eda14cbcSMatt Macy  * @off is the offset in @abd
1298eda14cbcSMatt Macy  * The remaining (unmapped) IO size is returned; 0 means everything was mapped.
1299eda14cbcSMatt Macy  */
1300eda14cbcSMatt Macy unsigned int
1301eda14cbcSMatt Macy abd_bio_map_off(struct bio *bio, abd_t *abd,
1302eda14cbcSMatt Macy     unsigned int io_size, size_t off)
1303eda14cbcSMatt Macy {
1304eda14cbcSMatt Macy 	struct abd_iter aiter;
1305eda14cbcSMatt Macy 
1306eda14cbcSMatt Macy 	ASSERT3U(io_size, <=, abd->abd_size - off);
1307eda14cbcSMatt Macy 	if (abd_is_linear(abd))
1308eda14cbcSMatt Macy 		return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, io_size));
1309eda14cbcSMatt Macy 
1310eda14cbcSMatt Macy 	ASSERT(!abd_is_linear(abd));
1311eda14cbcSMatt Macy 	if (abd_is_gang(abd))
1312eda14cbcSMatt Macy 		return (abd_gang_bio_map_off(bio, abd, io_size, off));
1313eda14cbcSMatt Macy 
1314eda14cbcSMatt Macy 	abd_iter_init(&aiter, abd);
1315eda14cbcSMatt Macy 	abd_iter_advance(&aiter, off);
1316eda14cbcSMatt Macy 
1317184c1b94SMartin Matuska 	for (int i = 0; i < bio->bi_max_vecs; i++) {
1318eda14cbcSMatt Macy 		struct page *pg;
1319eda14cbcSMatt Macy 		size_t len, sgoff, pgoff;
1320eda14cbcSMatt Macy 		struct scatterlist *sg;
1321eda14cbcSMatt Macy 
1322eda14cbcSMatt Macy 		if (io_size <= 0)
1323eda14cbcSMatt Macy 			break;
1324eda14cbcSMatt Macy 
1325eda14cbcSMatt Macy 		sg = aiter.iter_sg;
1326eda14cbcSMatt Macy 		sgoff = aiter.iter_offset;
1327eda14cbcSMatt Macy 		pgoff = sgoff & (PAGESIZE - 1);
1328eda14cbcSMatt Macy 		len = MIN(io_size, PAGESIZE - pgoff);
1329eda14cbcSMatt Macy 		ASSERT(len > 0);
1330eda14cbcSMatt Macy 
1331eda14cbcSMatt Macy 		pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
1332eda14cbcSMatt Macy 		if (bio_add_page(bio, pg, len, pgoff) != len)
1333eda14cbcSMatt Macy 			break;
1334eda14cbcSMatt Macy 
1335eda14cbcSMatt Macy 		io_size -= len;
1336eda14cbcSMatt Macy 		abd_iter_advance(&aiter, len);
1337eda14cbcSMatt Macy 	}
1338eda14cbcSMatt Macy 
1339eda14cbcSMatt Macy 	return (io_size);
1340eda14cbcSMatt Macy }
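
/*
 * Hedged usage sketch, loosely modelled on the vdev_classic path in
 * vdev_disk.c (details and names simplified; the bio_alloc() arguments vary
 * by kernel version):
 *
 *	unsigned long nr_vecs = abd_nr_pages_off(abd, io_size, off);
 *	struct bio *bio = bio_alloc(..., nr_vecs, ...);
 *	unsigned int left = abd_bio_map_off(bio, abd, io_size, off);
 *
 * A non-zero 'left' means the bio filled up before the whole range was
 * mapped, and the caller must issue this bio and map the remainder into
 * another one.
 */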
1341eda14cbcSMatt Macy 
1342eda14cbcSMatt Macy /* Tunable Parameters */
1343eda14cbcSMatt Macy module_param(zfs_abd_scatter_enabled, int, 0644);
1344eda14cbcSMatt Macy MODULE_PARM_DESC(zfs_abd_scatter_enabled,
1345eda14cbcSMatt Macy 	"Toggle whether ABD allocations must be linear.");
1346eda14cbcSMatt Macy module_param(zfs_abd_scatter_min_size, int, 0644);
1347eda14cbcSMatt Macy MODULE_PARM_DESC(zfs_abd_scatter_min_size,
1348eda14cbcSMatt Macy 	"Minimum size of scatter allocations.");
1349eda14cbcSMatt Macy module_param(zfs_abd_scatter_max_order, uint, 0644);
1350eda14cbcSMatt Macy MODULE_PARM_DESC(zfs_abd_scatter_max_order,
1351eda14cbcSMatt Macy 	"Maximum order allocation used for a scatter ABD.");
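
/*
 * These can be adjusted at runtime through sysfs, for example (values are
 * illustrative only):
 *
 *	echo 0     > /sys/module/zfs/parameters/zfs_abd_scatter_enabled
 *	echo 16384 > /sys/module/zfs/parameters/zfs_abd_scatter_min_size
 */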
1352