/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2019 by Delphix. All rights reserved.
 * Copyright (c) 2023, 2024, Klara Inc.
 */

/*
 * See abd.c for a general overview of the arc buffered data (ABD).
 *
 * Linear buffers act exactly like normal buffers and are always mapped into the
 * kernel's virtual memory space, while scattered ABD data chunks are allocated
 * as physical pages and then mapped in only while they are actually being
 * accessed through one of the abd_* library functions. Using scattered ABDs
 * provides several benefits:
 *
 * (1) They avoid use of kmem_*, preventing performance problems where running
 *     kmem_reap on very large memory systems never finishes and causes
 *     constant TLB shootdowns.
 *
 * (2) Fragmentation is less of an issue since when we are at the limit of
 *     allocatable space, we won't have to search around for a long free
 *     hole in the VA space for large ARC allocations. Each chunk is mapped in
 *     individually, so even if we are using HIGHMEM (see next point) we
 *     wouldn't need to worry about finding a contiguous address range.
 *
 * (3) If we are not using HIGHMEM, then all physical memory is always
 *     mapped into the kernel's address space, so we also avoid the map /
 *     unmap costs on each ABD access.
 *
 * If we are not using HIGHMEM, scattered buffers which have only one chunk
 * can be treated as linear buffers, because they are contiguous in the
 * kernel's virtual address space. See abd_alloc_chunks() for details.
 */

#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
#include <linux/kmap_compat.h>
#include <linux/mm_compat.h>
#include <linux/scatterlist.h>
#include <linux/version.h>

#if defined(MAX_ORDER)
#define ABD_MAX_ORDER (MAX_ORDER)
#elif defined(MAX_PAGE_ORDER)
#define ABD_MAX_ORDER (MAX_PAGE_ORDER)
#endif

typedef struct abd_stats {
	kstat_named_t abdstat_struct_size;
	kstat_named_t abdstat_linear_cnt;
	kstat_named_t abdstat_linear_data_size;
	kstat_named_t abdstat_scatter_cnt;
	kstat_named_t abdstat_scatter_data_size;
	kstat_named_t abdstat_scatter_chunk_waste;
	kstat_named_t abdstat_scatter_orders[ABD_MAX_ORDER];
	kstat_named_t abdstat_scatter_page_multi_chunk;
	kstat_named_t abdstat_scatter_page_multi_zone;
	kstat_named_t abdstat_scatter_page_alloc_retry;
	kstat_named_t abdstat_scatter_sg_table_retry;
} abd_stats_t;

static abd_stats_t abd_stats = {
	/* Amount of memory occupied by all of the abd_t struct allocations */
	{ "struct_size", KSTAT_DATA_UINT64 },
	/*
	 * The number of linear ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
	 * ABD takes ownership of its buf then it will become tracked.
	 */
	{ "linear_cnt", KSTAT_DATA_UINT64 },
	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
	{ "linear_data_size", KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset()).
	 */
	{ "scatter_cnt", KSTAT_DATA_UINT64 },
	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
	{ "scatter_data_size", KSTAT_DATA_UINT64 },
	/*
	 * The amount of space wasted at the end of the last chunk across all
	 * scatter ABDs tracked by scatter_cnt.
	 */
	{ "scatter_chunk_waste", KSTAT_DATA_UINT64 },
	/*
	 * The number of compound allocations of a given order. These
	 * allocations are spread over all currently allocated ABDs, and
	 * act as a measure of memory fragmentation.
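	 * The "scatter_order_N" entry below is only a placeholder; abd_init()
	 * fills in a "scatter_order_%d" name and the data type for each order.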
	 */
	{ { "scatter_order_N", KSTAT_DATA_UINT64 } },
	/*
	 * The number of scatter ABDs which contain multiple chunks.
	 * ABDs are preferentially allocated from the minimum number of
	 * contiguous multi-page chunks; a single chunk is optimal.
	 */
	{ "scatter_page_multi_chunk", KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are split across memory zones.
	 * ABDs are preferentially allocated using pages from a single zone.
	 */
	{ "scatter_page_multi_zone", KSTAT_DATA_UINT64 },
	/*
	 * The total number of retries encountered when attempting to
	 * allocate the pages to populate the scatter ABD.
	 */
	{ "scatter_page_alloc_retry", KSTAT_DATA_UINT64 },
	/*
	 * The total number of retries encountered when attempting to
	 * allocate the sg table for an ABD.
	 */
	{ "scatter_sg_table_retry", KSTAT_DATA_UINT64 },
};

static struct {
	wmsum_t abdstat_struct_size;
	wmsum_t abdstat_linear_cnt;
	wmsum_t abdstat_linear_data_size;
	wmsum_t abdstat_scatter_cnt;
	wmsum_t abdstat_scatter_data_size;
	wmsum_t abdstat_scatter_chunk_waste;
	wmsum_t abdstat_scatter_orders[ABD_MAX_ORDER];
	wmsum_t abdstat_scatter_page_multi_chunk;
	wmsum_t abdstat_scatter_page_multi_zone;
	wmsum_t abdstat_scatter_page_alloc_retry;
	wmsum_t abdstat_scatter_sg_table_retry;
} abd_sums;

#define abd_for_each_sg(abd, sg, n, i) \
	for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)

/*
 * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
 * ABD's. Smaller allocations will use linear ABD's, which use
 * zio_[data_]buf_alloc().
 *
 * Scatter ABD's use at least one page each, so sub-page allocations waste
 * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
 * half of each page). Using linear ABD's for small allocations means that
 * they will be put on slabs which contain many allocations. This can
 * improve memory efficiency, but it also makes it much harder for ARC
 * evictions to actually free pages, because all the buffers on one slab need
 * to be freed in order for the slab (and underlying pages) to be freed.
 * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
 * possible for them to actually waste more memory than scatter (one page per
 * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
 *
 * Spill blocks are typically 512B and are heavily used on systems running
 * selinux with the default dnode size and the `xattr=sa` property set.
 *
 * By default we use linear allocations for 512B and 1KB, and scatter
 * allocations for larger (1.5KB and up).
 */
static int zfs_abd_scatter_min_size = 512 * 3;

/*
 * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are
 * just a single zero'd page. This allows us to conserve memory by
 * only using a single zero page for the scatterlist.
 */
abd_t *abd_zero_scatter = NULL;

struct page;

/*
 * abd_zero_page is assigned to each of the pages of abd_zero_scatter. It will
 * point to ZERO_PAGE if it is available or it will be an allocated zero'd
 * PAGESIZE buffer.
 */
static struct page *abd_zero_page = NULL;

static kmem_cache_t *abd_cache = NULL;
static kstat_t *abd_ksp;

static uint_t
abd_chunkcnt_for_bytes(size_t size)
{
	return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
}

abd_t *
abd_alloc_struct_impl(size_t size)
{
	/*
	 * In Linux we do not use the size passed in during ABD
	 * allocation, so we just ignore it.
	 */
	(void) size;
	abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
	ASSERT3P(abd, !=, NULL);
	ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));

	return (abd);
}

void
abd_free_struct_impl(abd_t *abd)
{
	kmem_cache_free(abd_cache, abd);
	ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
}

static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1;

/*
 * Mark zfs data pages so they can be excluded from kernel crash dumps
 */
#ifdef _LP64
#define ABD_FILE_CACHE_PAGE 0x2F5ABDF11ECAC4E

static inline void
abd_mark_zfs_page(struct page *page)
{
	get_page(page);
	SetPagePrivate(page);
	set_page_private(page, ABD_FILE_CACHE_PAGE);
}

static inline void
abd_unmark_zfs_page(struct page *page)
{
	set_page_private(page, 0UL);
	ClearPagePrivate(page);
	put_page(page);
}
#else
#define abd_mark_zfs_page(page)
#define abd_unmark_zfs_page(page)
#endif /* _LP64 */

#ifndef CONFIG_HIGHMEM

#ifndef __GFP_RECLAIM
#define __GFP_RECLAIM __GFP_WAIT
#endif

/*
 * The goal is to minimize fragmentation by preferentially populating ABDs
 * with higher order compound pages from a single zone. Allocation size is
 * progressively decreased until it can be satisfied without performing
 * reclaim or compaction. When necessary this function will degenerate to
 * allocating individual pages and allowing reclaim to satisfy allocations.
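 * The largest compound order attempted is bounded by the
 * zfs_abd_scatter_max_order tunable (at most ABD_MAX_ORDER - 1).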
 */
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	struct list_head pages;
	struct sg_table table;
	struct scatterlist *sg;
	struct page *page, *tmp_page = NULL;
	gfp_t gfp = __GFP_RECLAIMABLE | __GFP_NOWARN | GFP_NOIO;
	gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
	unsigned int max_order = MIN(zfs_abd_scatter_max_order,
	    ABD_MAX_ORDER - 1);
	unsigned int nr_pages = abd_chunkcnt_for_bytes(size);
	unsigned int chunks = 0, zones = 0;
	size_t remaining_size;
	int nid = NUMA_NO_NODE;
	unsigned int alloc_pages = 0;

	INIT_LIST_HEAD(&pages);

	ASSERT3U(alloc_pages, <, nr_pages);

	while (alloc_pages < nr_pages) {
		unsigned int chunk_pages;
		unsigned int order;

		order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
		chunk_pages = (1U << order);

		page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
		if (page == NULL) {
			if (order == 0) {
				ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
				schedule_timeout_interruptible(1);
			} else {
				max_order = MAX(0, order - 1);
			}
			continue;
		}

		list_add_tail(&page->lru, &pages);

		if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
			zones++;

		nid = page_to_nid(page);
		ABDSTAT_BUMP(abdstat_scatter_orders[order]);
		chunks++;
		alloc_pages += chunk_pages;
	}

	ASSERT3S(alloc_pages, ==, nr_pages);

	while (sg_alloc_table(&table, chunks, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}

	sg = table.sgl;
	remaining_size = size;
	list_for_each_entry_safe(page, tmp_page, &pages, lru) {
		size_t sg_size = MIN(PAGESIZE << compound_order(page),
		    remaining_size);
		sg_set_page(sg, page, sg_size, 0);
		abd_mark_zfs_page(page);
		remaining_size -= sg_size;

		sg = sg_next(sg);
		list_del(&page->lru);
	}

	/*
	 * These conditions ensure that a possible transformation to a linear
	 * ABD would be valid.
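	 * The single chunk must not be a highmem page (so it has a permanent
	 * kernel mapping) and the scatter offset must be zero.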
	 */
	ASSERT(!PageHighMem(sg_page(table.sgl)));
	ASSERT0(ABD_SCATTER(abd).abd_offset);

	if (table.nents == 1) {
		/*
		 * Since there is only one entry, this ABD can be represented
		 * as a linear buffer. All single-page (4K) ABD's can be
		 * represented this way. Some multi-page ABD's can also be
		 * represented this way, if we were able to allocate a single
		 * "chunk" (higher-order "page" which represents a power-of-2
		 * series of physically-contiguous pages). This is often the
		 * case for 2-page (8K) ABD's.
		 *
		 * Representing a single-entry scatter ABD as a linear ABD
		 * has the performance advantage of avoiding the copy (and
		 * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
		 * A performance increase of around 5% has been observed for
		 * ARC-cached reads (of small blocks which can take advantage
		 * of this).
		 *
		 * Note that this optimization is only possible because the
		 * pages are always mapped into the kernel's address space.
		 * This is not the case for highmem pages, so the
		 * optimization can not be made there.
		 */
		abd->abd_flags |= ABD_FLAG_LINEAR;
		abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
		abd->abd_u.abd_linear.abd_sgl = table.sgl;
		ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl));
	} else if (table.nents > 1) {
		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;

		if (zones) {
			ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
			abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
		}

		ABD_SCATTER(abd).abd_sgl = table.sgl;
		ABD_SCATTER(abd).abd_nents = table.nents;
	}
}
#else

/*
 * Allocate N individual pages to construct a scatter ABD. This function
 * makes no attempt to request contiguous pages and requires the minimal
 * number of kernel interfaces. It's designed for maximum compatibility.
 */
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	struct scatterlist *sg = NULL;
	struct sg_table table;
	struct page *page;
	gfp_t gfp = __GFP_RECLAIMABLE | __GFP_NOWARN | GFP_NOIO;
	int nr_pages = abd_chunkcnt_for_bytes(size);
	int i = 0;

	while (sg_alloc_table(&table, nr_pages, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}

	ASSERT3U(table.nents, ==, nr_pages);
	ABD_SCATTER(abd).abd_sgl = table.sgl;
	ABD_SCATTER(abd).abd_nents = nr_pages;

	abd_for_each_sg(abd, sg, nr_pages, i) {
		while ((page = __page_cache_alloc(gfp)) == NULL) {
			ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
			schedule_timeout_interruptible(1);
		}

		ABDSTAT_BUMP(abdstat_scatter_orders[0]);
		sg_set_page(sg, page, PAGESIZE, 0);
		abd_mark_zfs_page(page);
	}

	if (nr_pages > 1) {
		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
	}
}
#endif /* !CONFIG_HIGHMEM */

/*
 * This must be called if any of the sg_table allocation functions
 * are called.
 */
static void
abd_free_sg_table(abd_t *abd)
{
	struct sg_table table;

	table.sgl = ABD_SCATTER(abd).abd_sgl;
	table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents;
	sg_free_table(&table);
}

void
abd_free_chunks(abd_t *abd)
{
	struct scatterlist *sg = NULL;
	struct page *page;
	int nr_pages = ABD_SCATTER(abd).abd_nents;
	int order, i = 0;

	if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);

	if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);

	/*
	 * Scatter ABDs may be constructed by abd_alloc_from_pages() from
	 * an array of pages, in which case the pages should not be freed here.
	 */
	if (!abd_is_from_pages(abd)) {
		abd_for_each_sg(abd, sg, nr_pages, i) {
			page = sg_page(sg);
			abd_unmark_zfs_page(page);
			order = compound_order(page);
			__free_pages(page, order);
			ASSERT3U(sg->length, <=, PAGE_SIZE << order);
			ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
		}
	}

	abd_free_sg_table(abd);
}

/*
 * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where each page in
 * the scatterlist will be set to the zero'd out buffer abd_zero_page.
 */
static void
abd_alloc_zero_scatter(void)
{
	struct scatterlist *sg = NULL;
	struct sg_table table;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	int nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	int i = 0;

#if defined(HAVE_ZERO_PAGE_GPL_ONLY)
	gfp_t gfp_zero_page = gfp | __GFP_ZERO;
	while ((abd_zero_page = __page_cache_alloc(gfp_zero_page)) == NULL) {
		ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
		schedule_timeout_interruptible(1);
	}
	abd_mark_zfs_page(abd_zero_page);
#else
	abd_zero_page = ZERO_PAGE(0);
#endif /* HAVE_ZERO_PAGE_GPL_ONLY */

	while (sg_alloc_table(&table, nr_pages, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}
	ASSERT3U(table.nents, ==, nr_pages);

	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
	ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK;

	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}

boolean_t
abd_size_alloc_linear(size_t size)
{
	return (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size);
}

void
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
{
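	/*
	 * Adjust the scatter kstats by this ABD's size, and charge (or
	 * return) the page-rounding waste to the ARC accounting.
	 */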
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
	int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size;
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste);
		arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste);
		arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	}
}

void
abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
{
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_linear_cnt);
		ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
		ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
	}
}

void
abd_verify_scatter(abd_t *abd)
{
	ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
	ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
	    ABD_SCATTER(abd).abd_sgl->length);

#ifdef ZFS_DEBUG
	struct scatterlist *sg = NULL;
	size_t n = ABD_SCATTER(abd).abd_nents;
	int i = 0;

	abd_for_each_sg(abd, sg, n, i) {
		ASSERT3P(sg_page(sg), !=, NULL);
	}
#endif
}

static void
abd_free_zero_scatter(void)
{
	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE);
	ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);

	abd_free_sg_table(abd_zero_scatter);
	abd_free_struct(abd_zero_scatter);
	abd_zero_scatter = NULL;
	ASSERT3P(abd_zero_page, !=, NULL);
#if defined(HAVE_ZERO_PAGE_GPL_ONLY)
	abd_unmark_zfs_page(abd_zero_page);
	__free_page(abd_zero_page);
#endif /* HAVE_ZERO_PAGE_GPL_ONLY */
}

static int
abd_kstats_update(kstat_t *ksp, int rw)
{
	abd_stats_t *as = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);
	as->abdstat_struct_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_struct_size);
	as->abdstat_linear_cnt.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_linear_cnt);
	as->abdstat_linear_data_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_linear_data_size);
	as->abdstat_scatter_cnt.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_cnt);
	as->abdstat_scatter_data_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_data_size);
	as->abdstat_scatter_chunk_waste.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_chunk_waste);
	for (int i = 0; i < ABD_MAX_ORDER; i++) {
		as->abdstat_scatter_orders[i].value.ui64 =
		    wmsum_value(&abd_sums.abdstat_scatter_orders[i]);
	}
	as->abdstat_scatter_page_multi_chunk.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_page_multi_chunk);
	as->abdstat_scatter_page_multi_zone.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_page_multi_zone);
	as->abdstat_scatter_page_alloc_retry.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_page_alloc_retry);
	as->abdstat_scatter_sg_table_retry.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_sg_table_retry);
	return (0);
}

void
abd_init(void)
{
	int i;

	abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
	    0, NULL, NULL, NULL, NULL, NULL, KMC_RECLAIMABLE);

	wmsum_init(&abd_sums.abdstat_struct_size, 0);
	wmsum_init(&abd_sums.abdstat_linear_cnt, 0);
	wmsum_init(&abd_sums.abdstat_linear_data_size, 0);
	wmsum_init(&abd_sums.abdstat_scatter_cnt, 0);
	wmsum_init(&abd_sums.abdstat_scatter_data_size, 0);
	wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0);
	for (i = 0; i < ABD_MAX_ORDER; i++)
		wmsum_init(&abd_sums.abdstat_scatter_orders[i], 0);
	wmsum_init(&abd_sums.abdstat_scatter_page_multi_chunk, 0);
	wmsum_init(&abd_sums.abdstat_scatter_page_multi_zone, 0);
	wmsum_init(&abd_sums.abdstat_scatter_page_alloc_retry, 0);
	wmsum_init(&abd_sums.abdstat_scatter_sg_table_retry, 0);

	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (abd_ksp != NULL) {
		for (i = 0; i < ABD_MAX_ORDER; i++) {
			snprintf(abd_stats.abdstat_scatter_orders[i].name,
			    KSTAT_STRLEN, "scatter_order_%d", i);
			abd_stats.abdstat_scatter_orders[i].data_type =
			    KSTAT_DATA_UINT64;
		}
		abd_ksp->ks_data = &abd_stats;
		abd_ksp->ks_update = abd_kstats_update;
		kstat_install(abd_ksp);
	}

	abd_alloc_zero_scatter();
}

void
abd_fini(void)
{
	abd_free_zero_scatter();

	if (abd_ksp != NULL) {
		kstat_delete(abd_ksp);
		abd_ksp = NULL;
	}

	wmsum_fini(&abd_sums.abdstat_struct_size);
	wmsum_fini(&abd_sums.abdstat_linear_cnt);
	wmsum_fini(&abd_sums.abdstat_linear_data_size);
	wmsum_fini(&abd_sums.abdstat_scatter_cnt);
	wmsum_fini(&abd_sums.abdstat_scatter_data_size);
	wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste);
	for (int i = 0; i < ABD_MAX_ORDER; i++)
		wmsum_fini(&abd_sums.abdstat_scatter_orders[i]);
	wmsum_fini(&abd_sums.abdstat_scatter_page_multi_chunk);
	wmsum_fini(&abd_sums.abdstat_scatter_page_multi_zone);
	wmsum_fini(&abd_sums.abdstat_scatter_page_alloc_retry);
	wmsum_fini(&abd_sums.abdstat_scatter_sg_table_retry);

	if (abd_cache) {
		kmem_cache_destroy(abd_cache);
		abd_cache = NULL;
	}
}

void
abd_free_linear_page(abd_t *abd)
{
	/* Transform it back into a scatter ABD for freeing */
	struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;

	/* When backed by a user page, unmap it */
	if (abd_is_from_pages(abd))
		zfs_kunmap(sg_page(sg));
	else
		abd_update_scatter_stats(abd, ABDSTAT_DECR);

	abd->abd_flags &= ~ABD_FLAG_LINEAR;
	abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
	ABD_SCATTER(abd).abd_nents = 1;
	ABD_SCATTER(abd).abd_offset = 0;
	ABD_SCATTER(abd).abd_sgl = sg;
	abd_free_chunks(abd);
}

/*
 * Allocate a scatter ABD structure from user pages. The pages must be
 * pinned with get_user_pages, or similar, but need not be mapped via
 * the kmap interfaces.
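 * The resulting ABD is flagged ABD_FLAG_FROM_PAGES, so abd_free_chunks()
 * knows the pages are not owned by the ABD and will not free them.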
 */
abd_t *
abd_alloc_from_pages(struct page **pages, unsigned long offset, uint64_t size)
{
	uint_t npages = DIV_ROUND_UP(size, PAGE_SIZE);
	struct sg_table table;

	VERIFY3U(size, <=, DMU_MAX_ACCESS);
	ASSERT3U(offset, <, PAGE_SIZE);
	ASSERT3P(pages, !=, NULL);

	/*
	 * Even if this buf is filesystem metadata, we only track that if we
	 * own the underlying data buffer, which is not true in this case.
	 * Therefore, we don't ever use ABD_FLAG_META here.
	 */
	abd_t *abd = abd_alloc_struct(0);
	abd->abd_flags |= ABD_FLAG_FROM_PAGES | ABD_FLAG_OWNER;
	abd->abd_size = size;

	while (sg_alloc_table_from_pages(&table, pages, npages, offset,
	    size, __GFP_NOWARN | GFP_NOIO) != 0) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}

	if ((offset + size) <= PAGE_SIZE) {
		/*
		 * Since there is only one entry, this ABD can be represented
		 * as a linear buffer. All single-page (4K) ABD's constructed
		 * from a user page can be represented this way as long as the
		 * page is mapped to a virtual address. This allows us to
		 * apply an offset into the mapped page.
		 *
		 * Note that kmap() must be used, not kmap_atomic(), because
		 * the mapping needs to be set up on all CPUs. Using kmap()
		 * also enables the use of highmem pages when required.
		 */
		abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE;
		abd->abd_u.abd_linear.abd_sgl = table.sgl;
		zfs_kmap(sg_page(table.sgl));
		ABD_LINEAR_BUF(abd) = sg_virt(table.sgl);
	} else {
		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;

		ABD_SCATTER(abd).abd_offset = offset;
		ABD_SCATTER(abd).abd_sgl = table.sgl;
		ABD_SCATTER(abd).abd_nents = table.nents;

		ASSERT0(ABD_SCATTER(abd).abd_offset);
	}

	return (abd);
}

/*
 * If we're going to use this ABD for doing I/O using the block layer, the
 * consumer of the ABD data doesn't care if it's scattered or not, and we don't
 * plan to store this ABD in memory for a long period of time, then we should
 * allocate the ABD type that requires the least data copying to do the I/O.
 *
 * On Linux the optimal thing to do would be to use abd_get_offset() and
 * construct a new ABD which shares the original pages, thereby eliminating
 * the copy. But for the moment a new linear ABD is allocated until this
 * performance optimization can be implemented.
 */
abd_t *
abd_alloc_for_io(size_t size, boolean_t is_metadata)
{
	return (abd_alloc(size, is_metadata));
}

abd_t *
abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
    size_t size)
{
	(void) size;
	int i = 0;
	struct scatterlist *sg = NULL;

	abd_verify(sabd);
	ASSERT3U(off, <=, sabd->abd_size);

	size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;

	if (abd == NULL)
		abd = abd_alloc_struct(0);

	/*
	 * Even if this buf is filesystem metadata, we only track that
	 * if we own the underlying data buffer, which is not true in
	 * this case. Therefore, we don't ever use ABD_FLAG_META here.
	 */

	abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
		if (new_offset < sg->length)
			break;
		new_offset -= sg->length;
	}

	ABD_SCATTER(abd).abd_sgl = sg;
	ABD_SCATTER(abd).abd_offset = new_offset;
	ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;

	if (abd_is_from_pages(sabd))
		abd->abd_flags |= ABD_FLAG_FROM_PAGES;

	return (abd);
}

/*
 * Initialize the abd_iter.
 */
void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
	ASSERT(!abd_is_gang(abd));
	abd_verify(abd);
	memset(aiter, 0, sizeof (struct abd_iter));
	aiter->iter_abd = abd;
	if (!abd_is_linear(abd)) {
		aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
		aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
	}
}

/*
 * This is just a helper function to see if we have exhausted the
 * abd_iter and reached the end.
 */
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
	ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
	return (aiter->iter_pos == aiter->iter_abd->abd_size);
}

/*
 * Advance the iterator by a certain amount. Cannot be called when a chunk is
 * in use. This can be safely called when the aiter has already been exhausted,
 * in which case this does nothing.
 */
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
	/*
	 * Ensure that the last chunk is not in use. abd_iterate_*() must clear
	 * this state (directly or via abd_iter_unmap()) before advancing.
	 */
	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);
	ASSERT3P(aiter->iter_page, ==, NULL);
	ASSERT0(aiter->iter_page_doff);
	ASSERT0(aiter->iter_page_dsize);

	/* There's nothing left to advance to, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	aiter->iter_pos += amount;
	aiter->iter_offset += amount;
	if (!abd_is_linear(aiter->iter_abd)) {
		while (aiter->iter_offset >= aiter->iter_sg->length) {
			aiter->iter_offset -= aiter->iter_sg->length;
			aiter->iter_sg = sg_next(aiter->iter_sg);
			if (aiter->iter_sg == NULL) {
				ASSERT0(aiter->iter_offset);
				break;
			}
		}
	}
}

/*
 * Map the current chunk into aiter. This can be safely called when the aiter
 * has already been exhausted, in which case this does nothing.
 */
void
abd_iter_map(struct abd_iter *aiter)
{
	void *paddr;
	size_t offset = 0;

	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* There's nothing left to iterate over, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	if (abd_is_linear(aiter->iter_abd)) {
		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
		offset = aiter->iter_offset;
		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
		paddr = ABD_LINEAR_BUF(aiter->iter_abd);
	} else {
		offset = aiter->iter_offset;
		aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
		    aiter->iter_abd->abd_size - aiter->iter_pos);

		paddr = zfs_kmap_local(sg_page(aiter->iter_sg));
	}

	aiter->iter_mapaddr = (char *)paddr + offset;
}

/*
 * Unmap the current chunk from aiter. This can be safely called when the aiter
 * has already been exhausted, in which case this does nothing.
 */
void
abd_iter_unmap(struct abd_iter *aiter)
{
	/* There's nothing left to unmap, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	if (!abd_is_linear(aiter->iter_abd)) {
		/* LINTED E_FUNC_SET_NOT_USED */
		zfs_kunmap_local(aiter->iter_mapaddr - aiter->iter_offset);
	}

	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
	ASSERT3U(aiter->iter_mapsize, >, 0);

	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}

void
abd_cache_reap_now(void)
{
}

/*
 * Borrow a raw buffer from an ABD without copying the contents of the ABD
 * into the buffer. If the ABD is scattered, this will allocate a raw buffer
 * whose contents are undefined. To copy over the existing data in the ABD, use
 * abd_borrow_buf_copy() instead.
 */
void *
abd_borrow_buf(abd_t *abd, size_t n)
{
	void *buf;
	abd_verify(abd);
	ASSERT3U(abd->abd_size, >=, 0);
	/*
	 * In the event the ABD is composed of a single user page from Direct
	 * I/O we can not directly return the raw buffer. This is a consequence
	 * of not being able to write protect the page, so its contents can be
	 * changed at any time by the user.
	 */
	if (abd_is_from_pages(abd)) {
		buf = zio_buf_alloc(n);
	} else if (abd_is_linear(abd)) {
		buf = abd_to_buf(abd);
	} else {
		buf = zio_buf_alloc(n);
	}

#ifdef ZFS_DEBUG
	(void) zfs_refcount_add_many(&abd->abd_children, n, buf);
#endif
	return (buf);
}

void *
abd_borrow_buf_copy(abd_t *abd, size_t n)
{
	void *buf = abd_borrow_buf(abd, n);

	/*
	 * In the event the ABD is composed of a single user page from Direct
	 * I/O we must make sure to copy the data over into the newly allocated
	 * buffer. This is a consequence of the fact that we can not write
	 * protect the user page, so there is a risk the contents of the page
	 * could be changed by the user at any moment.
	 */
	if (!abd_is_linear(abd) || abd_is_from_pages(abd)) {
		abd_copy_to_buf(buf, abd, n);
	}
	return (buf);
}

/*
 * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
 * not change the contents of the ABD. If you want any changes you made to
 * buf to be copied back to abd, use abd_return_buf_copy() instead. If the
 * ABD is not constructed from user pages for Direct I/O then an ASSERT
 * checks to make sure the contents of the buffer have not changed since it was
 * borrowed. We can not ASSERT that the contents of the buffer have not changed
 * if it is composed of user pages, because the pages can not be placed under
 * write protection and the user could have changed the contents in the pages
 * at any time. This is also an issue for Direct I/O reads. Checksum
 * verifications in the ZIO pipeline check for this issue and handle it by
 * returning an error on checksum verification failure.
 */
void
abd_return_buf(abd_t *abd, void *buf, size_t n)
{
	abd_verify(abd);
	ASSERT3U(abd->abd_size, >=, n);
#ifdef ZFS_DEBUG
	(void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
#endif
	if (abd_is_from_pages(abd)) {
		zio_buf_free(buf, n);
	} else if (abd_is_linear(abd)) {
		ASSERT3P(buf, ==, abd_to_buf(abd));
	} else if (abd_is_gang(abd)) {
#ifdef ZFS_DEBUG
		/*
		 * We have to be careful with gang ABD's that we do not ASSERT0
		 * for any ABD's that contain user pages from Direct I/O. In
		 * order to handle this, we just iterate through the gang ABD
		 * and only verify ABDs that are not from user pages.
		 */
		void *cmp_buf = buf;

		for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain);
		    cabd != NULL;
		    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
			if (!abd_is_from_pages(cabd)) {
				ASSERT0(abd_cmp_buf(cabd, cmp_buf,
				    cabd->abd_size));
			}
			cmp_buf = (char *)cmp_buf + cabd->abd_size;
		}
#endif
		zio_buf_free(buf, n);
	} else {
		ASSERT0(abd_cmp_buf(abd, buf, n));
		zio_buf_free(buf, n);
	}
}

void
abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
{
	if (!abd_is_linear(abd) || abd_is_from_pages(abd)) {
		abd_copy_from_buf(abd, buf, n);
	}
	abd_return_buf(abd, buf, n);
}

/*
 * This is abd_iter_page(), the function underneath abd_iterate_page_func().
 * It yields the next page struct and data offset and size within it, without
 * mapping it into the address space.
 */

/*
 * "Compound pages" are a group of pages that can be referenced from a single
 * struct page *. It's organised as a "head" page, followed by a series of
 * "tail" pages.
 *
 * In OpenZFS, compound pages are allocated using the __GFP_COMP flag, which we
 * get from scatter ABDs and SPL vmalloc slabs (ie >16K allocations). So a
 * great many of the IO buffers we get are going to be of this type.
 *
 * The tail pages are just regular PAGESIZE pages, and can be safely used
 * as-is. However, the head page has length covering itself and all the tail
 * pages. If the ABD chunk spans multiple pages, then we can use the head page
 * and a >PAGESIZE length, which is far more efficient.
 *
 * Before kernel 4.5 however, compound page heads were refcounted separately
 * from tail pages, such that moving back to the head page would require us to
 * take a reference to it and release it once we're completely finished with
 * it. In practice, that release could only happen when our caller is done with
 * the ABD, which we have no insight into from here.
10907a7741afSMartin Matuska  * page references on such ancient kernels, we disabled this special compound
10917a7741afSMartin Matuska  * page handling on kernels before 4.5, instead just treating each page
10927a7741afSMartin Matuska  * within it as a regular PAGESIZE page (which it is). This is slightly less
10937a7741afSMartin Matuska  * efficient, but makes everything far simpler.
10940d4ad640SMartin Matuska  *
10957a7741afSMartin Matuska  * We no longer support kernels before 4.5, so in theory none of this is
10967a7741afSMartin Matuska  * necessary. However, this code is still relatively new in the grand scheme of
10977a7741afSMartin Matuska  * things, so I'm leaving the ability to compile this out for the moment.
10987a7741afSMartin Matuska  *
10997a7741afSMartin Matuska  * Setting/clearing ABD_ITER_COMPOUND_PAGES below enables/disables the special
11007a7741afSMartin Matuska  * handling, by defining the ABD_ITER_PAGE_SIZE(page) macro to understand
11017a7741afSMartin Matuska  * compound pages, or not, and compiling in/out the support to detect compound
11027a7741afSMartin Matuska  * tail pages and move back to the start.
11030d4ad640SMartin Matuska  */
11047a7741afSMartin Matuska 
11057a7741afSMartin Matuska /* On by default */
11067a7741afSMartin Matuska #define ABD_ITER_COMPOUND_PAGES
11077a7741afSMartin Matuska 
11087a7741afSMartin Matuska #ifdef ABD_ITER_COMPOUND_PAGES
11090d4ad640SMartin Matuska #define ABD_ITER_PAGE_SIZE(page) \
11100d4ad640SMartin Matuska 	(PageCompound(page) ? page_size(page) : PAGESIZE)
11110d4ad640SMartin Matuska #else
11120d4ad640SMartin Matuska #define ABD_ITER_PAGE_SIZE(page) (PAGESIZE)
11130d4ad640SMartin Matuska #endif
11140d4ad640SMartin Matuska 
1115783d3ff6SMartin Matuska void
1116783d3ff6SMartin Matuska abd_iter_page(struct abd_iter *aiter)
1117783d3ff6SMartin Matuska {
1118783d3ff6SMartin Matuska 	if (abd_iter_at_end(aiter)) {
1119783d3ff6SMartin Matuska 		aiter->iter_page = NULL;
1120783d3ff6SMartin Matuska 		aiter->iter_page_doff = 0;
1121783d3ff6SMartin Matuska 		aiter->iter_page_dsize = 0;
1122783d3ff6SMartin Matuska 		return;
1123783d3ff6SMartin Matuska 	}
1124783d3ff6SMartin Matuska 
1125783d3ff6SMartin Matuska 	struct page *page;
1126783d3ff6SMartin Matuska 	size_t doff, dsize;
1127783d3ff6SMartin Matuska 
11280d4ad640SMartin Matuska 	/*
11290d4ad640SMartin Matuska 	 * Find the page, and the start of the data within it. This is computed
11300d4ad640SMartin Matuska 	 * differently for linear and scatter ABDs; linear is referenced by
11310d4ad640SMartin Matuska 	 * virtual memory location, while scatter is referenced by page
11320d4ad640SMartin Matuska 	 * pointer.
11330d4ad640SMartin Matuska 	 */
1134783d3ff6SMartin Matuska 	if (abd_is_linear(aiter->iter_abd)) {
1135783d3ff6SMartin Matuska 		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
1136783d3ff6SMartin Matuska 
1137783d3ff6SMartin Matuska 		/* memory address at iter_pos */
1138783d3ff6SMartin Matuska 		void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
1139783d3ff6SMartin Matuska 
1140783d3ff6SMartin Matuska 		/* struct page for address */
1141783d3ff6SMartin Matuska 		page = is_vmalloc_addr(paddr) ?
1142783d3ff6SMartin Matuska 		    vmalloc_to_page(paddr) : virt_to_page(paddr);
1143783d3ff6SMartin Matuska 
1144783d3ff6SMartin Matuska 		/* offset of address within the page */
1145783d3ff6SMartin Matuska 		doff = offset_in_page(paddr);
1146783d3ff6SMartin Matuska 	} else {
1147783d3ff6SMartin Matuska 		ASSERT(!abd_is_gang(aiter->iter_abd));
1148783d3ff6SMartin Matuska 
1149783d3ff6SMartin Matuska 		/* current scatter page */
11500d4ad640SMartin Matuska 		page = nth_page(sg_page(aiter->iter_sg),
11510d4ad640SMartin Matuska 		    aiter->iter_offset >> PAGE_SHIFT);
1152783d3ff6SMartin Matuska 
1153783d3ff6SMartin Matuska 		/* position within page */
11540d4ad640SMartin Matuska 		doff = aiter->iter_offset & (PAGESIZE - 1);
1155783d3ff6SMartin Matuska 	}
1156783d3ff6SMartin Matuska 
11570d4ad640SMartin Matuska #ifdef ABD_ITER_COMPOUND_PAGES
1158783d3ff6SMartin Matuska 	if (PageTail(page)) {
1159783d3ff6SMartin Matuska 		/*
11600d4ad640SMartin Matuska 		 * If this is a compound tail page, move back to the head, and
11610d4ad640SMartin Matuska 		 * adjust the offset to match. This may let us yield a much
11620d4ad640SMartin Matuska 		 * larger amount of data from a single logical page, and so
11630d4ad640SMartin Matuska 		 * leave our caller with fewer pages to process.
1164783d3ff6SMartin Matuska 		 */
1165783d3ff6SMartin Matuska 		struct page *head = compound_head(page);
1166783d3ff6SMartin Matuska 		doff += ((page - head) * PAGESIZE);
1167783d3ff6SMartin Matuska 		page = head;
1168783d3ff6SMartin Matuska 	}
1169783d3ff6SMartin Matuska #endif
1170783d3ff6SMartin Matuska 
11710d4ad640SMartin Matuska 	ASSERT(page);
11720d4ad640SMartin Matuska 
11730d4ad640SMartin Matuska 	/*
11740d4ad640SMartin Matuska 	 * Compute the maximum amount of data we can take from this page. This
11750d4ad640SMartin Matuska 	 * is the smaller of:
11760d4ad640SMartin Matuska 	 * - the remaining space in the page
11770d4ad640SMartin Matuska 	 * - the remaining space in this scatterlist entry (which may not cover
11780d4ad640SMartin Matuska 	 *   the entire page)
11790d4ad640SMartin Matuska 	 * - the remaining space in the abd (which may not cover the entire
11800d4ad640SMartin Matuska 	 *   scatterlist entry)
11810d4ad640SMartin Matuska 	 */
11820d4ad640SMartin Matuska 	dsize = MIN(ABD_ITER_PAGE_SIZE(page) - doff,
11830d4ad640SMartin Matuska 	    aiter->iter_abd->abd_size - aiter->iter_pos);
11840d4ad640SMartin Matuska 	if (!abd_is_linear(aiter->iter_abd))
11850d4ad640SMartin Matuska 		dsize = MIN(dsize, aiter->iter_sg->length - aiter->iter_offset);
11860d4ad640SMartin Matuska 	ASSERT3U(dsize, >, 0);
11870d4ad640SMartin Matuska 
11880d4ad640SMartin Matuska 	/* final iterator outputs */
1189783d3ff6SMartin Matuska 	aiter->iter_page = page;
1190783d3ff6SMartin Matuska 	aiter->iter_page_doff = doff;
11910d4ad640SMartin Matuska 	aiter->iter_page_dsize = dsize;
1192783d3ff6SMartin Matuska }
1193783d3ff6SMartin Matuska 
1194783d3ff6SMartin Matuska /*
1195783d3ff6SMartin Matuska  * Note: ABD BIO functions only needed to support vdev_classic. See comments in
1196783d3ff6SMartin Matuska  * vdev_disk.c.
1197783d3ff6SMartin Matuska  */
1198783d3ff6SMartin Matuska 
1199783d3ff6SMartin Matuska /*
1200eda14cbcSMatt Macy  * bio_nr_pages for ABD.
1201eda14cbcSMatt Macy  * @off is the offset in @abd
1202eda14cbcSMatt Macy  */
1203eda14cbcSMatt Macy unsigned long
1204eda14cbcSMatt Macy abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
1205eda14cbcSMatt Macy {
1206eda14cbcSMatt Macy 	unsigned long pos;
1207eda14cbcSMatt Macy 
1208184c1b94SMartin Matuska 	if (abd_is_gang(abd)) {
1209184c1b94SMartin Matuska 		unsigned long count = 0;
1210eda14cbcSMatt Macy 
1211184c1b94SMartin Matuska 		for (abd_t *cabd = abd_gang_get_offset(abd, &off);
1212184c1b94SMartin Matuska 		    cabd != NULL && size != 0;
1213184c1b94SMartin Matuska 		    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
1214184c1b94SMartin Matuska 			ASSERT3U(off, <, cabd->abd_size);
1215184c1b94SMartin Matuska 			int mysize = MIN(size, cabd->abd_size - off);
1216184c1b94SMartin Matuska 			count += abd_nr_pages_off(cabd, mysize, off);
1217184c1b94SMartin Matuska 			size -= mysize;
1218184c1b94SMartin Matuska 			off = 0;
1219184c1b94SMartin Matuska 		}
1220184c1b94SMartin Matuska 		return (count);
1221184c1b94SMartin Matuska 	}
1222184c1b94SMartin Matuska 
1223eda14cbcSMatt Macy 	if (abd_is_linear(abd))
1224eda14cbcSMatt Macy 		pos = (unsigned long)abd_to_buf(abd) + off;
1225eda14cbcSMatt Macy 	else
1226eda14cbcSMatt Macy 		pos = ABD_SCATTER(abd).abd_offset + off;
1227eda14cbcSMatt Macy 
1228184c1b94SMartin Matuska 	return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
1229184c1b94SMartin Matuska 	    (pos >> PAGE_SHIFT));
1230eda14cbcSMatt Macy }
1231eda14cbcSMatt Macy 
1232eda14cbcSMatt Macy static unsigned int
1233eda14cbcSMatt Macy bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size)
1234eda14cbcSMatt Macy {
1235eda14cbcSMatt Macy 	unsigned int offset, size, i;
1236eda14cbcSMatt Macy 	struct page *page;
1237eda14cbcSMatt Macy 
1238eda14cbcSMatt Macy 	offset = offset_in_page(buf_ptr);
1239eda14cbcSMatt Macy 	for (i = 0; i < bio->bi_max_vecs; i++) {
1240eda14cbcSMatt Macy 		size = PAGE_SIZE - offset;
1241eda14cbcSMatt Macy 
1242eda14cbcSMatt Macy 		if (bio_size <= 0)
1243eda14cbcSMatt Macy 			break;
1244eda14cbcSMatt Macy 
1245eda14cbcSMatt Macy 		if (size > bio_size)
1246eda14cbcSMatt Macy 			size = bio_size;
1247eda14cbcSMatt Macy 
1248eda14cbcSMatt Macy 		if (is_vmalloc_addr(buf_ptr))
1249eda14cbcSMatt Macy 			page = vmalloc_to_page(buf_ptr);
1250eda14cbcSMatt Macy 		else
1251eda14cbcSMatt Macy 			page = virt_to_page(buf_ptr);
1252eda14cbcSMatt Macy 
1253eda14cbcSMatt Macy 		/*
1254eda14cbcSMatt Macy 		 * Some network-related block devices use tcp_sendpage, which
1255eda14cbcSMatt Macy 		 * doesn't behave well when given a 0-count page; this is a
1256eda14cbcSMatt Macy 		 * safety net to catch them.
1257eda14cbcSMatt Macy 		 */
1258eda14cbcSMatt Macy 		ASSERT3S(page_count(page), >, 0);
1259eda14cbcSMatt Macy 
1260eda14cbcSMatt Macy 		if (bio_add_page(bio, page, size, offset) != size)
1261eda14cbcSMatt Macy 			break;
1262eda14cbcSMatt Macy 
1263eda14cbcSMatt Macy 		buf_ptr += size;
1264eda14cbcSMatt Macy 		bio_size -= size;
1265eda14cbcSMatt Macy 		offset = 0;
1266eda14cbcSMatt Macy 	}
1267eda14cbcSMatt Macy 
1268eda14cbcSMatt Macy 	return (bio_size);
1269eda14cbcSMatt Macy }
1270eda14cbcSMatt Macy 
1271eda14cbcSMatt Macy /*
1272eda14cbcSMatt Macy  * bio_map for gang ABD.
1273eda14cbcSMatt Macy  */
1274eda14cbcSMatt Macy static unsigned int
1275eda14cbcSMatt Macy abd_gang_bio_map_off(struct bio *bio, abd_t *abd,
1276eda14cbcSMatt Macy     unsigned int io_size, size_t off)
1277eda14cbcSMatt Macy {
1278eda14cbcSMatt Macy 	ASSERT(abd_is_gang(abd));
1279eda14cbcSMatt Macy 
1280eda14cbcSMatt Macy 	for (abd_t *cabd = abd_gang_get_offset(abd, &off);
1281eda14cbcSMatt Macy 	    cabd != NULL;
1282eda14cbcSMatt Macy 	    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
1283eda14cbcSMatt Macy 		ASSERT3U(off, <, cabd->abd_size);
1284eda14cbcSMatt Macy 		int size = MIN(io_size, cabd->abd_size - off);
1285eda14cbcSMatt Macy 		int remainder = abd_bio_map_off(bio, cabd, size, off);
1286eda14cbcSMatt Macy 		io_size -= (size - remainder);
1287eda14cbcSMatt Macy 		if (io_size == 0 || remainder > 0)
1288eda14cbcSMatt Macy 			return (io_size);
1289eda14cbcSMatt Macy 		off = 0;
1290eda14cbcSMatt Macy 	}
1291eda14cbcSMatt Macy 	ASSERT0(io_size);
1292eda14cbcSMatt Macy 	return (io_size);
1293eda14cbcSMatt Macy }
1294eda14cbcSMatt Macy 
1295eda14cbcSMatt Macy /*
1296eda14cbcSMatt Macy  * bio_map for ABD.
1297eda14cbcSMatt Macy  * @off is the offset in @abd
1298eda14cbcSMatt Macy  * Remaining IO size is returned
1299eda14cbcSMatt Macy  */
1300eda14cbcSMatt Macy unsigned int
1301eda14cbcSMatt Macy abd_bio_map_off(struct bio *bio, abd_t *abd,
1302eda14cbcSMatt Macy     unsigned int io_size, size_t off)
1303eda14cbcSMatt Macy {
1304eda14cbcSMatt Macy 	struct abd_iter aiter;
1305eda14cbcSMatt Macy 
1306eda14cbcSMatt Macy 	ASSERT3U(io_size, <=, abd->abd_size - off);
1307eda14cbcSMatt Macy 	if (abd_is_linear(abd))
1308eda14cbcSMatt Macy 		return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, io_size));
1309eda14cbcSMatt Macy 
1310eda14cbcSMatt Macy 	ASSERT(!abd_is_linear(abd));
1311eda14cbcSMatt Macy 	if (abd_is_gang(abd))
1312eda14cbcSMatt Macy 		return (abd_gang_bio_map_off(bio, abd, io_size, off));
1313eda14cbcSMatt Macy 
1314eda14cbcSMatt Macy 	abd_iter_init(&aiter, abd);
1315eda14cbcSMatt Macy 	abd_iter_advance(&aiter, off);
1316eda14cbcSMatt Macy 
1317184c1b94SMartin Matuska 	for (int i = 0; i < bio->bi_max_vecs; i++) {
1318eda14cbcSMatt Macy 		struct page *pg;
1319eda14cbcSMatt Macy 		size_t len, sgoff, pgoff;
1320eda14cbcSMatt Macy 		struct scatterlist *sg;
1321eda14cbcSMatt Macy 
1322eda14cbcSMatt Macy 		if (io_size <= 0)
1323eda14cbcSMatt Macy 			break;
1324eda14cbcSMatt Macy 
1325eda14cbcSMatt Macy 		sg = aiter.iter_sg;
1326eda14cbcSMatt Macy 		sgoff = aiter.iter_offset;
1327eda14cbcSMatt Macy 		pgoff = sgoff & (PAGESIZE - 1);
1328eda14cbcSMatt Macy 		len = MIN(io_size, PAGESIZE - pgoff);
1329eda14cbcSMatt Macy 		ASSERT(len > 0);
1330eda14cbcSMatt Macy 
1331eda14cbcSMatt Macy 		pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
1332eda14cbcSMatt Macy 		if (bio_add_page(bio, pg, len, pgoff) != len)
1333eda14cbcSMatt Macy 			break;
1334eda14cbcSMatt Macy 
1335eda14cbcSMatt Macy 		io_size -= len;
1336eda14cbcSMatt Macy 		abd_iter_advance(&aiter, len);
1337eda14cbcSMatt Macy 	}
1338eda14cbcSMatt Macy 
1339eda14cbcSMatt Macy 	return (io_size);
1340eda14cbcSMatt Macy }
1341eda14cbcSMatt Macy 
1342eda14cbcSMatt Macy /* Tunable Parameters */
1343eda14cbcSMatt Macy module_param(zfs_abd_scatter_enabled, int, 0644);
1344eda14cbcSMatt Macy MODULE_PARM_DESC(zfs_abd_scatter_enabled,
1345eda14cbcSMatt Macy 	"Toggle whether ABD allocations must be linear.");
1346eda14cbcSMatt Macy module_param(zfs_abd_scatter_min_size, int, 0644);
1347eda14cbcSMatt Macy MODULE_PARM_DESC(zfs_abd_scatter_min_size,
1348eda14cbcSMatt Macy 	"Minimum size of scatter allocations.");
1349eda14cbcSMatt Macy module_param(zfs_abd_scatter_max_order, uint, 0644);
1350eda14cbcSMatt Macy MODULE_PARM_DESC(zfs_abd_scatter_max_order,
1351eda14cbcSMatt Macy 	"Maximum order allocation used for a scatter ABD.");
1352
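
/*
 * Editor's illustrative sketch (not part of the upstream file): the typical
 * borrow/return pattern documented in abd_return_buf() above. A consumer that
 * needs a flat, virtually contiguous view of an ABD borrows a raw buffer,
 * works on it, and returns it; the _copy variants propagate the data in each
 * direction. The function and its name below are hypothetical and only
 * demonstrate the calling convention of the existing abd_borrow_buf_copy()
 * and abd_return_buf_copy() interfaces.
 */
#if 0
static void
example_borrow_and_modify(abd_t *abd, size_t size)
{
	/* Borrow a linear buffer reflecting the ABD's current contents. */
	void *buf = abd_borrow_buf_copy(abd, size);

	/* ... operate on buf as ordinary kernel memory ... */

	/* Copy any changes back into the ABD and release the borrow. */
	abd_return_buf_copy(abd, buf, size);
}
#endif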