/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
/*
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 */

#ifdef _KERNEL

#include <sys/errno.h>
#include <sys/vmem.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/uio_impl.h>
#include <sys/string.h>
#include <sys/zfs_refcount.h>
#include <sys/zfs_debug.h>
#include <linux/kmap_compat.h>
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
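
/*
 * Linux kernel implementation of the ZFS uio routines: moving data
 * between kernel buffers and a zfs_uio_t backed by an iovec array, a
 * bio_vec array, a struct request, or an iov_iter, plus helpers for
 * prefaulting user pages and pinning pages for Direct I/O.
 */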

/*
 * Move "n" bytes at byte address "p"; "rw" indicates the direction
 * of the move, and the I/O parameters are provided in "uio", which is
 * updated to reflect the data which was moved.  Returns 0 on success or
 * a non-zero errno on failure.
 */
static int
zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct iovec *iov = uio->uio_iov;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE);
	while (n && uio->uio_resid) {
		cnt = MIN(iov->iov_len - skip, n);
		if (rw == UIO_READ)
			memcpy(iov->iov_base + skip, p, cnt);
		else
			memcpy(p, iov->iov_base + skip, cnt);
		skip += cnt;
		if (skip == iov->iov_len) {
			skip = 0;
			uio->uio_iov = (++iov);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}

static int
zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct bio_vec *bv = uio->uio_bvec;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	while (n && uio->uio_resid) {
		void *paddr;
		cnt = MIN(bv->bv_len - skip, n);

		paddr = zfs_kmap_local(bv->bv_page);
		if (rw == UIO_READ) {
			/* Copy from buffer 'p' to the bvec data */
			memcpy(paddr + bv->bv_offset + skip, p, cnt);
		} else {
			/* Copy from bvec data to buffer 'p' */
			memcpy(p, paddr + bv->bv_offset + skip, cnt);
		}
		zfs_kunmap_local(paddr);

		skip += cnt;
		if (skip == bv->bv_len) {
			skip = 0;
			uio->uio_bvec = (++bv);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}

static void
zfs_copy_bvec(void *p, size_t skip, size_t cnt, zfs_uio_rw_t rw,
    struct bio_vec *bv)
{
	void *paddr;

	paddr = zfs_kmap_local(bv->bv_page);
	if (rw == UIO_READ) {
		/* Copy from buffer 'p' to the bvec data */
		memcpy(paddr + bv->bv_offset + skip, p, cnt);
	} else {
		/* Copy from bvec data to buffer 'p' */
		memcpy(p, paddr + bv->bv_offset + skip, cnt);
	}
	zfs_kunmap_local(paddr);
}

/*
 * Copy 'n' bytes of data between the buffer p[] and the data represented
 * by the request in the uio.
 */
static int
zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	struct request *rq = uio->rq;
	struct bio_vec bv;
	struct req_iterator iter;
	size_t this_seg_start;	/* logical offset */
	size_t this_seg_end;	/* logical offset */
	size_t skip_in_seg;
	size_t copy_from_seg;
	size_t orig_loffset;
	int copied = 0;

	/*
	 * Get the original logical offset of this entire request (because
	 * uio->uio_loffset will be modified over time).
	 */
	orig_loffset = io_offset(NULL, rq);
	this_seg_start = orig_loffset;

	rq_for_each_segment(bv, rq, iter) {
		/*
		 * Lookup what the logical offset of the last byte of this
		 * segment is.
		 */
		this_seg_end = this_seg_start + bv.bv_len - 1;

		/*
		 * We only need to operate on segments that have data we're
		 * copying.
		 */
		if (uio->uio_loffset >= this_seg_start &&
		    uio->uio_loffset <= this_seg_end) {
			/*
			 * Some, or all, of the data in this segment needs to be
			 * copied.
			 */

			/*
			 * We may not be copying from the first byte in the
			 * segment.  Figure out how many bytes to skip copying
			 * from the beginning of this segment.
			 */
			skip_in_seg = uio->uio_loffset - this_seg_start;

			/*
			 * Calculate the total number of bytes from this
			 * segment that we will be copying.
			 */
			copy_from_seg = MIN(bv.bv_len - skip_in_seg, n);

			/* Copy the bytes */
			zfs_copy_bvec(p, skip_in_seg, copy_from_seg, rw, &bv);
			p = ((char *)p) + copy_from_seg;

			n -= copy_from_seg;
			uio->uio_resid -= copy_from_seg;
			uio->uio_loffset += copy_from_seg;
			copied = 1;	/* We copied some data */
		}

		this_seg_start = this_seg_end + 1;
	}

	if (!copied) {
		/* Didn't copy anything */
		uio->uio_resid = 0;
	}
	return (0);
}
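
/*
 * Copy "n" bytes between the buffer p[] and a bvec-backed uio, using the
 * request-based routine when a struct request is attached to the uio and
 * the plain bio_vec routine otherwise.
 */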
static int
zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->rq != NULL)
		return (zfs_uiomove_bvec_rq(p, n, rw, uio));
	return (zfs_uiomove_bvec_impl(p, n, rw, uio));
}

static int
zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio,
    boolean_t revert)
{
	size_t cnt = MIN(n, uio->uio_resid);

	if (uio->uio_skip)
		iov_iter_advance(uio->uio_iter, uio->uio_skip);

	if (rw == UIO_READ)
		cnt = copy_to_iter(p, cnt, uio->uio_iter);
	else
		cnt = copy_from_iter(p, cnt, uio->uio_iter);

	/*
	 * When operating on a full pipe no bytes are processed.  In that
	 * case return EFAULT, which is converted to EAGAIN by the kernel's
	 * generic_file_splice_read() function.
	 */
	if (cnt == 0)
		return (EFAULT);

	/*
	 * Revert advancing the uio_iter.  This is set by zfs_uiocopy()
	 * to avoid consuming the uio and its iov_iter structure.
	 */
	if (revert)
		iov_iter_revert(uio->uio_iter, cnt);

	uio->uio_resid -= cnt;
	uio->uio_loffset += cnt;

	return (0);
}
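
/*
 * Move "n" bytes between the buffer p[] and the memory described by the
 * uio, dispatching on the segment type (bio_vec, iov_iter, or iovec).
 * For example (illustrative only), zfs_uiomove(buf, nbytes, UIO_READ, uio)
 * copies nbytes from the kernel buffer buf out to the memory described
 * by the uio.
 */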
int
zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_BVEC)
		return (zfs_uiomove_bvec(p, n, rw, uio));
	else if (uio->uio_segflg == UIO_ITER)
		return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE));
	else
		return (zfs_uiomove_iov(p, n, rw, uio));
}
EXPORT_SYMBOL(zfs_uiomove);

/*
 * Fault in the pages of the first n bytes specified by the uio structure.
 * 1 byte in each page is touched and the uio struct is unmodified.  Any
 * error terminates the prefault, as this is only a best-effort attempt to
 * get the pages resident.
 */
int
zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC ||
	    (uio->uio_extflg & UIO_DIRECT)) {
		/*
		 * There is never a need to fault in kernel pages or Direct I/O
		 * write pages.  Direct I/O write pages have already been
		 * pinned, so a fault can never occur for them.
		 */
		return (0);
	} else {
		ASSERT3S(uio->uio_segflg, ==, UIO_ITER);
		/*
		 * On at least a Linux 4.18 kernel,
		 * iov_iter_fault_in_readable() can be relied on to fault
		 * in user pages when they are referenced.
		 */
		if (iov_iter_fault_in_readable(uio->uio_iter, n))
			return (EFAULT);
	}

	return (0);
}
EXPORT_SYMBOL(zfs_uio_prefaultpages);

/*
 * Same as zfs_uiomove(), but does not modify the uio structure.  The
 * number of bytes copied is returned in cbytes.
 */
int
zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes)
{
	zfs_uio_t uio_copy;
	int ret;

	memcpy(&uio_copy, uio, sizeof (zfs_uio_t));

	if (uio->uio_segflg == UIO_BVEC)
		ret = zfs_uiomove_bvec(p, n, rw, &uio_copy);
	else if (uio->uio_segflg == UIO_ITER)
		ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE);
	else
		ret = zfs_uiomove_iov(p, n, rw, &uio_copy);

	*cbytes = uio->uio_resid - uio_copy.uio_resid;

	return (ret);
}
EXPORT_SYMBOL(zfs_uiocopy);

/*
 * Drop the next n chars out of *uio.
 */
void
zfs_uioskip(zfs_uio_t *uio, size_t n)
{
	if (n > uio->uio_resid)
		return;
	/*
	 * When using a uio with a struct request, we simply
	 * use uio_loffset as a pointer to the next logical byte to
	 * copy in the request.  We don't have to do any fancy
	 * accounting with uio_bvec/uio_iovcnt since we don't use
	 * them.
	 */
	if (uio->uio_segflg == UIO_BVEC && uio->rq == NULL) {
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_bvec->bv_len) {
			uio->uio_skip -= uio->uio_bvec->bv_len;
			uio->uio_bvec++;
			uio->uio_iovcnt--;
		}
	} else if (uio->uio_segflg == UIO_ITER) {
		iov_iter_advance(uio->uio_iter, n);
	} else {
		ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE);
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_iov->iov_len) {
			uio->uio_skip -= uio->uio_iov->iov_len;
			uio->uio_iov++;
			uio->uio_iovcnt--;
		}
	}

	uio->uio_loffset += n;
	uio->uio_resid -= n;
}
EXPORT_SYMBOL(zfs_uioskip);

/*
 * Check if the uio is page-aligned in memory.
 */
boolean_t
zfs_uio_page_aligned(zfs_uio_t *uio)
{
	boolean_t aligned = B_TRUE;

	if (uio->uio_segflg == UIO_SYSSPACE) {
		const struct iovec *iov = uio->uio_iov;
		size_t skip = uio->uio_skip;

		for (int i = uio->uio_iovcnt; i > 0; iov++, i--) {
			uintptr_t addr = (uintptr_t)(iov->iov_base + skip);
			size_t size = iov->iov_len - skip;
			if ((addr & (PAGE_SIZE - 1)) ||
			    (size & (PAGE_SIZE - 1))) {
				aligned = B_FALSE;
				break;
			}
			skip = 0;
		}
	} else if (uio->uio_segflg == UIO_ITER) {
		unsigned long alignment =
		    iov_iter_alignment(uio->uio_iter);
		aligned = IS_P2ALIGNED(alignment, PAGE_SIZE);
	} else {
		/* Currently not supported */
		aligned = B_FALSE;
	}

	return (aligned);
}
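
/*
 * When ZERO_PAGE() is GPL-only, or on 32-bit (non-_LP64) kernels, the
 * zero-page replacement scheme below is disabled and the page marking
 * macros compile away to no-ops.
 */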
#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64)
#define	ZFS_MARKED_PAGE		0x0
#define	IS_ZFS_MARKED_PAGE(_p)	0
#define	zfs_mark_page(_p)
#define	zfs_unmark_page(_p)
#define	IS_ZERO_PAGE(_p)	0

#else
/*
 * Mark pages to know if they were allocated to replace ZERO_PAGE() for
 * Direct I/O writes.
 */
#define	ZFS_MARKED_PAGE		0x5a465350414745 /* ASCII: ZFSPAGE */
#define	IS_ZFS_MARKED_PAGE(_p) \
	(page_private(_p) == (unsigned long)ZFS_MARKED_PAGE)
#define	IS_ZERO_PAGE(_p) ((_p) == ZERO_PAGE(0))

static inline void
zfs_mark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	get_page(page);
	SetPagePrivate(page);
	set_page_private(page, ZFS_MARKED_PAGE);
}

static inline void
zfs_unmark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	set_page_private(page, 0UL);
	ClearPagePrivate(page);
	put_page(page);
}
#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */

#if !defined(HAVE_PIN_USER_PAGES_UNLOCKED)
static void
zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
{
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	for (long i = 0; i < uio->uio_dio.npages; i++) {
		struct page *p = uio->uio_dio.pages[i];
		lock_page(p);

		if (IS_ZERO_PAGE(p)) {
			/*
			 * If the user page points to the kernel's ZERO_PAGE(),
			 * a new zero-filled page will be allocated so the
			 * contents of the page cannot be changed by the user
			 * while a Direct I/O write is taking place.
			 */
			gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO |
			    __GFP_ZERO | GFP_KERNEL;

			ASSERT0(IS_ZFS_MARKED_PAGE(p));
			unlock_page(p);
			put_page(p);

			uio->uio_dio.pages[i] =
			    __page_cache_alloc(gfp_zero_page);
			zfs_mark_page(uio->uio_dio.pages[i]);
		} else {
			unlock_page(p);
		}
	}
}
#endif

/*
 * Release the pages that were pinned (or referenced) for a Direct I/O
 * operation.
 */
void
zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	ASSERT(uio->uio_extflg & UIO_DIRECT);
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
	unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages);
#else
	for (long i = 0; i < uio->uio_dio.npages; i++) {
		struct page *p = uio->uio_dio.pages[i];

		if (IS_ZFS_MARKED_PAGE(p)) {
			zfs_unmark_page(p);
			__free_page(p);
			continue;
		}

		put_page(p);
	}
#endif
	vmem_free(uio->uio_dio.pages,
	    uio->uio_dio.npages * sizeof (struct page *));
}
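
/*
 * Pin (or take references on) the user pages backing the uio so they can
 * be used directly for Direct I/O.  Kernels that provide
 * pin_user_pages_unlocked() use it; older kernels fall back to
 * iov_iter_get_pages().
 */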
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
static int
zfs_uio_pin_user_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	long res;
	size_t skip = uio->uio_skip;
	size_t len = uio->uio_resid - skip;
	unsigned int gup_flags = 0;
	unsigned long addr;
	unsigned long nr_pages;

	/*
	 * Kernel 6.2 introduced the FOLL_PCI_P2PDMA flag.  This flag could
	 * possibly be used here in the future to allow for P2P operations
	 * with user pages.
	 */
	if (rw == UIO_READ)
		gup_flags = FOLL_WRITE;

	if (len == 0)
		return (0);

#if defined(HAVE_ITER_IS_UBUF)
	if (iter_is_ubuf(uio->uio_iter)) {
		nr_pages = DIV_ROUND_UP(len, PAGE_SIZE);
		addr = (unsigned long)uio->uio_iter->ubuf + skip;
		res = pin_user_pages_unlocked(addr, nr_pages,
		    &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
		if (res < 0) {
			return (SET_ERROR(-res));
		} else if (len != (res * PAGE_SIZE)) {
			uio->uio_dio.npages += res;
			return (SET_ERROR(EFAULT));
		}
		uio->uio_dio.npages += res;
		return (0);
	}
#endif
	const struct iovec *iovp = zfs_uio_iter_iov(uio->uio_iter);
	for (int i = 0; i < uio->uio_iovcnt; i++) {
		size_t amt = iovp->iov_len - skip;
		if (amt == 0) {
			iovp++;
			skip = 0;
			continue;
		}

		addr = (unsigned long)iovp->iov_base + skip;
		nr_pages = DIV_ROUND_UP(amt, PAGE_SIZE);
		res = pin_user_pages_unlocked(addr, nr_pages,
		    &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
		if (res < 0) {
			return (SET_ERROR(-res));
		} else if (amt != (res * PAGE_SIZE)) {
			uio->uio_dio.npages += res;
			return (SET_ERROR(EFAULT));
		}

		len -= amt;
		uio->uio_dio.npages += res;
		skip = 0;
		iovp++;
	}

	ASSERT0(len);

	return (0);
}

#else
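/*
 * Fallback for kernels without pin_user_pages_unlocked(): take page
 * references with iov_iter_get_pages(), advancing the iter as pages are
 * collected and then reverting it so the caller's uio is left unchanged.
 */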
static int
zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	size_t start;
	size_t wanted = uio->uio_resid - uio->uio_skip;
	ssize_t rollback = 0;
	ssize_t cnt;
	unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);

	while (wanted) {
		cnt = iov_iter_get_pages(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &start);
		if (cnt < 0) {
			iov_iter_revert(uio->uio_iter, rollback);
			return (SET_ERROR(-cnt));
		}
		/*
		 * All Direct I/O operations must be page aligned.
		 */
		ASSERT(IS_P2ALIGNED(start, PAGE_SIZE));
		uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE);
		rollback += cnt;
		wanted -= cnt;
		iov_iter_advance(uio->uio_iter, cnt);
	}
	ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip);
	iov_iter_revert(uio->uio_iter, rollback);

	return (0);
}
#endif /* HAVE_PIN_USER_PAGES_UNLOCKED */

/*
 * Pin the user pages backing the uio for a Direct I/O operation.  If the
 * pages cannot be pinned an error value is returned; on success 0 is
 * returned and the pages are later released by zfs_uio_free_dio_pages().
 */
int
zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	int error = 0;
	long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE);
	size_t size = npages * sizeof (struct page *);

	if (uio->uio_segflg == UIO_ITER) {
		uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
		error = zfs_uio_pin_user_pages(uio, rw);
#else
		error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#endif
	} else {
		return (SET_ERROR(EOPNOTSUPP));
	}

	ASSERT3S(uio->uio_dio.npages, >=, 0);

	if (error) {
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
		unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages);
#else
		for (long i = 0; i < uio->uio_dio.npages; i++)
			put_page(uio->uio_dio.pages[i]);
#endif
		vmem_free(uio->uio_dio.pages, size);
		return (error);
	} else {
		ASSERT3S(uio->uio_dio.npages, ==, npages);
	}

#if !defined(HAVE_PIN_USER_PAGES_UNLOCKED)
	if (rw == UIO_WRITE)
		zfs_uio_dio_check_for_zero_page(uio);
#endif

	uio->uio_extflg |= UIO_DIRECT;

	return (0);
}

#endif /* _KERNEL */