1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (C) 2011 Lawrence Livermore National Security, LLC. 24 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 25 * Written by Brian Behlendorf <behlendorf1@llnl.gov>. 26 * LLNL-CODE-403049. 27 */ 28 29 #ifndef _ZFS_BLKDEV_H 30 #define _ZFS_BLKDEV_H 31 32 #include <linux/blkdev.h> 33 #include <linux/backing-dev.h> 34 #include <linux/hdreg.h> 35 #include <linux/major.h> 36 #include <linux/msdos_fs.h> /* for SECTOR_* */ 37 #include <linux/bio.h> 38 #include <linux/blk-mq.h> 39 40 /* 41 * 6.11 API 42 * Setting the flush flags directly is no longer possible; flush flags are set 43 * on the queue_limits structure and passed to blk_disk_alloc(). In this case 44 * we remove this function entirely. 45 */ 46 #if !defined(HAVE_BLK_ALLOC_DISK_2ARG) || \ 47 !defined(HAVE_BLKDEV_QUEUE_LIMITS_FEATURES) 48 static inline void 49 blk_queue_set_write_cache(struct request_queue *q, bool on) 50 { 51 if (on) { 52 blk_queue_flag_set(QUEUE_FLAG_WC, q); 53 blk_queue_flag_set(QUEUE_FLAG_FUA, q); 54 } else { 55 blk_queue_flag_clear(QUEUE_FLAG_WC, q); 56 blk_queue_flag_clear(QUEUE_FLAG_FUA, q); 57 } 58 } 59 #endif /* !HAVE_BLK_ALLOC_DISK_2ARG || !HAVE_BLKDEV_QUEUE_LIMITS_FEATURES */ 60 61 /* 62 * Detect if a device has a write cache. Used to set the intial value for the 63 * vdev nowritecache flag. 64 * 65 * 4.10: QUEUE_FLAG_WC added. Initialised by the driver, but can be changed 66 * later by the operator. If not set, kernel will return flush requests 67 * immediately without doing anything. 68 * 6.6: QUEUE_FLAG_HW_WC added. Initialised by the driver, can't be changed. 69 * Only controls if the operator is allowed to change _WC. Initial version 70 * buggy; aliased to QUEUE_FLAG_FUA, so unuseable. 71 * 6.6.10, 6.7: QUEUE_FLAG_HW_WC fixed. 72 * 73 * Older than 4.10 we just assume write cache, and let the normal flush fail 74 * detection apply. 75 */ 76 static inline boolean_t 77 zfs_bdev_has_write_cache(struct block_device *bdev) 78 { 79 #if defined(QUEUE_FLAG_HW_WC) && QUEUE_FLAG_HW_WC != QUEUE_FLAG_FUA 80 return (test_bit(QUEUE_FLAG_HW_WC, &bdev_get_queue(bdev)->queue_flags)); 81 #elif defined(QUEUE_FLAG_WC) 82 return (test_bit(QUEUE_FLAG_WC, &bdev_get_queue(bdev)->queue_flags)); 83 #else 84 return (B_TRUE); 85 #endif 86 } 87 88 static inline void 89 blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages) 90 { 91 #if !defined(HAVE_BLK_QUEUE_UPDATE_READAHEAD) && \ 92 !defined(HAVE_DISK_UPDATE_READAHEAD) 93 #if defined(HAVE_BLK_QUEUE_BDI_DYNAMIC) 94 q->backing_dev_info->ra_pages = ra_pages; 95 #elif defined(HAVE_BLK_QUEUE_DISK_BDI) 96 q->disk->bdi->ra_pages = ra_pages; 97 #else 98 q->backing_dev_info.ra_pages = ra_pages; 99 #endif 100 #endif 101 } 102 103 #define BIO_BI_SECTOR(bio) (bio)->bi_iter.bi_sector 104 #define BIO_BI_SIZE(bio) (bio)->bi_iter.bi_size 105 #define BIO_BI_IDX(bio) (bio)->bi_iter.bi_idx 106 #define BIO_BI_SKIP(bio) (bio)->bi_iter.bi_bvec_done 107 #define bio_for_each_segment4(bv, bvp, b, i) \ 108 bio_for_each_segment((bv), (b), (i)) 109 typedef struct bvec_iter bvec_iterator_t; 110 111 static inline void 112 bio_set_flags_failfast(struct block_device *bdev, int *flags, bool dev, 113 bool transport, bool driver) 114 { 115 #ifdef CONFIG_BUG 116 /* 117 * Disable FAILFAST for loopback devices because of the 118 * following incorrect BUG_ON() in loop_make_request(). 119 * This support is also disabled for md devices because the 120 * test suite layers md devices on top of loopback devices. 121 * This may be removed when the loopback driver is fixed. 122 * 123 * BUG_ON(!lo || (rw != READ && rw != WRITE)); 124 */ 125 if ((MAJOR(bdev->bd_dev) == LOOP_MAJOR) || 126 (MAJOR(bdev->bd_dev) == MD_MAJOR)) 127 return; 128 129 #ifdef BLOCK_EXT_MAJOR 130 if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) 131 return; 132 #endif /* BLOCK_EXT_MAJOR */ 133 #endif /* CONFIG_BUG */ 134 135 if (dev) 136 *flags |= REQ_FAILFAST_DEV; 137 if (transport) 138 *flags |= REQ_FAILFAST_TRANSPORT; 139 if (driver) 140 *flags |= REQ_FAILFAST_DRIVER; 141 } 142 143 /* 144 * Maximum disk label length, it may be undefined for some kernels. 145 */ 146 #if !defined(DISK_NAME_LEN) 147 #define DISK_NAME_LEN 32 148 #endif /* DISK_NAME_LEN */ 149 150 static inline int 151 bi_status_to_errno(blk_status_t status) 152 { 153 switch (status) { 154 case BLK_STS_OK: 155 return (0); 156 case BLK_STS_NOTSUPP: 157 return (EOPNOTSUPP); 158 case BLK_STS_TIMEOUT: 159 return (ETIMEDOUT); 160 case BLK_STS_NOSPC: 161 return (ENOSPC); 162 case BLK_STS_TRANSPORT: 163 return (ENOLINK); 164 case BLK_STS_TARGET: 165 return (EREMOTEIO); 166 #ifdef HAVE_BLK_STS_RESV_CONFLICT 167 case BLK_STS_RESV_CONFLICT: 168 #else 169 case BLK_STS_NEXUS: 170 #endif 171 return (EBADE); 172 case BLK_STS_MEDIUM: 173 return (ENODATA); 174 case BLK_STS_PROTECTION: 175 return (EILSEQ); 176 case BLK_STS_RESOURCE: 177 return (ENOMEM); 178 case BLK_STS_AGAIN: 179 return (EAGAIN); 180 case BLK_STS_IOERR: 181 return (EIO); 182 default: 183 return (EIO); 184 } 185 } 186 187 static inline blk_status_t 188 errno_to_bi_status(int error) 189 { 190 switch (error) { 191 case 0: 192 return (BLK_STS_OK); 193 case EOPNOTSUPP: 194 return (BLK_STS_NOTSUPP); 195 case ETIMEDOUT: 196 return (BLK_STS_TIMEOUT); 197 case ENOSPC: 198 return (BLK_STS_NOSPC); 199 case ENOLINK: 200 return (BLK_STS_TRANSPORT); 201 case EREMOTEIO: 202 return (BLK_STS_TARGET); 203 case EBADE: 204 #ifdef HAVE_BLK_STS_RESV_CONFLICT 205 return (BLK_STS_RESV_CONFLICT); 206 #else 207 return (BLK_STS_NEXUS); 208 #endif 209 case ENODATA: 210 return (BLK_STS_MEDIUM); 211 case EILSEQ: 212 return (BLK_STS_PROTECTION); 213 case ENOMEM: 214 return (BLK_STS_RESOURCE); 215 case EAGAIN: 216 return (BLK_STS_AGAIN); 217 case EIO: 218 return (BLK_STS_IOERR); 219 default: 220 return (BLK_STS_IOERR); 221 } 222 } 223 224 /* 225 * 5.15 MACRO, 226 * GD_DEAD 227 * 228 * 2.6.36 - 5.14 MACRO, 229 * GENHD_FL_UP 230 * 231 * Check the disk status and return B_TRUE if alive 232 * otherwise B_FALSE 233 */ 234 static inline boolean_t 235 zfs_check_disk_status(struct block_device *bdev) 236 { 237 #if defined(GENHD_FL_UP) 238 return (!!(bdev->bd_disk->flags & GENHD_FL_UP)); 239 #elif defined(GD_DEAD) 240 return (!test_bit(GD_DEAD, &bdev->bd_disk->state)); 241 #else 242 /* 243 * This is encountered if neither GENHD_FL_UP nor GD_DEAD is available in 244 * the kernel - likely due to an MACRO change that needs to be chased down. 245 */ 246 #error "Unsupported kernel: no usable disk status check" 247 #endif 248 } 249 250 /* 251 * 5.17 API change 252 * 253 * GENHD_FL_EXT_DEVT flag removed 254 * GENHD_FL_NO_PART_SCAN renamed GENHD_FL_NO_PART 255 */ 256 #ifndef HAVE_GENHD_FL_EXT_DEVT 257 #define GENHD_FL_EXT_DEVT (0) 258 #endif 259 #ifndef HAVE_GENHD_FL_NO_PART 260 #define GENHD_FL_NO_PART (GENHD_FL_NO_PART_SCAN) 261 #endif 262 263 /* 264 * 4.1 API, 265 * 3.10.0 CentOS 7.x API, 266 * blkdev_reread_part() 267 * 268 * For older kernels trigger a re-reading of the partition table by calling 269 * check_disk_change() which calls flush_disk() to invalidate the device. 270 * 271 * For newer kernels (as of 5.10), bdev_check_media_change is used, in favor of 272 * check_disk_change(), with the modification that invalidation is no longer 273 * forced. 274 */ 275 #ifdef HAVE_CHECK_DISK_CHANGE 276 #define zfs_check_media_change(bdev) check_disk_change(bdev) 277 #ifdef HAVE_BLKDEV_REREAD_PART 278 #define vdev_bdev_reread_part(bdev) blkdev_reread_part(bdev) 279 #else 280 #define vdev_bdev_reread_part(bdev) check_disk_change(bdev) 281 #endif /* HAVE_BLKDEV_REREAD_PART */ 282 #else 283 #ifdef HAVE_BDEV_CHECK_MEDIA_CHANGE 284 static inline int 285 zfs_check_media_change(struct block_device *bdev) 286 { 287 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK 288 struct gendisk *gd = bdev->bd_disk; 289 const struct block_device_operations *bdo = gd->fops; 290 #endif 291 292 if (!bdev_check_media_change(bdev)) 293 return (0); 294 295 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK 296 /* 297 * Force revalidation, to mimic the old behavior of 298 * check_disk_change() 299 */ 300 if (bdo->revalidate_disk) 301 bdo->revalidate_disk(gd); 302 #endif 303 304 return (0); 305 } 306 #define vdev_bdev_reread_part(bdev) zfs_check_media_change(bdev) 307 #elif defined(HAVE_DISK_CHECK_MEDIA_CHANGE) 308 #define vdev_bdev_reread_part(bdev) disk_check_media_change(bdev->bd_disk) 309 #define zfs_check_media_change(bdev) disk_check_media_change(bdev->bd_disk) 310 #else 311 /* 312 * This is encountered if check_disk_change() and bdev_check_media_change() 313 * are not available in the kernel - likely due to an API change that needs 314 * to be chased down. 315 */ 316 #error "Unsupported kernel: no usable disk change check" 317 #endif /* HAVE_BDEV_CHECK_MEDIA_CHANGE */ 318 #endif /* HAVE_CHECK_DISK_CHANGE */ 319 320 /* 321 * 2.6.27 API change 322 * The function was exported for use, prior to this it existed but the 323 * symbol was not exported. 324 * 325 * 5.11 API change 326 * Changed to take a dev_t argument which is set on success and return a 327 * non-zero error code on failure. 328 */ 329 static inline int 330 vdev_lookup_bdev(const char *path, dev_t *dev) 331 { 332 #if defined(HAVE_DEVT_LOOKUP_BDEV) 333 return (lookup_bdev(path, dev)); 334 #elif defined(HAVE_1ARG_LOOKUP_BDEV) 335 struct block_device *bdev = lookup_bdev(path); 336 if (IS_ERR(bdev)) 337 return (PTR_ERR(bdev)); 338 339 *dev = bdev->bd_dev; 340 bdput(bdev); 341 342 return (0); 343 #else 344 #error "Unsupported kernel" 345 #endif 346 } 347 348 #if defined(HAVE_BLK_MODE_T) 349 #define blk_mode_is_open_write(flag) ((flag) & BLK_OPEN_WRITE) 350 #else 351 #define blk_mode_is_open_write(flag) ((flag) & FMODE_WRITE) 352 #endif 353 354 /* 355 * Kernels without bio_set_op_attrs use bi_rw for the bio flags. 356 */ 357 #if !defined(HAVE_BIO_SET_OP_ATTRS) 358 static inline void 359 bio_set_op_attrs(struct bio *bio, unsigned rw, unsigned flags) 360 { 361 bio->bi_opf = rw | flags; 362 } 363 #endif 364 365 /* 366 * bio_set_flush - Set the appropriate flags in a bio to guarantee 367 * data are on non-volatile media on completion. 368 */ 369 static inline void 370 bio_set_flush(struct bio *bio) 371 { 372 bio_set_op_attrs(bio, 0, REQ_PREFLUSH | REQ_OP_WRITE); 373 } 374 375 /* 376 * 4.8 API, 377 * REQ_OP_FLUSH 378 * 379 * in all cases but may have a performance impact for some kernels. It 380 * has the advantage of minimizing kernel specific changes in the zvol code. 381 * 382 */ 383 static inline boolean_t 384 bio_is_flush(struct bio *bio) 385 { 386 return (bio_op(bio) == REQ_OP_FLUSH); 387 } 388 389 /* 390 * 4.8 API, 391 * REQ_FUA flag moved to bio->bi_opf 392 */ 393 static inline boolean_t 394 bio_is_fua(struct bio *bio) 395 { 396 return (bio->bi_opf & REQ_FUA); 397 } 398 399 /* 400 * 4.8 API, 401 * REQ_OP_DISCARD 402 * 403 * In all cases the normal I/O path is used for discards. The only 404 * difference is how the kernel tags individual I/Os as discards. 405 */ 406 static inline boolean_t 407 bio_is_discard(struct bio *bio) 408 { 409 return (bio_op(bio) == REQ_OP_DISCARD); 410 } 411 412 /* 413 * 4.8 API, 414 * REQ_OP_SECURE_ERASE 415 */ 416 static inline boolean_t 417 bio_is_secure_erase(struct bio *bio) 418 { 419 return (bio_op(bio) == REQ_OP_SECURE_ERASE); 420 } 421 422 /* 423 * 2.6.33 API change 424 * Discard granularity and alignment restrictions may now be set. For 425 * older kernels which do not support this it is safe to skip it. 426 */ 427 static inline void 428 blk_queue_discard_granularity(struct request_queue *q, unsigned int dg) 429 { 430 q->limits.discard_granularity = dg; 431 } 432 433 /* 434 * 5.19 API, 435 * bdev_max_discard_sectors() 436 * 437 * 2.6.32 API, 438 * blk_queue_discard() 439 */ 440 static inline boolean_t 441 bdev_discard_supported(struct block_device *bdev) 442 { 443 #if defined(HAVE_BDEV_MAX_DISCARD_SECTORS) 444 return (bdev_max_discard_sectors(bdev) > 0 && 445 bdev_discard_granularity(bdev) > 0); 446 #elif defined(HAVE_BLK_QUEUE_DISCARD) 447 return (blk_queue_discard(bdev_get_queue(bdev)) > 0 && 448 bdev_get_queue(bdev)->limits.discard_granularity > 0); 449 #else 450 #error "Unsupported kernel" 451 #endif 452 } 453 454 /* 455 * 5.19 API, 456 * bdev_max_secure_erase_sectors() 457 * 458 * 4.8 API, 459 * blk_queue_secure_erase() 460 */ 461 static inline boolean_t 462 bdev_secure_discard_supported(struct block_device *bdev) 463 { 464 #if defined(HAVE_BDEV_MAX_SECURE_ERASE_SECTORS) 465 return (!!bdev_max_secure_erase_sectors(bdev)); 466 #elif defined(HAVE_BLK_QUEUE_SECURE_ERASE) 467 return (!!blk_queue_secure_erase(bdev_get_queue(bdev))); 468 #else 469 #error "Unsupported kernel" 470 #endif 471 } 472 473 /* 474 * A common holder for vdev_bdev_open() is used to relax the exclusive open 475 * semantics slightly. Internal vdev disk callers may pass VDEV_HOLDER to 476 * allow them to open the device multiple times. Other kernel callers and 477 * user space processes which don't pass this value will get EBUSY. This is 478 * currently required for the correct operation of hot spares. 479 */ 480 #define VDEV_HOLDER ((void *)0x2401de7) 481 482 static inline unsigned long 483 blk_generic_start_io_acct(struct request_queue *q __attribute__((unused)), 484 struct gendisk *disk __attribute__((unused)), 485 int rw __attribute__((unused)), struct bio *bio) 486 { 487 #if defined(HAVE_BDEV_IO_ACCT_63) 488 return (bdev_start_io_acct(bio->bi_bdev, bio_op(bio), 489 jiffies)); 490 #elif defined(HAVE_BDEV_IO_ACCT_OLD) 491 return (bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio), 492 bio_op(bio), jiffies)); 493 #elif defined(HAVE_DISK_IO_ACCT) 494 return (disk_start_io_acct(disk, bio_sectors(bio), bio_op(bio))); 495 #elif defined(HAVE_BIO_IO_ACCT) 496 return (bio_start_io_acct(bio)); 497 #elif defined(HAVE_GENERIC_IO_ACCT_4ARG) 498 unsigned long start_time = jiffies; 499 generic_start_io_acct(q, rw, bio_sectors(bio), &disk->part0); 500 return (start_time); 501 #else 502 /* Unsupported */ 503 return (0); 504 #endif 505 } 506 507 static inline void 508 blk_generic_end_io_acct(struct request_queue *q __attribute__((unused)), 509 struct gendisk *disk __attribute__((unused)), 510 int rw __attribute__((unused)), struct bio *bio, unsigned long start_time) 511 { 512 #if defined(HAVE_BDEV_IO_ACCT_63) 513 bdev_end_io_acct(bio->bi_bdev, bio_op(bio), bio_sectors(bio), 514 start_time); 515 #elif defined(HAVE_BDEV_IO_ACCT_OLD) 516 bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time); 517 #elif defined(HAVE_DISK_IO_ACCT) 518 disk_end_io_acct(disk, bio_op(bio), start_time); 519 #elif defined(HAVE_BIO_IO_ACCT) 520 bio_end_io_acct(bio, start_time); 521 #elif defined(HAVE_GENERIC_IO_ACCT_4ARG) 522 generic_end_io_acct(q, rw, &disk->part0, start_time); 523 #endif 524 } 525 526 #ifndef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 527 static inline struct request_queue * 528 blk_generic_alloc_queue(make_request_fn make_request, int node_id) 529 { 530 #if defined(HAVE_BLK_ALLOC_QUEUE_REQUEST_FN) 531 return (blk_alloc_queue(make_request, node_id)); 532 #elif defined(HAVE_BLK_ALLOC_QUEUE_REQUEST_FN_RH) 533 return (blk_alloc_queue_rh(make_request, node_id)); 534 #else 535 struct request_queue *q = blk_alloc_queue(GFP_KERNEL); 536 if (q != NULL) 537 blk_queue_make_request(q, make_request); 538 539 return (q); 540 #endif 541 } 542 #endif /* !HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ 543 544 /* 545 * All the io_*() helper functions below can operate on a bio, or a rq, but 546 * not both. The older submit_bio() codepath will pass a bio, and the 547 * newer blk-mq codepath will pass a rq. 548 */ 549 static inline int 550 io_data_dir(struct bio *bio, struct request *rq) 551 { 552 if (rq != NULL) { 553 if (op_is_write(req_op(rq))) { 554 return (WRITE); 555 } else { 556 return (READ); 557 } 558 } 559 return (bio_data_dir(bio)); 560 } 561 562 static inline int 563 io_is_flush(struct bio *bio, struct request *rq) 564 { 565 if (rq != NULL) 566 return (req_op(rq) == REQ_OP_FLUSH); 567 return (bio_is_flush(bio)); 568 } 569 570 static inline int 571 io_is_discard(struct bio *bio, struct request *rq) 572 { 573 if (rq != NULL) 574 return (req_op(rq) == REQ_OP_DISCARD); 575 return (bio_is_discard(bio)); 576 } 577 578 static inline int 579 io_is_secure_erase(struct bio *bio, struct request *rq) 580 { 581 if (rq != NULL) 582 return (req_op(rq) == REQ_OP_SECURE_ERASE); 583 return (bio_is_secure_erase(bio)); 584 } 585 586 static inline int 587 io_is_fua(struct bio *bio, struct request *rq) 588 { 589 if (rq != NULL) 590 return (rq->cmd_flags & REQ_FUA); 591 return (bio_is_fua(bio)); 592 } 593 594 595 static inline uint64_t 596 io_offset(struct bio *bio, struct request *rq) 597 { 598 if (rq != NULL) 599 return (blk_rq_pos(rq) << 9); 600 return (BIO_BI_SECTOR(bio) << 9); 601 } 602 603 static inline uint64_t 604 io_size(struct bio *bio, struct request *rq) 605 { 606 if (rq != NULL) 607 return (blk_rq_bytes(rq)); 608 return (BIO_BI_SIZE(bio)); 609 } 610 611 static inline int 612 io_has_data(struct bio *bio, struct request *rq) 613 { 614 if (rq != NULL) 615 return (bio_has_data(rq->bio)); 616 return (bio_has_data(bio)); 617 } 618 #endif /* _ZFS_BLKDEV_H */ 619