1 /* $NetBSD: dev-io.c,v 1.10 2010/12/29 23:14:21 haad Exp $ */ 2 3 /* 4 * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved. 5 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 6 * 7 * This file is part of LVM2. 8 * 9 * This copyrighted material is made available to anyone wishing to use, 10 * modify, copy, or redistribute it subject to the terms and conditions 11 * of the GNU Lesser General Public License v.2.1. 12 * 13 * You should have received a copy of the GNU Lesser General Public License 14 * along with this program; if not, write to the Free Software Foundation, 15 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 */ 17 18 #include "lib.h" 19 #include "lvm-types.h" 20 #include "device.h" 21 #include "metadata.h" 22 #include "lvmcache.h" 23 #include "memlock.h" 24 #include "locking.h" 25 26 #include <limits.h> 27 #include <sys/stat.h> 28 #include <fcntl.h> 29 #include <unistd.h> 30 #include <sys/ioctl.h> 31 32 #ifdef linux 33 # define u64 uint64_t /* Missing without __KERNEL__ */ 34 # undef WNOHANG /* Avoid redefinition */ 35 # undef WUNTRACED /* Avoid redefinition */ 36 # include <linux/fs.h> /* For block ioctl definitions */ 37 # define BLKSIZE_SHIFT SECTOR_SHIFT 38 # ifndef BLKGETSIZE64 /* fs.h out-of-date */ 39 # define BLKGETSIZE64 _IOR(0x12, 114, size_t) 40 # endif /* BLKGETSIZE64 */ 41 #elif __NetBSD__ 42 # include <sys/disk.h> 43 # include <sys/disklabel.h> 44 # include <prop/proplib.h> 45 # include <sys/param.h> 46 #else 47 # include <sys/disk.h> 48 # define BLKBSZGET DKIOCGETBLOCKSIZE 49 # define BLKSSZGET DKIOCGETBLOCKSIZE 50 # define BLKGETSIZE64 DKIOCGETBLOCKCOUNT 51 # define BLKFLSBUF DKIOCSYNCHRONIZECACHE 52 # define BLKSIZE_SHIFT 0 53 #endif 54 55 #ifdef O_DIRECT_SUPPORT 56 # ifndef O_DIRECT 57 # error O_DIRECT support configured but O_DIRECT definition not found in headers 58 # endif 59 #endif 60 61 static DM_LIST_INIT(_open_devices); 62 63 /*----------------------------------------------------------------- 64 * The standard io loop that keeps submitting an io until it's 65 * all gone. 66 *---------------------------------------------------------------*/ 67 static int _io(struct device_area *where, void *buffer, int should_write) 68 { 69 int fd = dev_fd(where->dev); 70 ssize_t n = 0; 71 size_t total = 0; 72 73 if (fd < 0) { 74 log_error("Attempt to read an unopened device (%s).", 75 dev_name(where->dev)); 76 return 0; 77 } 78 79 /* 80 * Skip all writes in test mode. 81 */ 82 if (should_write && test_mode()) 83 return 1; 84 85 if (where->size > SSIZE_MAX) { 86 log_error("Read size too large: %" PRIu64, where->size); 87 return 0; 88 } 89 90 if (lseek(fd, (off_t) where->start, SEEK_SET) < 0) { 91 log_error("%s: lseek %" PRIu64 " failed: %s", 92 dev_name(where->dev), (uint64_t) where->start, 93 strerror(errno)); 94 return 0; 95 } 96 97 while (total < (size_t) where->size) { 98 do 99 n = should_write ? 100 write(fd, buffer, (size_t) where->size - total) : 101 read(fd, buffer, (size_t) where->size - total); 102 while ((n < 0) && ((errno == EINTR) || (errno == EAGAIN))); 103 104 if (n < 0) 105 log_error("%s: %s failed after %" PRIu64 " of %" PRIu64 106 " at %" PRIu64 ": %s", dev_name(where->dev), 107 should_write ? "write" : "read", 108 (uint64_t) total, 109 (uint64_t) where->size, 110 (uint64_t) where->start, strerror(errno)); 111 112 if (n <= 0) 113 break; 114 115 total += n; 116 buffer += n; 117 } 118 119 return (total == (size_t) where->size); 120 } 121 122 /*----------------------------------------------------------------- 123 * LVM2 uses O_DIRECT when performing metadata io, which requires 124 * block size aligned accesses. If any io is not aligned we have 125 * to perform the io via a bounce buffer, obviously this is quite 126 * inefficient. 127 *---------------------------------------------------------------*/ 128 129 /* 130 * Get the sector size from an _open_ device. 131 */ 132 static int _get_block_size(struct device *dev, unsigned int *size) 133 { 134 const char *name = dev_name(dev); 135 #ifdef __NetBSD__ 136 struct disklabel lab; 137 prop_dictionary_t disk_dict, geom_dict; 138 uint32_t secsize; 139 #endif 140 141 if ((dev->block_size == -1)) { 142 #ifdef __NetBSD__ 143 if (prop_dictionary_recv_ioctl(dev_fd(dev), DIOCGDISKINFO, &disk_dict)) { 144 if (ioctl(dev_fd(dev), DIOCGDINFO, &lab) < 0) { 145 dev->block_size = DEV_BSIZE; 146 } else 147 dev->block_size = lab.d_secsize; 148 } else { 149 geom_dict = prop_dictionary_get(disk_dict, "geometry"); 150 prop_dictionary_get_uint32(geom_dict, "sector-size", &secsize); 151 dev->block_size = secsize; 152 } 153 #else 154 if (ioctl(dev_fd(dev), BLKBSZGET, &dev->block_size) < 0) { 155 log_sys_error("ioctl BLKBSZGET", name); 156 return 0; 157 } 158 #endif 159 log_debug("%s: block size is %u bytes", name, dev->block_size); 160 } 161 162 *size = (unsigned int) dev->block_size; 163 164 return 1; 165 } 166 167 /* 168 * Widens a region to be an aligned region. 169 */ 170 static void _widen_region(unsigned int block_size, struct device_area *region, 171 struct device_area *result) 172 { 173 uint64_t mask = block_size - 1, delta; 174 memcpy(result, region, sizeof(*result)); 175 176 /* adjust the start */ 177 delta = result->start & mask; 178 if (delta) { 179 result->start -= delta; 180 result->size += delta; 181 } 182 183 /* adjust the end */ 184 delta = (result->start + result->size) & mask; 185 if (delta) 186 result->size += block_size - delta; 187 } 188 189 static int _aligned_io(struct device_area *where, void *buffer, 190 int should_write) 191 { 192 void *bounce; 193 unsigned int block_size = 0; 194 uintptr_t mask; 195 struct device_area widened; 196 197 if (!(where->dev->flags & DEV_REGULAR) && 198 !_get_block_size(where->dev, &block_size)) 199 return_0; 200 201 if (!block_size) 202 block_size = lvm_getpagesize(); 203 204 _widen_region(block_size, where, &widened); 205 206 /* Do we need to use a bounce buffer? */ 207 mask = block_size - 1; 208 if (!memcmp(where, &widened, sizeof(widened)) && 209 !((uintptr_t) buffer & mask)) 210 return _io(where, buffer, should_write); 211 212 /* Allocate a bounce buffer with an extra block */ 213 if (!(bounce = alloca((size_t) widened.size + block_size))) { 214 log_error("Bounce buffer alloca failed"); 215 return 0; 216 } 217 218 /* 219 * Realign start of bounce buffer (using the extra sector) 220 */ 221 if (((uintptr_t) bounce) & mask) 222 bounce = (void *) ((((uintptr_t) bounce) + mask) & ~mask); 223 224 /* channel the io through the bounce buffer */ 225 if (!_io(&widened, bounce, 0)) { 226 if (!should_write) 227 return_0; 228 /* FIXME pre-extend the file */ 229 memset(bounce, '\n', widened.size); 230 } 231 232 if (should_write) { 233 memcpy(bounce + (where->start - widened.start), buffer, 234 (size_t) where->size); 235 236 /* ... then we write */ 237 return _io(&widened, bounce, 1); 238 } 239 240 memcpy(buffer, bounce + (where->start - widened.start), 241 (size_t) where->size); 242 243 return 1; 244 } 245 246 static int _dev_get_size_file(const struct device *dev, uint64_t *size) 247 { 248 const char *name = dev_name(dev); 249 struct stat info; 250 251 if (stat(name, &info)) { 252 log_sys_error("stat", name); 253 return 0; 254 } 255 256 *size = info.st_size; 257 *size >>= SECTOR_SHIFT; /* Convert to sectors */ 258 259 log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size); 260 261 return 1; 262 } 263 264 static int _dev_get_size_dev(const struct device *dev, uint64_t *size) 265 { 266 int fd; 267 const char *name = dev_name(dev); 268 #ifdef __NetBSD__ 269 struct disklabel lab; 270 struct dkwedge_info dkw; 271 struct stat stat; 272 #endif 273 274 if ((fd = open(name, O_RDONLY)) < 0) { 275 #ifndef __NetBSD__ 276 log_sys_error("open", name); 277 #endif 278 return 0; 279 } 280 281 #ifdef __NetBSD__ 282 /* Get info about partition/wedge */ 283 if (ioctl(fd, DIOCGWEDGEINFO, &dkw) == -1) { 284 if (ioctl(fd, DIOCGDINFO, &lab) == -1) { 285 log_debug("Please implement DIOCGWEDGEINFO or " 286 "DIOCGDINFO for disk device %s", name); 287 close(fd); 288 return 0; 289 } else { 290 if (fstat(fd, &stat) < 0) 291 log_debug("fstat on device %s failure", name); 292 293 *size = lab.d_partitions[DISKPART(stat.st_rdev)].p_size; 294 } 295 } else 296 *size = dkw.dkw_size; 297 #else 298 if (ioctl(fd, BLKGETSIZE64, size) < 0) { 299 log_sys_error("ioctl BLKGETSIZE64", name); 300 if (close(fd)) 301 log_sys_error("close", name); 302 return 0; 303 } 304 305 *size >>= BLKSIZE_SHIFT; /* Convert to sectors */ 306 #endif 307 if (close(fd)) 308 log_sys_error("close", name); 309 310 log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size); 311 312 return 1; 313 } 314 315 static int _dev_read_ahead_dev(struct device *dev, uint32_t *read_ahead) 316 { 317 #ifdef linux 318 long read_ahead_long; 319 320 if (dev->read_ahead != -1) { 321 *read_ahead = (uint32_t) dev->read_ahead; 322 return 1; 323 } 324 325 if (!dev_open(dev)) 326 return_0; 327 328 if (ioctl(dev->fd, BLKRAGET, &read_ahead_long) < 0) { 329 log_sys_error("ioctl BLKRAGET", dev_name(dev)); 330 if (!dev_close(dev)) 331 stack; 332 return 0; 333 } 334 335 if (!dev_close(dev)) 336 stack; 337 338 *read_ahead = (uint32_t) read_ahead_long; 339 dev->read_ahead = read_ahead_long; 340 341 log_very_verbose("%s: read_ahead is %u sectors", 342 dev_name(dev), *read_ahead); 343 #endif 344 return 1; 345 } 346 347 /*----------------------------------------------------------------- 348 * Public functions 349 *---------------------------------------------------------------*/ 350 351 int dev_get_size(const struct device *dev, uint64_t *size) 352 { 353 if (!dev) 354 return 0; 355 356 if ((dev->flags & DEV_REGULAR)) 357 return _dev_get_size_file(dev, size); 358 else 359 return _dev_get_size_dev(dev, size); 360 } 361 362 int dev_get_read_ahead(struct device *dev, uint32_t *read_ahead) 363 { 364 if (!dev) 365 return 0; 366 367 if (dev->flags & DEV_REGULAR) { 368 *read_ahead = 0; 369 return 1; 370 } 371 372 return _dev_read_ahead_dev(dev, read_ahead); 373 } 374 375 /* FIXME Unused 376 int dev_get_sectsize(struct device *dev, uint32_t *size) 377 { 378 int fd; 379 int s; 380 const char *name = dev_name(dev); 381 382 if ((fd = open(name, O_RDONLY)) < 0) { 383 log_sys_error("open", name); 384 return 0; 385 } 386 387 if (ioctl(fd, BLKSSZGET, &s) < 0) { 388 log_sys_error("ioctl BLKSSZGET", name); 389 if (close(fd)) 390 log_sys_error("close", name); 391 return 0; 392 } 393 394 if (close(fd)) 395 log_sys_error("close", name); 396 397 *size = (uint32_t) s; 398 399 log_very_verbose("%s: sector size is %" PRIu32 " bytes", name, *size); 400 401 return 1; 402 } 403 */ 404 405 void dev_flush(struct device *dev) 406 { 407 #ifdef __linux__ 408 if (!(dev->flags & DEV_REGULAR) && ioctl(dev->fd, BLKFLSBUF, 0) >= 0) 409 return; 410 #endif 411 412 if (fsync(dev->fd) >= 0) 413 return; 414 415 sync(); 416 } 417 418 int dev_open_flags(struct device *dev, int flags, int direct, int quiet) 419 { 420 struct stat buf; 421 const char *name; 422 int need_excl = 0, need_rw = 0; 423 424 if ((flags & O_ACCMODE) == O_RDWR) 425 need_rw = 1; 426 427 if ((flags & O_EXCL)) 428 need_excl = 1; 429 430 if (dev->fd >= 0) { 431 if (((dev->flags & DEV_OPENED_RW) || !need_rw) && 432 ((dev->flags & DEV_OPENED_EXCL) || !need_excl)) { 433 dev->open_count++; 434 return 1; 435 } 436 437 if (dev->open_count && !need_excl) { 438 /* FIXME Ensure we never get here */ 439 log_debug("WARNING: %s already opened read-only", 440 dev_name(dev)); 441 dev->open_count++; 442 } 443 444 dev_close_immediate(dev); 445 } 446 447 if (memlock()) 448 log_error("WARNING: dev_open(%s) called while suspended", 449 dev_name(dev)); 450 451 if (dev->flags & DEV_REGULAR) 452 name = dev_name(dev); 453 else if (!(name = dev_name_confirmed(dev, quiet))) 454 return_0; 455 456 if (!(dev->flags & DEV_REGULAR)) { 457 if (stat(name, &buf) < 0) { 458 log_sys_error("%s: stat failed", name); 459 return 0; 460 } 461 if (buf.st_rdev != dev->dev) { 462 log_error("%s: device changed", name); 463 return 0; 464 } 465 } 466 467 #ifdef O_DIRECT_SUPPORT 468 if (direct) { 469 if (!(dev->flags & DEV_O_DIRECT_TESTED)) 470 dev->flags |= DEV_O_DIRECT; 471 472 if ((dev->flags & DEV_O_DIRECT)) 473 flags |= O_DIRECT; 474 } 475 #endif 476 477 #ifdef O_NOATIME 478 /* Don't update atime on device inodes */ 479 if (!(dev->flags & DEV_REGULAR)) 480 flags |= O_NOATIME; 481 #endif 482 483 if ((dev->fd = open(name, flags, 0777)) < 0) { 484 #ifdef O_DIRECT_SUPPORT 485 if (direct && !(dev->flags & DEV_O_DIRECT_TESTED)) { 486 flags &= ~O_DIRECT; 487 if ((dev->fd = open(name, flags, 0777)) >= 0) { 488 dev->flags &= ~DEV_O_DIRECT; 489 log_debug("%s: Not using O_DIRECT", name); 490 goto opened; 491 } 492 } 493 #endif 494 if (quiet) 495 log_sys_debug("open", name); 496 else 497 log_sys_error("open", name); 498 499 return 0; 500 } 501 502 #ifdef O_DIRECT_SUPPORT 503 opened: 504 if (direct) 505 dev->flags |= DEV_O_DIRECT_TESTED; 506 #endif 507 dev->open_count++; 508 dev->flags &= ~DEV_ACCESSED_W; 509 510 if (need_rw) 511 dev->flags |= DEV_OPENED_RW; 512 else 513 dev->flags &= ~DEV_OPENED_RW; 514 515 if (need_excl) 516 dev->flags |= DEV_OPENED_EXCL; 517 else 518 dev->flags &= ~DEV_OPENED_EXCL; 519 520 if (!(dev->flags & DEV_REGULAR) && 521 ((fstat(dev->fd, &buf) < 0) || (buf.st_rdev != dev->dev))) { 522 log_error("%s: fstat failed: Has device name changed?", name); 523 dev_close_immediate(dev); 524 return 0; 525 } 526 527 #ifndef O_DIRECT_SUPPORT 528 if (!(dev->flags & DEV_REGULAR)) 529 dev_flush(dev); 530 #endif 531 532 if ((flags & O_CREAT) && !(flags & O_TRUNC)) 533 dev->end = lseek(dev->fd, (off_t) 0, SEEK_END); 534 535 dm_list_add(&_open_devices, &dev->open_list); 536 537 log_debug("Opened %s %s%s%s", dev_name(dev), 538 dev->flags & DEV_OPENED_RW ? "RW" : "RO", 539 dev->flags & DEV_OPENED_EXCL ? " O_EXCL" : "", 540 dev->flags & DEV_O_DIRECT ? " O_DIRECT" : ""); 541 542 return 1; 543 } 544 545 int dev_open_quiet(struct device *dev) 546 { 547 int flags; 548 549 flags = vg_write_lock_held() ? O_RDWR : O_RDONLY; 550 551 return dev_open_flags(dev, flags, 1, 1); 552 } 553 554 int dev_open(struct device *dev) 555 { 556 int flags; 557 558 flags = vg_write_lock_held() ? O_RDWR : O_RDONLY; 559 560 return dev_open_flags(dev, flags, 1, 0); 561 } 562 563 int dev_test_excl(struct device *dev) 564 { 565 int flags; 566 int r; 567 568 flags = vg_write_lock_held() ? O_RDWR : O_RDONLY; 569 flags |= O_EXCL; 570 571 r = dev_open_flags(dev, flags, 1, 1); 572 if (r) 573 dev_close_immediate(dev); 574 575 return r; 576 } 577 578 static void _close(struct device *dev) 579 { 580 if (close(dev->fd)) 581 log_sys_error("close", dev_name(dev)); 582 dev->fd = -1; 583 dev->block_size = -1; 584 dm_list_del(&dev->open_list); 585 586 log_debug("Closed %s", dev_name(dev)); 587 588 if (dev->flags & DEV_ALLOCED) { 589 dm_free((void *) dm_list_item(dev->aliases.n, struct str_list)-> 590 str); 591 dm_free(dev->aliases.n); 592 dm_free(dev); 593 } 594 } 595 596 static int _dev_close(struct device *dev, int immediate) 597 { 598 struct lvmcache_info *info; 599 600 if (dev->fd < 0) { 601 log_error("Attempt to close device '%s' " 602 "which is not open.", dev_name(dev)); 603 return 0; 604 } 605 606 #ifndef O_DIRECT_SUPPORT 607 if (dev->flags & DEV_ACCESSED_W) 608 dev_flush(dev); 609 #endif 610 611 if (dev->open_count > 0) 612 dev->open_count--; 613 614 if (immediate && dev->open_count) 615 log_debug("%s: Immediate close attempt while still referenced", 616 dev_name(dev)); 617 618 /* Close unless device is known to belong to a locked VG */ 619 if (immediate || 620 (dev->open_count < 1 && 621 (!(info = info_from_pvid(dev->pvid, 0)) || 622 !info->vginfo || 623 !vgname_is_locked(info->vginfo->vgname)))) 624 _close(dev); 625 626 return 1; 627 } 628 629 int dev_close(struct device *dev) 630 { 631 return _dev_close(dev, 0); 632 } 633 634 int dev_close_immediate(struct device *dev) 635 { 636 return _dev_close(dev, 1); 637 } 638 639 void dev_close_all(void) 640 { 641 struct dm_list *doh, *doht; 642 struct device *dev; 643 644 dm_list_iterate_safe(doh, doht, &_open_devices) { 645 dev = dm_list_struct_base(doh, struct device, open_list); 646 if (dev->open_count < 1) 647 _close(dev); 648 } 649 } 650 651 int dev_read(struct device *dev, uint64_t offset, size_t len, void *buffer) 652 { 653 struct device_area where; 654 655 if (!dev->open_count) 656 return_0; 657 658 where.dev = dev; 659 where.start = offset; 660 where.size = len; 661 662 return _aligned_io(&where, buffer, 0); 663 } 664 665 /* 666 * Read from 'dev' into 'buf', possibly in 2 distinct regions, denoted 667 * by (offset,len) and (offset2,len2). Thus, the total size of 668 * 'buf' should be len+len2. 669 */ 670 int dev_read_circular(struct device *dev, uint64_t offset, size_t len, 671 uint64_t offset2, size_t len2, void *buf) 672 { 673 if (!dev_read(dev, offset, len, buf)) { 674 log_error("Read from %s failed", dev_name(dev)); 675 return 0; 676 } 677 678 /* 679 * The second region is optional, and allows for 680 * a circular buffer on the device. 681 */ 682 if (!len2) 683 return 1; 684 685 if (!dev_read(dev, offset2, len2, buf + len)) { 686 log_error("Circular read from %s failed", 687 dev_name(dev)); 688 return 0; 689 } 690 691 return 1; 692 } 693 694 /* FIXME If O_DIRECT can't extend file, dev_extend first; dev_truncate after. 695 * But fails if concurrent processes writing 696 */ 697 698 /* FIXME pre-extend the file */ 699 int dev_append(struct device *dev, size_t len, void *buffer) 700 { 701 int r; 702 703 if (!dev->open_count) 704 return_0; 705 706 r = dev_write(dev, dev->end, len, buffer); 707 dev->end += (uint64_t) len; 708 709 #ifndef O_DIRECT_SUPPORT 710 dev_flush(dev); 711 #endif 712 return r; 713 } 714 715 int dev_write(struct device *dev, uint64_t offset, size_t len, void *buffer) 716 { 717 struct device_area where; 718 719 if (!dev->open_count) 720 return_0; 721 722 where.dev = dev; 723 where.start = offset; 724 where.size = len; 725 726 dev->flags |= DEV_ACCESSED_W; 727 728 return _aligned_io(&where, buffer, 1); 729 } 730 731 int dev_set(struct device *dev, uint64_t offset, size_t len, int value) 732 { 733 size_t s; 734 char buffer[4096] __attribute((aligned(8))); 735 736 if (!dev_open(dev)) 737 return_0; 738 739 if ((offset % SECTOR_SIZE) || (len % SECTOR_SIZE)) 740 log_debug("Wiping %s at %" PRIu64 " length %" PRIsize_t, 741 dev_name(dev), offset, len); 742 else 743 log_debug("Wiping %s at sector %" PRIu64 " length %" PRIsize_t 744 " sectors", dev_name(dev), offset >> SECTOR_SHIFT, 745 len >> SECTOR_SHIFT); 746 747 memset(buffer, value, sizeof(buffer)); 748 while (1) { 749 s = len > sizeof(buffer) ? sizeof(buffer) : len; 750 if (!dev_write(dev, offset, s, buffer)) 751 break; 752 753 len -= s; 754 if (!len) 755 break; 756 757 offset += s; 758 } 759 760 dev->flags |= DEV_ACCESSED_W; 761 762 if (!dev_close(dev)) 763 stack; 764 765 return (len == 0); 766 } 767