/* $NetBSD: dev-io.c,v 1.6 2009/12/02 01:53:25 haad Exp $ */

/*
 * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
 *
 * This file is part of LVM2.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU Lesser General Public License v.2.1.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include "lib.h"
#include "lvm-types.h"
#include "device.h"
#include "metadata.h"
#include "lvmcache.h"
#include "memlock.h"
#include "locking.h"

#include <limits.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#ifdef linux
#  define u64 uint64_t		/* Missing without __KERNEL__ */
#  undef WNOHANG		/* Avoid redefinition */
#  undef WUNTRACED		/* Avoid redefinition */
#  include <linux/fs.h>		/* For block ioctl definitions */
#  define BLKSIZE_SHIFT SECTOR_SHIFT
#  ifndef BLKGETSIZE64		/* fs.h out-of-date */
#    define BLKGETSIZE64 _IOR(0x12, 114, size_t)
#  endif /* BLKGETSIZE64 */
#elif __NetBSD__
#  include <sys/disk.h>
#  include <sys/disklabel.h>
#  include <sys/param.h>
#else
#  include <sys/disk.h>
#  define BLKBSZGET DKIOCGETBLOCKSIZE
#  define BLKSSZGET DKIOCGETBLOCKSIZE
#  define BLKGETSIZE64 DKIOCGETBLOCKCOUNT
#  define BLKFLSBUF DKIOCSYNCHRONIZECACHE
#  define BLKSIZE_SHIFT 0
#endif

#ifdef O_DIRECT_SUPPORT
#  ifndef O_DIRECT
#    error O_DIRECT support configured but O_DIRECT definition not found in headers
#  endif
#endif

static DM_LIST_INIT(_open_devices);

/*-----------------------------------------------------------------
 * The standard io loop that keeps submitting an io until it's
 * all gone.
 *---------------------------------------------------------------*/
static int _io(struct device_area *where, void *buffer, int should_write)
{
	int fd = dev_fd(where->dev);
	ssize_t n = 0;
	size_t total = 0;

	if (fd < 0) {
		log_error("Attempt to read an unopened device (%s).",
			  dev_name(where->dev));
		return 0;
	}

	/*
	 * Skip all writes in test mode.
	 */
	if (should_write && test_mode())
		return 1;

	if (where->size > SSIZE_MAX) {
		log_error("Read size too large: %" PRIu64, where->size);
		return 0;
	}

	if (lseek(fd, (off_t) where->start, SEEK_SET) < 0) {
		log_error("%s: lseek %" PRIu64 " failed: %s",
			  dev_name(where->dev), (uint64_t) where->start,
			  strerror(errno));
		return 0;
	}

	while (total < (size_t) where->size) {
		do
			n = should_write ?
			    write(fd, buffer, (size_t) where->size - total) :
			    read(fd, buffer, (size_t) where->size - total);
		while ((n < 0) && ((errno == EINTR) || (errno == EAGAIN)));

		if (n < 0)
			log_error("%s: %s failed after %" PRIu64 " of %" PRIu64
				  " at %" PRIu64 ": %s", dev_name(where->dev),
				  should_write ? "write" : "read",
				  (uint64_t) total,
				  (uint64_t) where->size,
				  (uint64_t) where->start, strerror(errno));

		if (n <= 0)
			break;

		total += n;
		buffer += n;
	}

	return (total == (size_t) where->size);
}

/*-----------------------------------------------------------------
 * LVM2 uses O_DIRECT when performing metadata io, which requires
 * block size aligned accesses.  If any io is not aligned we have
 * to perform the io via a bounce buffer, obviously this is quite
 * inefficient.
 *---------------------------------------------------------------*/

/*
 * Get the sector size from an _open_ device.
 */
static int _get_block_size(struct device *dev, unsigned int *size)
{
	const char *name = dev_name(dev);
#ifdef __NetBSD__
	struct disklabel lab;
#endif

	if (dev->block_size == -1) {
#ifdef __NetBSD__
		if (ioctl(dev_fd(dev), DIOCGDINFO, &lab) < 0) {
			dev->block_size = DEV_BSIZE;
		} else
			dev->block_size = lab.d_secsize;
#else
		if (ioctl(dev_fd(dev), BLKBSZGET, &dev->block_size) < 0) {
			log_sys_error("ioctl BLKBSZGET", name);
			return 0;
		}
#endif
		log_debug("%s: block size is %u bytes", name, dev->block_size);
	}

	*size = (unsigned int) dev->block_size;

	return 1;
}

/*
 * Widens a region to be an aligned region.
 */
static void _widen_region(unsigned int block_size, struct device_area *region,
			  struct device_area *result)
{
	uint64_t mask = block_size - 1, delta;
	memcpy(result, region, sizeof(*result));

	/* adjust the start */
	delta = result->start & mask;
	if (delta) {
		result->start -= delta;
		result->size += delta;
	}

	/* adjust the end */
	delta = (result->start + result->size) & mask;
	if (delta)
		result->size += block_size - delta;
}

static int _aligned_io(struct device_area *where, void *buffer,
		       int should_write)
{
	void *bounce;
	unsigned int block_size = 0;
	uintptr_t mask;
	struct device_area widened;

	if (!(where->dev->flags & DEV_REGULAR) &&
	    !_get_block_size(where->dev, &block_size))
		return_0;

	if (!block_size)
		block_size = lvm_getpagesize();

	_widen_region(block_size, where, &widened);

	/* Do we need to use a bounce buffer? */
	mask = block_size - 1;
	if (!memcmp(where, &widened, sizeof(widened)) &&
	    !((uintptr_t) buffer & mask))
		return _io(where, buffer, should_write);

	/* Allocate a bounce buffer with an extra block */
	if (!(bounce = alloca((size_t) widened.size + block_size))) {
		log_error("Bounce buffer alloca failed");
		return 0;
	}

	/*
	 * Realign start of bounce buffer (using the extra sector)
	 */
	if (((uintptr_t) bounce) & mask)
		bounce = (void *) ((((uintptr_t) bounce) + mask) & ~mask);

	/* channel the io through the bounce buffer */
	if (!_io(&widened, bounce, 0)) {
		if (!should_write)
			return_0;
		/* FIXME pre-extend the file */
		memset(bounce, '\n', widened.size);
	}

	if (should_write) {
		memcpy(bounce + (where->start - widened.start), buffer,
		       (size_t) where->size);

		/* ... then we write */
		return _io(&widened, bounce, 1);
	}

	memcpy(buffer, bounce + (where->start - widened.start),
	       (size_t) where->size);

	return 1;
}

static int _dev_get_size_file(const struct device *dev, uint64_t *size)
{
	const char *name = dev_name(dev);
	struct stat info;

	if (stat(name, &info)) {
		log_sys_error("stat", name);
		return 0;
	}

	*size = info.st_size;
	*size >>= SECTOR_SHIFT;	/* Convert to sectors */

	log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size);

	return 1;
}

static int _dev_get_size_dev(const struct device *dev, uint64_t *size)
{
	int fd;
	const char *name = dev_name(dev);
#ifdef __NetBSD__
	off_t bytes;
	struct disklabel lab;
	struct dkwedge_info dkw;
#endif

	if ((fd = open(name, O_RDONLY)) < 0) {
#ifndef __NetBSD__
		log_sys_error("open", name);
#endif
		return 0;
	}

#ifdef __NetBSD__
	if ((bytes = lseek(fd, 0, SEEK_END)) < 0) {
		log_sys_error("lseek SEEK_END", name);
		close(fd);
		return 0;
	}
	*size = (uint64_t) bytes;

	if (ioctl(fd, DIOCGDINFO, &lab) < 0) {
		if (ioctl(fd, DIOCGWEDGEINFO, &dkw) < 0) {
			log_debug("%s: ioctl DIOCGWEDGEINFO failed", name);
			close(fd);
			return 0;
		} else if (dkw.dkw_size)
			*size = dkw.dkw_size;
	} else if (lab.d_secsize)
		*size /= lab.d_secsize;
#else
	if (ioctl(fd, BLKGETSIZE64, size) < 0) {
		log_sys_error("ioctl BLKGETSIZE64", name);
		if (close(fd))
			log_sys_error("close", name);
		return 0;
	}

	*size >>= BLKSIZE_SHIFT;	/* Convert to sectors */
#endif
	if (close(fd))
		log_sys_error("close", name);

	log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size);

	return 1;
}

static int _dev_read_ahead_dev(struct device *dev, uint32_t *read_ahead)
{
#ifdef linux
	long read_ahead_long;

	if (dev->read_ahead != -1) {
		*read_ahead = (uint32_t) dev->read_ahead;
		return 1;
	}

	if (!dev_open(dev))
		return_0;

	if (ioctl(dev->fd, BLKRAGET, &read_ahead_long) < 0) {
		log_sys_error("ioctl BLKRAGET", dev_name(dev));
		if (!dev_close(dev))
			stack;
		return 0;
	}

	if (!dev_close(dev))
		stack;

	*read_ahead = (uint32_t) read_ahead_long;
	dev->read_ahead = read_ahead_long;

	log_very_verbose("%s: read_ahead is %u sectors",
			 dev_name(dev), *read_ahead);
#endif
	return 1;
}

/*-----------------------------------------------------------------
 * Public functions
 *---------------------------------------------------------------*/

int dev_get_size(const struct device *dev, uint64_t *size)
{
	if (!dev)
		return 0;

	if ((dev->flags & DEV_REGULAR))
		return _dev_get_size_file(dev, size);
	else
		return _dev_get_size_dev(dev, size);
}

int dev_get_read_ahead(struct device *dev, uint32_t *read_ahead)
{
	if (!dev)
		return 0;

	if (dev->flags & DEV_REGULAR) {
		*read_ahead = 0;
		return 1;
	}

	return _dev_read_ahead_dev(dev, read_ahead);
}

/* FIXME Unused
int dev_get_sectsize(struct device *dev, uint32_t *size)
{
	int fd;
	int s;
	const char *name = dev_name(dev);

	if ((fd = open(name, O_RDONLY)) < 0) {
		log_sys_error("open", name);
		return 0;
	}

	if (ioctl(fd, BLKSSZGET, &s) < 0) {
		log_sys_error("ioctl BLKSSZGET", name);
		if (close(fd))
			log_sys_error("close", name);
		return 0;
	}

	if (close(fd))
		log_sys_error("close", name);

	*size = (uint32_t) s;

	log_very_verbose("%s: sector size is %" PRIu32 " bytes", name, *size);

	return 1;
}
*/

void dev_flush(struct device *dev)
{
#ifdef __linux__
	if (!(dev->flags & DEV_REGULAR) && ioctl(dev->fd, BLKFLSBUF, 0) >= 0)
		return;
#endif

	if (fsync(dev->fd) >= 0)
		return;

	sync();
}
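
/*
 * Typical calling sequence for the open/read/close functions below
 * (a sketch only; error handling abbreviated):
 *
 *	char buf[4096];
 *
 *	if (!dev_open(dev))
 *		return_0;
 *	if (!dev_read(dev, 0, sizeof(buf), buf))
 *		stack;
 *	if (!dev_close(dev))
 *		stack;
 */
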
int dev_open_flags(struct device *dev, int flags, int direct, int quiet)
{
	struct stat buf;
	const char *name;
	int need_excl = 0, need_rw = 0;

	if ((flags & O_ACCMODE) == O_RDWR)
		need_rw = 1;

	if ((flags & O_EXCL))
		need_excl = 1;

	if (dev->fd >= 0) {
		if (((dev->flags & DEV_OPENED_RW) || !need_rw) &&
		    ((dev->flags & DEV_OPENED_EXCL) || !need_excl)) {
			dev->open_count++;
			return 1;
		}

		if (dev->open_count && !need_excl) {
			/* FIXME Ensure we never get here */
			log_debug("WARNING: %s already opened read-only",
				  dev_name(dev));
			dev->open_count++;
		}

		dev_close_immediate(dev);
	}

	if (memlock())
		log_error("WARNING: dev_open(%s) called while suspended",
			  dev_name(dev));

	if (dev->flags & DEV_REGULAR)
		name = dev_name(dev);
	else if (!(name = dev_name_confirmed(dev, quiet)))
		return_0;

	if (!(dev->flags & DEV_REGULAR)) {
		if (stat(name, &buf) < 0) {
			log_sys_error("%s: stat failed", name);
			return 0;
		}
		if (buf.st_rdev != dev->dev) {
			log_error("%s: device changed", name);
			return 0;
		}
	}

#ifdef O_DIRECT_SUPPORT
	if (direct) {
		if (!(dev->flags & DEV_O_DIRECT_TESTED))
			dev->flags |= DEV_O_DIRECT;

		if ((dev->flags & DEV_O_DIRECT))
			flags |= O_DIRECT;
	}
#endif

#ifdef O_NOATIME
	/* Don't update atime on device inodes */
	if (!(dev->flags & DEV_REGULAR))
		flags |= O_NOATIME;
#endif

	if ((dev->fd = open(name, flags, 0777)) < 0) {
#ifdef O_DIRECT_SUPPORT
		if (direct && !(dev->flags & DEV_O_DIRECT_TESTED)) {
			flags &= ~O_DIRECT;
			if ((dev->fd = open(name, flags, 0777)) >= 0) {
				dev->flags &= ~DEV_O_DIRECT;
				log_debug("%s: Not using O_DIRECT", name);
				goto opened;
			}
		}
#endif
		if (quiet)
			log_sys_debug("open", name);
		else
			log_sys_error("open", name);

		return 0;
	}

#ifdef O_DIRECT_SUPPORT
      opened:
	if (direct)
		dev->flags |= DEV_O_DIRECT_TESTED;
#endif
	dev->open_count++;
	dev->flags &= ~DEV_ACCESSED_W;

	if (need_rw)
		dev->flags |= DEV_OPENED_RW;
	else
		dev->flags &= ~DEV_OPENED_RW;

	if (need_excl)
		dev->flags |= DEV_OPENED_EXCL;
	else
		dev->flags &= ~DEV_OPENED_EXCL;

	if (!(dev->flags & DEV_REGULAR) &&
	    ((fstat(dev->fd, &buf) < 0) || (buf.st_rdev != dev->dev))) {
		log_error("%s: fstat failed: Has device name changed?", name);
		dev_close_immediate(dev);
		return 0;
	}

#ifndef O_DIRECT_SUPPORT
	if (!(dev->flags & DEV_REGULAR))
		dev_flush(dev);
#endif

	if ((flags & O_CREAT) && !(flags & O_TRUNC))
		dev->end = lseek(dev->fd, (off_t) 0, SEEK_END);

	dm_list_add(&_open_devices, &dev->open_list);

	log_debug("Opened %s %s%s%s", dev_name(dev),
		  dev->flags & DEV_OPENED_RW ? "RW" : "RO",
		  dev->flags & DEV_OPENED_EXCL ? " O_EXCL" : "",
		  dev->flags & DEV_O_DIRECT ? " O_DIRECT" : "");

	return 1;
}

int dev_open_quiet(struct device *dev)
{
	int flags;

	flags = vg_write_lock_held() ? O_RDWR : O_RDONLY;

	return dev_open_flags(dev, flags, 1, 1);
}

int dev_open(struct device *dev)
{
	int flags;

	flags = vg_write_lock_held() ? O_RDWR : O_RDONLY;

	return dev_open_flags(dev, flags, 1, 0);
}

int dev_test_excl(struct device *dev)
{
	int flags;
	int r;

	flags = vg_write_lock_held() ? O_RDWR : O_RDONLY;
	flags |= O_EXCL;

	r = dev_open_flags(dev, flags, 1, 1);
	if (r)
		dev_close_immediate(dev);

	return r;
}

static void _close(struct device *dev)
{
	if (close(dev->fd))
		log_sys_error("close", dev_name(dev));
	dev->fd = -1;
	dev->block_size = -1;
	dm_list_del(&dev->open_list);

	log_debug("Closed %s", dev_name(dev));

	if (dev->flags & DEV_ALLOCED) {
		dm_free((void *) dm_list_item(dev->aliases.n,
					      struct str_list)->str);
		dm_free(dev->aliases.n);
		dm_free(dev);
	}
}

static int _dev_close(struct device *dev, int immediate)
{
	struct lvmcache_info *info;

	if (dev->fd < 0) {
		log_error("Attempt to close device '%s' "
			  "which is not open.", dev_name(dev));
		return 0;
	}

#ifndef O_DIRECT_SUPPORT
	if (dev->flags & DEV_ACCESSED_W)
		dev_flush(dev);
#endif

	if (dev->open_count > 0)
		dev->open_count--;

	if (immediate && dev->open_count)
		log_debug("%s: Immediate close attempt while still referenced",
			  dev_name(dev));

	/* Close unless device is known to belong to a locked VG */
	if (immediate ||
	    (dev->open_count < 1 &&
	     (!(info = info_from_pvid(dev->pvid, 0)) ||
	      !info->vginfo ||
	      !vgname_is_locked(info->vginfo->vgname))))
		_close(dev);

	return 1;
}

int dev_close(struct device *dev)
{
	return _dev_close(dev, 0);
}

int dev_close_immediate(struct device *dev)
{
	return _dev_close(dev, 1);
}

void dev_close_all(void)
{
	struct dm_list *doh, *doht;
	struct device *dev;

	dm_list_iterate_safe(doh, doht, &_open_devices) {
		dev = dm_list_struct_base(doh, struct device, open_list);
		if (dev->open_count < 1)
			_close(dev);
	}
}

int dev_read(struct device *dev, uint64_t offset, size_t len, void *buffer)
{
	struct device_area where;

	if (!dev->open_count)
		return_0;

	where.dev = dev;
	where.start = offset;
	where.size = len;

	return _aligned_io(&where, buffer, 0);
}

/*
 * Read from 'dev' into 'buf', possibly in 2 distinct regions, denoted
 * by (offset,len) and (offset2,len2).  Thus, the total size of
 * 'buf' should be len+len2.
 */
int dev_read_circular(struct device *dev, uint64_t offset, size_t len,
		      uint64_t offset2, size_t len2, void *buf)
{
	if (!dev_read(dev, offset, len, buf)) {
		log_error("Read from %s failed", dev_name(dev));
		return 0;
	}

	/*
	 * The second region is optional, and allows for
	 * a circular buffer on the device.
	 */
	if (!len2)
		return 1;

	if (!dev_read(dev, offset2, len2, buf + len)) {
		log_error("Circular read from %s failed",
			  dev_name(dev));
		return 0;
	}

	return 1;
}

/* FIXME If O_DIRECT can't extend file, dev_extend first; dev_truncate after.
 * But fails if concurrent processes writing
 */

/* FIXME pre-extend the file */
int dev_append(struct device *dev, size_t len, void *buffer)
{
	int r;

	if (!dev->open_count)
		return_0;

	r = dev_write(dev, dev->end, len, buffer);
	dev->end += (uint64_t) len;

#ifndef O_DIRECT_SUPPORT
	dev_flush(dev);
#endif
	return r;
}

int dev_write(struct device *dev, uint64_t offset, size_t len, void *buffer)
{
	struct device_area where;

	if (!dev->open_count)
		return_0;

	where.dev = dev;
	where.start = offset;
	where.size = len;

	dev->flags |= DEV_ACCESSED_W;

	return _aligned_io(&where, buffer, 1);
}

int dev_set(struct device *dev, uint64_t offset, size_t len, int value)
{
	size_t s;
	char buffer[4096] __attribute((aligned(8)));

	if (!dev_open(dev))
		return_0;

	if ((offset % SECTOR_SIZE) || (len % SECTOR_SIZE))
		log_debug("Wiping %s at %" PRIu64 " length %" PRIsize_t,
			  dev_name(dev), offset, len);
	else
		log_debug("Wiping %s at sector %" PRIu64 " length %" PRIsize_t
			  " sectors", dev_name(dev), offset >> SECTOR_SHIFT,
			  len >> SECTOR_SHIFT);

	memset(buffer, value, sizeof(buffer));
	while (1) {
		s = len > sizeof(buffer) ? sizeof(buffer) : len;
		if (!dev_write(dev, offset, s, buffer))
			break;

		len -= s;
		if (!len)
			break;

		offset += s;
	}

	dev->flags |= DEV_ACCESSED_W;

	if (!dev_close(dev))
		stack;

	return (len == 0);
}