1 /* $NetBSD: dev-io.c,v 1.4 2009/02/18 12:16:13 haad Exp $ */ 2 3 /* 4 * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved. 5 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 6 * 7 * This file is part of LVM2. 8 * 9 * This copyrighted material is made available to anyone wishing to use, 10 * modify, copy, or redistribute it subject to the terms and conditions 11 * of the GNU Lesser General Public License v.2.1. 12 * 13 * You should have received a copy of the GNU Lesser General Public License 14 * along with this program; if not, write to the Free Software Foundation, 15 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 */ 17 18 #include "lib.h" 19 #include "lvm-types.h" 20 #include "device.h" 21 #include "metadata.h" 22 #include "lvmcache.h" 23 #include "memlock.h" 24 #include "locking.h" 25 26 #include <limits.h> 27 #include <sys/stat.h> 28 #include <fcntl.h> 29 #include <unistd.h> 30 #include <sys/ioctl.h> 31 32 #ifdef linux 33 # define u64 uint64_t /* Missing without __KERNEL__ */ 34 # undef WNOHANG /* Avoid redefinition */ 35 # undef WUNTRACED /* Avoid redefinition */ 36 # include <linux/fs.h> /* For block ioctl definitions */ 37 # define BLKSIZE_SHIFT SECTOR_SHIFT 38 # ifndef BLKGETSIZE64 /* fs.h out-of-date */ 39 # define BLKGETSIZE64 _IOR(0x12, 114, size_t) 40 # endif /* BLKGETSIZE64 */ 41 #elif __NetBSD__ 42 # include <sys/disk.h> 43 # include <sys/disklabel.h> 44 # include <sys/param.h> 45 #else 46 # include <sys/disk.h> 47 # define BLKBSZGET DKIOCGETBLOCKSIZE 48 # define BLKSSZGET DKIOCGETBLOCKSIZE 49 # define BLKGETSIZE64 DKIOCGETBLOCKCOUNT 50 # define BLKFLSBUF DKIOCSYNCHRONIZECACHE 51 # define BLKSIZE_SHIFT 0 52 #endif 53 54 #ifdef O_DIRECT_SUPPORT 55 # ifndef O_DIRECT 56 # error O_DIRECT support configured but O_DIRECT definition not found in headers 57 # endif 58 #endif 59 60 static DM_LIST_INIT(_open_devices); 61 62 /*----------------------------------------------------------------- 63 * The standard io loop that keeps submitting an io until it's 64 * all gone. 65 *---------------------------------------------------------------*/ 66 static int _io(struct device_area *where, void *buffer, int should_write) 67 { 68 int fd = dev_fd(where->dev); 69 ssize_t n = 0; 70 size_t total = 0; 71 72 if (fd < 0) { 73 log_error("Attempt to read an unopened device (%s).", 74 dev_name(where->dev)); 75 return 0; 76 } 77 78 /* 79 * Skip all writes in test mode. 80 */ 81 if (should_write && test_mode()) 82 return 1; 83 84 if (where->size > SSIZE_MAX) { 85 log_error("Read size too large: %" PRIu64, where->size); 86 return 0; 87 } 88 89 if (lseek(fd, (off_t) where->start, SEEK_SET) < 0) { 90 log_error("%s: lseek %" PRIu64 " failed: %s", 91 dev_name(where->dev), (uint64_t) where->start, 92 strerror(errno)); 93 return 0; 94 } 95 96 while (total < (size_t) where->size) { 97 do 98 n = should_write ? 99 write(fd, buffer, (size_t) where->size - total) : 100 read(fd, buffer, (size_t) where->size - total); 101 while ((n < 0) && ((errno == EINTR) || (errno == EAGAIN))); 102 103 if (n < 0) 104 log_error("%s: %s failed after %" PRIu64 " of %" PRIu64 105 " at %" PRIu64 ": %s", dev_name(where->dev), 106 should_write ? "write" : "read", 107 (uint64_t) total, 108 (uint64_t) where->size, 109 (uint64_t) where->start, strerror(errno)); 110 111 if (n <= 0) 112 break; 113 114 total += n; 115 buffer += n; 116 } 117 118 return (total == (size_t) where->size); 119 } 120 121 /*----------------------------------------------------------------- 122 * LVM2 uses O_DIRECT when performing metadata io, which requires 123 * block size aligned accesses. If any io is not aligned we have 124 * to perform the io via a bounce buffer, obviously this is quite 125 * inefficient. 126 *---------------------------------------------------------------*/ 127 128 /* 129 * Get the sector size from an _open_ device. 130 */ 131 static int _get_block_size(struct device *dev, unsigned int *size) 132 { 133 const char *name = dev_name(dev); 134 #ifdef __NetBSD__ 135 struct disklabel lab; 136 #endif 137 138 if ((dev->block_size == -1)) { 139 #ifdef __NetBSD__ 140 if (ioctl(dev_fd(dev), DIOCGDINFO, &lab) < 0) { 141 dev->block_size = DEV_BSIZE; 142 } else 143 dev->block_size = lab.d_secsize; 144 #else 145 if (ioctl(dev_fd(dev), BLKBSZGET, &dev->block_size) < 0) { 146 log_sys_error("ioctl BLKBSZGET", name); 147 return 0; 148 } 149 #endif 150 log_debug("%s: block size is %u bytes", name, dev->block_size); 151 } 152 153 *size = (unsigned int) dev->block_size; 154 155 return 1; 156 } 157 158 /* 159 * Widens a region to be an aligned region. 160 */ 161 static void _widen_region(unsigned int block_size, struct device_area *region, 162 struct device_area *result) 163 { 164 uint64_t mask = block_size - 1, delta; 165 memcpy(result, region, sizeof(*result)); 166 167 /* adjust the start */ 168 delta = result->start & mask; 169 if (delta) { 170 result->start -= delta; 171 result->size += delta; 172 } 173 174 /* adjust the end */ 175 delta = (result->start + result->size) & mask; 176 if (delta) 177 result->size += block_size - delta; 178 } 179 180 static int _aligned_io(struct device_area *where, void *buffer, 181 int should_write) 182 { 183 void *bounce; 184 unsigned int block_size = 0; 185 uintptr_t mask; 186 struct device_area widened; 187 188 if (!(where->dev->flags & DEV_REGULAR) && 189 !_get_block_size(where->dev, &block_size)) 190 return_0; 191 192 if (!block_size) 193 block_size = lvm_getpagesize(); 194 195 _widen_region(block_size, where, &widened); 196 197 /* Do we need to use a bounce buffer? */ 198 mask = block_size - 1; 199 if (!memcmp(where, &widened, sizeof(widened)) && 200 !((uintptr_t) buffer & mask)) 201 return _io(where, buffer, should_write); 202 203 /* Allocate a bounce buffer with an extra block */ 204 if (!(bounce = alloca((size_t) widened.size + block_size))) { 205 log_error("Bounce buffer alloca failed"); 206 return 0; 207 } 208 209 /* 210 * Realign start of bounce buffer (using the extra sector) 211 */ 212 if (((uintptr_t) bounce) & mask) 213 bounce = (void *) ((((uintptr_t) bounce) + mask) & ~mask); 214 215 /* channel the io through the bounce buffer */ 216 if (!_io(&widened, bounce, 0)) { 217 if (!should_write) 218 return_0; 219 /* FIXME pre-extend the file */ 220 memset(bounce, '\n', widened.size); 221 } 222 223 if (should_write) { 224 memcpy(bounce + (where->start - widened.start), buffer, 225 (size_t) where->size); 226 227 /* ... then we write */ 228 return _io(&widened, bounce, 1); 229 } 230 231 memcpy(buffer, bounce + (where->start - widened.start), 232 (size_t) where->size); 233 234 return 1; 235 } 236 237 static int _dev_get_size_file(const struct device *dev, uint64_t *size) 238 { 239 const char *name = dev_name(dev); 240 struct stat info; 241 242 if (stat(name, &info)) { 243 log_sys_error("stat", name); 244 return 0; 245 } 246 247 *size = info.st_size; 248 *size >>= SECTOR_SHIFT; /* Convert to sectors */ 249 250 log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size); 251 252 return 1; 253 } 254 255 static int _dev_get_size_dev(const struct device *dev, uint64_t *size) 256 { 257 int fd; 258 const char *name = dev_name(dev); 259 #ifdef __NetBSD__ 260 struct disklabel lab; 261 struct dkwedge_info dkw; 262 #endif 263 264 if ((fd = open(name, O_RDONLY)) < 0) { 265 #ifndef __NetBSD__ 266 log_sys_error("open", name); 267 #endif 268 return 0; 269 } 270 271 #ifdef __NetBSD__ 272 if ((*size = lseek (fd, 0, SEEK_END)) < 0) { 273 log_sys_error("lseek SEEK_END", name); 274 close(fd); 275 return 0; 276 } 277 278 if (ioctl(fd, DIOCGDINFO, &lab) < 0) { 279 if (ioctl(fd, DIOCGWEDGEINFO, &dkw) < 0) { 280 log_debug("ioctl DIOCGWEDGEINFO", name); 281 close(fd); 282 return 0; 283 } else 284 if (dkw.dkw_size) 285 *size = dkw.dkw_size; 286 } else 287 if (lab.d_secsize) 288 *size /= lab.d_secsize; 289 #else 290 if (ioctl(fd, BLKGETSIZE64, size) < 0) { 291 log_sys_error("ioctl BLKGETSIZE64", name); 292 if (close(fd)) 293 log_sys_error("close", name); 294 return 0; 295 } 296 297 *size >>= BLKSIZE_SHIFT; /* Convert to sectors */ 298 #endif 299 if (close(fd)) 300 log_sys_error("close", name); 301 302 log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size); 303 304 return 1; 305 } 306 307 /*----------------------------------------------------------------- 308 * Public functions 309 *---------------------------------------------------------------*/ 310 311 int dev_get_size(const struct device *dev, uint64_t *size) 312 { 313 if (!dev) 314 return 0; 315 316 if ((dev->flags & DEV_REGULAR)) 317 return _dev_get_size_file(dev, size); 318 else 319 return _dev_get_size_dev(dev, size); 320 } 321 322 /* FIXME Unused 323 int dev_get_sectsize(struct device *dev, uint32_t *size) 324 { 325 int fd; 326 int s; 327 const char *name = dev_name(dev); 328 329 if ((fd = open(name, O_RDONLY)) < 0) { 330 log_sys_error("open", name); 331 return 0; 332 } 333 334 if (ioctl(fd, BLKSSZGET, &s) < 0) { 335 log_sys_error("ioctl BLKSSZGET", name); 336 if (close(fd)) 337 log_sys_error("close", name); 338 return 0; 339 } 340 341 if (close(fd)) 342 log_sys_error("close", name); 343 344 *size = (uint32_t) s; 345 346 log_very_verbose("%s: sector size is %" PRIu32 " bytes", name, *size); 347 348 return 1; 349 } 350 */ 351 352 void dev_flush(struct device *dev) 353 { 354 #ifdef __linux__ 355 if (!(dev->flags & DEV_REGULAR) && ioctl(dev->fd, BLKFLSBUF, 0) >= 0) 356 return; 357 #endif 358 359 if (fsync(dev->fd) >= 0) 360 return; 361 362 sync(); 363 } 364 365 int dev_open_flags(struct device *dev, int flags, int direct, int quiet) 366 { 367 struct stat buf; 368 const char *name; 369 int need_excl = 0, need_rw = 0; 370 371 if ((flags & O_ACCMODE) == O_RDWR) 372 need_rw = 1; 373 374 if ((flags & O_EXCL)) 375 need_excl = 1; 376 377 if (dev->fd >= 0) { 378 if (((dev->flags & DEV_OPENED_RW) || !need_rw) && 379 ((dev->flags & DEV_OPENED_EXCL) || !need_excl)) { 380 dev->open_count++; 381 return 1; 382 } 383 384 if (dev->open_count && !need_excl) { 385 /* FIXME Ensure we never get here */ 386 log_debug("WARNING: %s already opened read-only", 387 dev_name(dev)); 388 dev->open_count++; 389 } 390 391 dev_close_immediate(dev); 392 } 393 394 if (memlock()) 395 log_error("WARNING: dev_open(%s) called while suspended", 396 dev_name(dev)); 397 398 if (dev->flags & DEV_REGULAR) 399 name = dev_name(dev); 400 else if (!(name = dev_name_confirmed(dev, quiet))) 401 return_0; 402 403 if (!(dev->flags & DEV_REGULAR)) { 404 if (stat(name, &buf) < 0) { 405 log_sys_error("%s: stat failed", name); 406 return 0; 407 } 408 if (buf.st_rdev != dev->dev) { 409 log_error("%s: device changed", name); 410 return 0; 411 } 412 } 413 414 #ifdef O_DIRECT_SUPPORT 415 if (direct) { 416 if (!(dev->flags & DEV_O_DIRECT_TESTED)) 417 dev->flags |= DEV_O_DIRECT; 418 419 if ((dev->flags & DEV_O_DIRECT)) 420 flags |= O_DIRECT; 421 } 422 #endif 423 424 #ifdef O_NOATIME 425 /* Don't update atime on device inodes */ 426 if (!(dev->flags & DEV_REGULAR)) 427 flags |= O_NOATIME; 428 #endif 429 430 if ((dev->fd = open(name, flags, 0777)) < 0) { 431 #ifdef O_DIRECT_SUPPORT 432 if (direct && !(dev->flags & DEV_O_DIRECT_TESTED)) { 433 flags &= ~O_DIRECT; 434 if ((dev->fd = open(name, flags, 0777)) >= 0) { 435 dev->flags &= ~DEV_O_DIRECT; 436 log_debug("%s: Not using O_DIRECT", name); 437 goto opened; 438 } 439 } 440 #endif 441 if (quiet) 442 log_sys_debug("open", name); 443 else 444 log_sys_error("open", name); 445 446 return 0; 447 } 448 449 #ifdef O_DIRECT_SUPPORT 450 opened: 451 if (direct) 452 dev->flags |= DEV_O_DIRECT_TESTED; 453 #endif 454 dev->open_count++; 455 dev->flags &= ~DEV_ACCESSED_W; 456 457 if (need_rw) 458 dev->flags |= DEV_OPENED_RW; 459 else 460 dev->flags &= ~DEV_OPENED_RW; 461 462 if (need_excl) 463 dev->flags |= DEV_OPENED_EXCL; 464 else 465 dev->flags &= ~DEV_OPENED_EXCL; 466 467 if (!(dev->flags & DEV_REGULAR) && 468 ((fstat(dev->fd, &buf) < 0) || (buf.st_rdev != dev->dev))) { 469 log_error("%s: fstat failed: Has device name changed?", name); 470 dev_close_immediate(dev); 471 return 0; 472 } 473 474 #ifndef O_DIRECT_SUPPORT 475 if (!(dev->flags & DEV_REGULAR)) 476 dev_flush(dev); 477 #endif 478 479 if ((flags & O_CREAT) && !(flags & O_TRUNC)) 480 dev->end = lseek(dev->fd, (off_t) 0, SEEK_END); 481 482 dm_list_add(&_open_devices, &dev->open_list); 483 484 log_debug("Opened %s %s%s%s", dev_name(dev), 485 dev->flags & DEV_OPENED_RW ? "RW" : "RO", 486 dev->flags & DEV_OPENED_EXCL ? " O_EXCL" : "", 487 dev->flags & DEV_O_DIRECT ? " O_DIRECT" : ""); 488 489 return 1; 490 } 491 492 int dev_open_quiet(struct device *dev) 493 { 494 int flags; 495 496 flags = vg_write_lock_held() ? O_RDWR : O_RDONLY; 497 498 return dev_open_flags(dev, flags, 1, 1); 499 } 500 501 int dev_open(struct device *dev) 502 { 503 int flags; 504 505 flags = vg_write_lock_held() ? O_RDWR : O_RDONLY; 506 507 return dev_open_flags(dev, flags, 1, 0); 508 } 509 510 int dev_test_excl(struct device *dev) 511 { 512 int flags; 513 int r; 514 515 flags = vg_write_lock_held() ? O_RDWR : O_RDONLY; 516 flags |= O_EXCL; 517 518 r = dev_open_flags(dev, flags, 1, 1); 519 if (r) 520 dev_close_immediate(dev); 521 522 return r; 523 } 524 525 static void _close(struct device *dev) 526 { 527 if (close(dev->fd)) 528 log_sys_error("close", dev_name(dev)); 529 dev->fd = -1; 530 dev->block_size = -1; 531 dm_list_del(&dev->open_list); 532 533 log_debug("Closed %s", dev_name(dev)); 534 535 if (dev->flags & DEV_ALLOCED) { 536 dm_free((void *) dm_list_item(dev->aliases.n, struct str_list)-> 537 str); 538 dm_free(dev->aliases.n); 539 dm_free(dev); 540 } 541 } 542 543 static int _dev_close(struct device *dev, int immediate) 544 { 545 struct lvmcache_info *info; 546 547 if (dev->fd < 0) { 548 log_error("Attempt to close device '%s' " 549 "which is not open.", dev_name(dev)); 550 return 0; 551 } 552 553 #ifndef O_DIRECT_SUPPORT 554 if (dev->flags & DEV_ACCESSED_W) 555 dev_flush(dev); 556 #endif 557 558 if (dev->open_count > 0) 559 dev->open_count--; 560 561 if (immediate && dev->open_count) 562 log_debug("%s: Immediate close attempt while still referenced", 563 dev_name(dev)); 564 565 /* Close unless device is known to belong to a locked VG */ 566 if (immediate || 567 (dev->open_count < 1 && 568 (!(info = info_from_pvid(dev->pvid, 0)) || 569 !info->vginfo || 570 !vgname_is_locked(info->vginfo->vgname)))) 571 _close(dev); 572 573 return 1; 574 } 575 576 int dev_close(struct device *dev) 577 { 578 return _dev_close(dev, 0); 579 } 580 581 int dev_close_immediate(struct device *dev) 582 { 583 return _dev_close(dev, 1); 584 } 585 586 void dev_close_all(void) 587 { 588 struct dm_list *doh, *doht; 589 struct device *dev; 590 591 dm_list_iterate_safe(doh, doht, &_open_devices) { 592 dev = dm_list_struct_base(doh, struct device, open_list); 593 if (dev->open_count < 1) 594 _close(dev); 595 } 596 } 597 598 int dev_read(struct device *dev, uint64_t offset, size_t len, void *buffer) 599 { 600 struct device_area where; 601 602 if (!dev->open_count) 603 return_0; 604 605 where.dev = dev; 606 where.start = offset; 607 where.size = len; 608 609 return _aligned_io(&where, buffer, 0); 610 } 611 612 /* 613 * Read from 'dev' into 'buf', possibly in 2 distinct regions, denoted 614 * by (offset,len) and (offset2,len2). Thus, the total size of 615 * 'buf' should be len+len2. 616 */ 617 int dev_read_circular(struct device *dev, uint64_t offset, size_t len, 618 uint64_t offset2, size_t len2, void *buf) 619 { 620 if (!dev_read(dev, offset, len, buf)) { 621 log_error("Read from %s failed", dev_name(dev)); 622 return 0; 623 } 624 625 /* 626 * The second region is optional, and allows for 627 * a circular buffer on the device. 628 */ 629 if (!len2) 630 return 1; 631 632 if (!dev_read(dev, offset2, len2, buf + len)) { 633 log_error("Circular read from %s failed", 634 dev_name(dev)); 635 return 0; 636 } 637 638 return 1; 639 } 640 641 /* FIXME If O_DIRECT can't extend file, dev_extend first; dev_truncate after. 642 * But fails if concurrent processes writing 643 */ 644 645 /* FIXME pre-extend the file */ 646 int dev_append(struct device *dev, size_t len, void *buffer) 647 { 648 int r; 649 650 if (!dev->open_count) 651 return_0; 652 653 r = dev_write(dev, dev->end, len, buffer); 654 dev->end += (uint64_t) len; 655 656 #ifndef O_DIRECT_SUPPORT 657 dev_flush(dev); 658 #endif 659 return r; 660 } 661 662 int dev_write(struct device *dev, uint64_t offset, size_t len, void *buffer) 663 { 664 struct device_area where; 665 666 if (!dev->open_count) 667 return_0; 668 669 where.dev = dev; 670 where.start = offset; 671 where.size = len; 672 673 dev->flags |= DEV_ACCESSED_W; 674 675 return _aligned_io(&where, buffer, 1); 676 } 677 678 int dev_set(struct device *dev, uint64_t offset, size_t len, int value) 679 { 680 size_t s; 681 char buffer[4096] __attribute((aligned(8))); 682 683 if (!dev_open(dev)) 684 return_0; 685 686 if ((offset % SECTOR_SIZE) || (len % SECTOR_SIZE)) 687 log_debug("Wiping %s at %" PRIu64 " length %" PRIsize_t, 688 dev_name(dev), offset, len); 689 else 690 log_debug("Wiping %s at sector %" PRIu64 " length %" PRIsize_t 691 " sectors", dev_name(dev), offset >> SECTOR_SHIFT, 692 len >> SECTOR_SHIFT); 693 694 memset(buffer, value, sizeof(buffer)); 695 while (1) { 696 s = len > sizeof(buffer) ? sizeof(buffer) : len; 697 if (!dev_write(dev, offset, s, buffer)) 698 break; 699 700 len -= s; 701 if (!len) 702 break; 703 704 offset += s; 705 } 706 707 dev->flags |= DEV_ACCESSED_W; 708 709 if (!dev_close(dev)) 710 stack; 711 712 return (len == 0); 713 } 714