1 /* $NetBSD: dev-io.c,v 1.3 2009/01/06 23:21:16 haad Exp $ */ 2 3 /* 4 * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved. 5 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 6 * 7 * This file is part of LVM2. 8 * 9 * This copyrighted material is made available to anyone wishing to use, 10 * modify, copy, or redistribute it subject to the terms and conditions 11 * of the GNU Lesser General Public License v.2.1. 12 * 13 * You should have received a copy of the GNU Lesser General Public License 14 * along with this program; if not, write to the Free Software Foundation, 15 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 */ 17 18 #include "lib.h" 19 #include "lvm-types.h" 20 #include "device.h" 21 #include "metadata.h" 22 #include "lvmcache.h" 23 #include "memlock.h" 24 #include "locking.h" 25 26 #include <limits.h> 27 #include <sys/stat.h> 28 #include <fcntl.h> 29 #include <unistd.h> 30 #include <sys/ioctl.h> 31 32 #ifdef linux 33 # define u64 uint64_t /* Missing without __KERNEL__ */ 34 # undef WNOHANG /* Avoid redefinition */ 35 # undef WUNTRACED /* Avoid redefinition */ 36 # include <linux/fs.h> /* For block ioctl definitions */ 37 # define BLKSIZE_SHIFT SECTOR_SHIFT 38 # ifndef BLKGETSIZE64 /* fs.h out-of-date */ 39 # define BLKGETSIZE64 _IOR(0x12, 114, size_t) 40 # endif /* BLKGETSIZE64 */ 41 #elif __NetBSD__ 42 # include <sys/disk.h> 43 # include <sys/disklabel.h> 44 # include <sys/param.h> 45 #else 46 # include <sys/disk.h> 47 # define BLKBSZGET DKIOCGETBLOCKSIZE 48 # define BLKSSZGET DKIOCGETBLOCKSIZE 49 # define BLKGETSIZE64 DKIOCGETBLOCKCOUNT 50 # define BLKFLSBUF DKIOCSYNCHRONIZECACHE 51 # define BLKSIZE_SHIFT 0 52 #endif 53 54 #ifdef O_DIRECT_SUPPORT 55 # ifndef O_DIRECT 56 # error O_DIRECT support configured but O_DIRECT definition not found in headers 57 # endif 58 #endif 59 60 static DM_LIST_INIT(_open_devices); 61 62 /*----------------------------------------------------------------- 63 * The standard io loop that keeps submitting an io until it's 64 * all gone. 65 *---------------------------------------------------------------*/ 66 static int _io(struct device_area *where, void *buffer, int should_write) 67 { 68 int fd = dev_fd(where->dev); 69 ssize_t n = 0; 70 size_t total = 0; 71 72 if (fd < 0) { 73 log_error("Attempt to read an unopened device (%s).", 74 dev_name(where->dev)); 75 return 0; 76 } 77 78 /* 79 * Skip all writes in test mode. 80 */ 81 if (should_write && test_mode()) 82 return 1; 83 84 if (where->size > SSIZE_MAX) { 85 log_error("Read size too large: %" PRIu64, where->size); 86 return 0; 87 } 88 89 if (lseek(fd, (off_t) where->start, SEEK_SET) < 0) { 90 log_error("%s: lseek %" PRIu64 " failed: %s", 91 dev_name(where->dev), (uint64_t) where->start, 92 strerror(errno)); 93 return 0; 94 } 95 96 while (total < (size_t) where->size) { 97 do 98 n = should_write ? 99 write(fd, buffer, (size_t) where->size - total) : 100 read(fd, buffer, (size_t) where->size - total); 101 while ((n < 0) && ((errno == EINTR) || (errno == EAGAIN))); 102 103 if (n < 0) 104 log_error("%s: %s failed after %" PRIu64 " of %" PRIu64 105 " at %" PRIu64 ": %s", dev_name(where->dev), 106 should_write ? "write" : "read", 107 (uint64_t) total, 108 (uint64_t) where->size, 109 (uint64_t) where->start, strerror(errno)); 110 111 if (n <= 0) 112 break; 113 114 total += n; 115 buffer += n; 116 } 117 118 return (total == (size_t) where->size); 119 } 120 121 /*----------------------------------------------------------------- 122 * LVM2 uses O_DIRECT when performing metadata io, which requires 123 * block size aligned accesses. If any io is not aligned we have 124 * to perform the io via a bounce buffer, obviously this is quite 125 * inefficient. 126 *---------------------------------------------------------------*/ 127 128 /* 129 * Get the sector size from an _open_ device. 130 */ 131 static int _get_block_size(struct device *dev, unsigned int *size) 132 { 133 const char *name = dev_name(dev); 134 #ifdef __NetBSD__ 135 struct disklabel lab; 136 #endif 137 138 if ((dev->block_size == -1)) { 139 #ifdef __NetBSD__ 140 if (ioctl(dev_fd(dev), DIOCGDINFO, &lab) < 0) { 141 dev->block_size = DEV_BSIZE; 142 } else 143 dev->block_size = lab.d_secsize; 144 #else 145 if (ioctl(dev_fd(dev), BLKBSZGET, &dev->block_size) < 0) { 146 log_sys_error("ioctl BLKBSZGET", name); 147 return 0; 148 } 149 #endif 150 log_debug("%s: block size is %u bytes", name, dev->block_size); 151 } 152 153 *size = (unsigned int) dev->block_size; 154 155 return 1; 156 } 157 158 /* 159 * Widens a region to be an aligned region. 160 */ 161 static void _widen_region(unsigned int block_size, struct device_area *region, 162 struct device_area *result) 163 { 164 uint64_t mask = block_size - 1, delta; 165 memcpy(result, region, sizeof(*result)); 166 167 /* adjust the start */ 168 delta = result->start & mask; 169 if (delta) { 170 result->start -= delta; 171 result->size += delta; 172 } 173 174 /* adjust the end */ 175 delta = (result->start + result->size) & mask; 176 if (delta) 177 result->size += block_size - delta; 178 } 179 180 static int _aligned_io(struct device_area *where, void *buffer, 181 int should_write) 182 { 183 void *bounce; 184 unsigned int block_size = 0; 185 uintptr_t mask; 186 struct device_area widened; 187 188 if (!(where->dev->flags & DEV_REGULAR) && 189 !_get_block_size(where->dev, &block_size)) 190 return_0; 191 192 if (!block_size) 193 block_size = lvm_getpagesize(); 194 195 _widen_region(block_size, where, &widened); 196 197 /* Do we need to use a bounce buffer? */ 198 mask = block_size - 1; 199 if (!memcmp(where, &widened, sizeof(widened)) && 200 !((uintptr_t) buffer & mask)) 201 return _io(where, buffer, should_write); 202 203 /* Allocate a bounce buffer with an extra block */ 204 if (!(bounce = alloca((size_t) widened.size + block_size))) { 205 log_error("Bounce buffer alloca failed"); 206 return 0; 207 } 208 209 /* 210 * Realign start of bounce buffer (using the extra sector) 211 */ 212 if (((uintptr_t) bounce) & mask) 213 bounce = (void *) ((((uintptr_t) bounce) + mask) & ~mask); 214 215 /* channel the io through the bounce buffer */ 216 if (!_io(&widened, bounce, 0)) { 217 if (!should_write) 218 return_0; 219 /* FIXME pre-extend the file */ 220 memset(bounce, '\n', widened.size); 221 } 222 223 if (should_write) { 224 memcpy(bounce + (where->start - widened.start), buffer, 225 (size_t) where->size); 226 227 /* ... then we write */ 228 return _io(&widened, bounce, 1); 229 } 230 231 memcpy(buffer, bounce + (where->start - widened.start), 232 (size_t) where->size); 233 234 return 1; 235 } 236 237 static int _dev_get_size_file(const struct device *dev, uint64_t *size) 238 { 239 const char *name = dev_name(dev); 240 struct stat info; 241 242 if (stat(name, &info)) { 243 log_sys_error("stat", name); 244 return 0; 245 } 246 247 *size = info.st_size; 248 *size >>= SECTOR_SHIFT; /* Convert to sectors */ 249 250 log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size); 251 252 return 1; 253 } 254 255 static int _dev_get_size_dev(const struct device *dev, uint64_t *size) 256 { 257 int fd; 258 const char *name = dev_name(dev); 259 #ifdef __NetBSD__ 260 struct disklabel lab; 261 struct dkwedge_info dkw; 262 #endif 263 264 if ((fd = open(name, O_RDONLY)) < 0) { 265 #ifndef __NetBSD__ 266 log_sys_error("open", name); 267 #endif 268 return 0; 269 } 270 271 #ifdef __NetBSD__ 272 if ((*size = lseek (fd, 0, SEEK_END)) < 0) { 273 log_sys_error("lseek SEEK_END", name); 274 close(fd); 275 return 0; 276 } 277 278 if (ioctl(fd, DIOCGDINFO, &lab) < 0) { 279 if (ioctl(fd, DIOCGWEDGEINFO, &dkw) < 0) { 280 log_debug("ioctl DIOCGWEDGEINFO", name); 281 close(fd); 282 return 0; 283 } else 284 if (dkw.dkw_size) 285 *size = dkw.dkw_size; 286 } else 287 if (lab.d_secsize) 288 *size /= lab.d_secsize; 289 #else 290 if (ioctl(fd, BLKGETSIZE64, size) < 0) { 291 log_sys_error("ioctl BLKGETSIZE64", name); 292 if (close(fd)) 293 log_sys_error("close", name); 294 return 0; 295 } 296 297 *size >>= BLKSIZE_SHIFT; /* Convert to sectors */ 298 #endif 299 if (close(fd)) 300 log_sys_error("close", name); 301 302 log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size); 303 304 return 1; 305 } 306 307 /*----------------------------------------------------------------- 308 * Public functions 309 *---------------------------------------------------------------*/ 310 311 int dev_get_size(const struct device *dev, uint64_t *size) 312 { 313 if ((dev->flags & DEV_REGULAR)) 314 return _dev_get_size_file(dev, size); 315 else 316 return _dev_get_size_dev(dev, size); 317 } 318 319 /* FIXME Unused 320 int dev_get_sectsize(struct device *dev, uint32_t *size) 321 { 322 int fd; 323 int s; 324 const char *name = dev_name(dev); 325 326 if ((fd = open(name, O_RDONLY)) < 0) { 327 log_sys_error("open", name); 328 return 0; 329 } 330 331 if (ioctl(fd, BLKSSZGET, &s) < 0) { 332 log_sys_error("ioctl BLKSSZGET", name); 333 if (close(fd)) 334 log_sys_error("close", name); 335 return 0; 336 } 337 338 if (close(fd)) 339 log_sys_error("close", name); 340 341 *size = (uint32_t) s; 342 343 log_very_verbose("%s: sector size is %" PRIu32 " bytes", name, *size); 344 345 return 1; 346 } 347 */ 348 349 void dev_flush(struct device *dev) 350 { 351 #ifdef __linux__ 352 if (!(dev->flags & DEV_REGULAR) && ioctl(dev->fd, BLKFLSBUF, 0) >= 0) 353 return; 354 #endif 355 356 if (fsync(dev->fd) >= 0) 357 return; 358 359 sync(); 360 } 361 362 int dev_open_flags(struct device *dev, int flags, int direct, int quiet) 363 { 364 struct stat buf; 365 const char *name; 366 int need_excl = 0, need_rw = 0; 367 368 if ((flags & O_ACCMODE) == O_RDWR) 369 need_rw = 1; 370 371 if ((flags & O_EXCL)) 372 need_excl = 1; 373 374 if (dev->fd >= 0) { 375 if (((dev->flags & DEV_OPENED_RW) || !need_rw) && 376 ((dev->flags & DEV_OPENED_EXCL) || !need_excl)) { 377 dev->open_count++; 378 return 1; 379 } 380 381 if (dev->open_count && !need_excl) { 382 /* FIXME Ensure we never get here */ 383 log_debug("WARNING: %s already opened read-only", 384 dev_name(dev)); 385 dev->open_count++; 386 } 387 388 dev_close_immediate(dev); 389 } 390 391 if (memlock()) 392 log_error("WARNING: dev_open(%s) called while suspended", 393 dev_name(dev)); 394 395 if (dev->flags & DEV_REGULAR) 396 name = dev_name(dev); 397 else if (!(name = dev_name_confirmed(dev, quiet))) 398 return_0; 399 400 if (!(dev->flags & DEV_REGULAR)) { 401 if (stat(name, &buf) < 0) { 402 log_sys_error("%s: stat failed", name); 403 return 0; 404 } 405 if (buf.st_rdev != dev->dev) { 406 log_error("%s: device changed", name); 407 return 0; 408 } 409 } 410 411 #ifdef O_DIRECT_SUPPORT 412 if (direct) { 413 if (!(dev->flags & DEV_O_DIRECT_TESTED)) 414 dev->flags |= DEV_O_DIRECT; 415 416 if ((dev->flags & DEV_O_DIRECT)) 417 flags |= O_DIRECT; 418 } 419 #endif 420 421 #ifdef O_NOATIME 422 /* Don't update atime on device inodes */ 423 if (!(dev->flags & DEV_REGULAR)) 424 flags |= O_NOATIME; 425 #endif 426 427 if ((dev->fd = open(name, flags, 0777)) < 0) { 428 #ifdef O_DIRECT_SUPPORT 429 if (direct && !(dev->flags & DEV_O_DIRECT_TESTED)) { 430 flags &= ~O_DIRECT; 431 if ((dev->fd = open(name, flags, 0777)) >= 0) { 432 dev->flags &= ~DEV_O_DIRECT; 433 log_debug("%s: Not using O_DIRECT", name); 434 goto opened; 435 } 436 } 437 #endif 438 if (quiet) 439 log_sys_debug("open", name); 440 else 441 log_sys_error("open", name); 442 443 return 0; 444 } 445 446 #ifdef O_DIRECT_SUPPORT 447 opened: 448 if (direct) 449 dev->flags |= DEV_O_DIRECT_TESTED; 450 #endif 451 dev->open_count++; 452 dev->flags &= ~DEV_ACCESSED_W; 453 454 if (need_rw) 455 dev->flags |= DEV_OPENED_RW; 456 else 457 dev->flags &= ~DEV_OPENED_RW; 458 459 if (need_excl) 460 dev->flags |= DEV_OPENED_EXCL; 461 else 462 dev->flags &= ~DEV_OPENED_EXCL; 463 464 if (!(dev->flags & DEV_REGULAR) && 465 ((fstat(dev->fd, &buf) < 0) || (buf.st_rdev != dev->dev))) { 466 log_error("%s: fstat failed: Has device name changed?", name); 467 dev_close_immediate(dev); 468 return 0; 469 } 470 471 #ifndef O_DIRECT_SUPPORT 472 if (!(dev->flags & DEV_REGULAR)) 473 dev_flush(dev); 474 #endif 475 476 if ((flags & O_CREAT) && !(flags & O_TRUNC)) 477 dev->end = lseek(dev->fd, (off_t) 0, SEEK_END); 478 479 dm_list_add(&_open_devices, &dev->open_list); 480 481 log_debug("Opened %s %s%s%s", dev_name(dev), 482 dev->flags & DEV_OPENED_RW ? "RW" : "RO", 483 dev->flags & DEV_OPENED_EXCL ? " O_EXCL" : "", 484 dev->flags & DEV_O_DIRECT ? " O_DIRECT" : ""); 485 486 return 1; 487 } 488 489 int dev_open_quiet(struct device *dev) 490 { 491 int flags; 492 493 flags = vg_write_lock_held() ? O_RDWR : O_RDONLY; 494 495 return dev_open_flags(dev, flags, 1, 1); 496 } 497 498 int dev_open(struct device *dev) 499 { 500 int flags; 501 502 flags = vg_write_lock_held() ? O_RDWR : O_RDONLY; 503 504 return dev_open_flags(dev, flags, 1, 0); 505 } 506 507 int dev_test_excl(struct device *dev) 508 { 509 int flags; 510 int r; 511 512 flags = vg_write_lock_held() ? O_RDWR : O_RDONLY; 513 flags |= O_EXCL; 514 515 r = dev_open_flags(dev, flags, 1, 1); 516 if (r) 517 dev_close_immediate(dev); 518 519 return r; 520 } 521 522 static void _close(struct device *dev) 523 { 524 if (close(dev->fd)) 525 log_sys_error("close", dev_name(dev)); 526 dev->fd = -1; 527 dev->block_size = -1; 528 dm_list_del(&dev->open_list); 529 530 log_debug("Closed %s", dev_name(dev)); 531 532 if (dev->flags & DEV_ALLOCED) { 533 dm_free((void *) dm_list_item(dev->aliases.n, struct str_list)-> 534 str); 535 dm_free(dev->aliases.n); 536 dm_free(dev); 537 } 538 } 539 540 static int _dev_close(struct device *dev, int immediate) 541 { 542 struct lvmcache_info *info; 543 544 if (dev->fd < 0) { 545 log_error("Attempt to close device '%s' " 546 "which is not open.", dev_name(dev)); 547 return 0; 548 } 549 550 #ifndef O_DIRECT_SUPPORT 551 if (dev->flags & DEV_ACCESSED_W) 552 dev_flush(dev); 553 #endif 554 555 if (dev->open_count > 0) 556 dev->open_count--; 557 558 if (immediate && dev->open_count) 559 log_debug("%s: Immediate close attempt while still referenced", 560 dev_name(dev)); 561 562 /* Close unless device is known to belong to a locked VG */ 563 if (immediate || 564 (dev->open_count < 1 && 565 (!(info = info_from_pvid(dev->pvid, 0)) || 566 !info->vginfo || 567 !vgname_is_locked(info->vginfo->vgname)))) 568 _close(dev); 569 570 return 1; 571 } 572 573 int dev_close(struct device *dev) 574 { 575 return _dev_close(dev, 0); 576 } 577 578 int dev_close_immediate(struct device *dev) 579 { 580 return _dev_close(dev, 1); 581 } 582 583 void dev_close_all(void) 584 { 585 struct dm_list *doh, *doht; 586 struct device *dev; 587 588 dm_list_iterate_safe(doh, doht, &_open_devices) { 589 dev = dm_list_struct_base(doh, struct device, open_list); 590 if (dev->open_count < 1) 591 _close(dev); 592 } 593 } 594 595 int dev_read(struct device *dev, uint64_t offset, size_t len, void *buffer) 596 { 597 struct device_area where; 598 599 if (!dev->open_count) 600 return_0; 601 602 where.dev = dev; 603 where.start = offset; 604 where.size = len; 605 606 return _aligned_io(&where, buffer, 0); 607 } 608 609 /* 610 * Read from 'dev' into 'buf', possibly in 2 distinct regions, denoted 611 * by (offset,len) and (offset2,len2). Thus, the total size of 612 * 'buf' should be len+len2. 613 */ 614 int dev_read_circular(struct device *dev, uint64_t offset, size_t len, 615 uint64_t offset2, size_t len2, void *buf) 616 { 617 if (!dev_read(dev, offset, len, buf)) { 618 log_error("Read from %s failed", dev_name(dev)); 619 return 0; 620 } 621 622 /* 623 * The second region is optional, and allows for 624 * a circular buffer on the device. 625 */ 626 if (!len2) 627 return 1; 628 629 if (!dev_read(dev, offset2, len2, buf + len)) { 630 log_error("Circular read from %s failed", 631 dev_name(dev)); 632 return 0; 633 } 634 635 return 1; 636 } 637 638 /* FIXME If O_DIRECT can't extend file, dev_extend first; dev_truncate after. 639 * But fails if concurrent processes writing 640 */ 641 642 /* FIXME pre-extend the file */ 643 int dev_append(struct device *dev, size_t len, void *buffer) 644 { 645 int r; 646 647 if (!dev->open_count) 648 return_0; 649 650 r = dev_write(dev, dev->end, len, buffer); 651 dev->end += (uint64_t) len; 652 653 #ifndef O_DIRECT_SUPPORT 654 dev_flush(dev); 655 #endif 656 return r; 657 } 658 659 int dev_write(struct device *dev, uint64_t offset, size_t len, void *buffer) 660 { 661 struct device_area where; 662 663 if (!dev->open_count) 664 return_0; 665 666 where.dev = dev; 667 where.start = offset; 668 where.size = len; 669 670 dev->flags |= DEV_ACCESSED_W; 671 672 return _aligned_io(&where, buffer, 1); 673 } 674 675 int dev_set(struct device *dev, uint64_t offset, size_t len, int value) 676 { 677 size_t s; 678 char buffer[4096] __attribute((aligned(8))); 679 680 if (!dev_open(dev)) 681 return_0; 682 683 if ((offset % SECTOR_SIZE) || (len % SECTOR_SIZE)) 684 log_debug("Wiping %s at %" PRIu64 " length %" PRIsize_t, 685 dev_name(dev), offset, len); 686 else 687 log_debug("Wiping %s at sector %" PRIu64 " length %" PRIsize_t 688 " sectors", dev_name(dev), offset >> SECTOR_SHIFT, 689 len >> SECTOR_SHIFT); 690 691 memset(buffer, value, sizeof(buffer)); 692 while (1) { 693 s = len > sizeof(buffer) ? sizeof(buffer) : len; 694 if (!dev_write(dev, offset, s, buffer)) 695 break; 696 697 len -= s; 698 if (!len) 699 break; 700 701 offset += s; 702 } 703 704 dev->flags |= DEV_ACCESSED_W; 705 706 if (!dev_close(dev)) 707 stack; 708 709 return (len == 0); 710 } 711