/*	$NetBSD: dev-io.c,v 1.6 2009/12/02 01:53:25 haad Exp $	*/

/*
 * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
 *
 * This file is part of LVM2.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU Lesser General Public License v.2.1.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include "lib.h"
#include "lvm-types.h"
#include "device.h"
#include "metadata.h"
#include "lvmcache.h"
#include "memlock.h"
#include "locking.h"

#include <limits.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#ifdef linux
#  define u64 uint64_t		/* Missing without __KERNEL__ */
#  undef WNOHANG		/* Avoid redefinition */
#  undef WUNTRACED		/* Avoid redefinition */
#  include <linux/fs.h>		/* For block ioctl definitions */
#  define BLKSIZE_SHIFT SECTOR_SHIFT
#  ifndef BLKGETSIZE64		/* fs.h out-of-date */
#    define BLKGETSIZE64 _IOR(0x12, 114, size_t)
#  endif /* BLKGETSIZE64 */
#elif defined(__NetBSD__)
#  include <sys/disk.h>
#  include <sys/disklabel.h>
#  include <sys/param.h>
#else
#  include <sys/disk.h>
#  define BLKBSZGET DKIOCGETBLOCKSIZE
#  define BLKSSZGET DKIOCGETBLOCKSIZE
#  define BLKGETSIZE64 DKIOCGETBLOCKCOUNT
#  define BLKFLSBUF DKIOCSYNCHRONIZECACHE
#  define BLKSIZE_SHIFT 0
#endif

#ifdef O_DIRECT_SUPPORT
#  ifndef O_DIRECT
#    error O_DIRECT support configured but O_DIRECT definition not found in headers
#  endif
#endif

static DM_LIST_INIT(_open_devices);

/*-----------------------------------------------------------------
 * The standard io loop that keeps submitting an io until it's
 * all gone.
 *---------------------------------------------------------------*/
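/*
 * In brief: _io() succeeds only once the whole device_area has been
 * transferred.  Short reads and writes are resubmitted for the remainder
 * and transient EINTR/EAGAIN failures are retried, so a single call may
 * loop several times before the io is all gone.
 */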
static int _io(struct device_area *where, void *buffer, int should_write)
{
	int fd = dev_fd(where->dev);
	ssize_t n = 0;
	size_t total = 0;

	if (fd < 0) {
		log_error("Attempt to read an unopened device (%s).",
			  dev_name(where->dev));
		return 0;
	}

	/*
	 * Skip all writes in test mode.
	 */
	if (should_write && test_mode())
		return 1;

	if (where->size > SSIZE_MAX) {
		log_error("Read size too large: %" PRIu64, where->size);
		return 0;
	}

	if (lseek(fd, (off_t) where->start, SEEK_SET) < 0) {
		log_error("%s: lseek %" PRIu64 " failed: %s",
			  dev_name(where->dev), (uint64_t) where->start,
			  strerror(errno));
		return 0;
	}

	while (total < (size_t) where->size) {
		do
			n = should_write ?
			    write(fd, buffer, (size_t) where->size - total) :
			    read(fd, buffer, (size_t) where->size - total);
		while ((n < 0) && ((errno == EINTR) || (errno == EAGAIN)));

		if (n < 0)
			log_error("%s: %s failed after %" PRIu64 " of %" PRIu64
				  " at %" PRIu64 ": %s", dev_name(where->dev),
				  should_write ? "write" : "read",
				  (uint64_t) total,
				  (uint64_t) where->size,
				  (uint64_t) where->start, strerror(errno));

		if (n <= 0)
			break;

		total += n;
		buffer += n;
	}

	return (total == (size_t) where->size);
}

/*-----------------------------------------------------------------
 * LVM2 uses O_DIRECT when performing metadata io, which requires
 * block size aligned accesses.  If any io is not aligned we have
 * to perform the io via a bounce buffer, which is obviously quite
 * inefficient.
 *---------------------------------------------------------------*/
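/*
 * For example, with a 512-byte block size a 100-byte read at offset 3 is
 * widened to a 512-byte read at offset 0 and staged through the bounce
 * buffer, while a 4096-byte read at offset 4096 into a block-aligned
 * buffer goes straight to _io().
 */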

/*
 * Get the sector size from an _open_ device.
 */
static int _get_block_size(struct device *dev, unsigned int *size)
{
	const char *name = dev_name(dev);
#ifdef __NetBSD__
	struct disklabel	lab;
#endif

	if (dev->block_size == -1) {
#ifdef __NetBSD__
		if (ioctl(dev_fd(dev), DIOCGDINFO, &lab) < 0)
			dev->block_size = DEV_BSIZE;
		else
			dev->block_size = lab.d_secsize;
#else
		if (ioctl(dev_fd(dev), BLKBSZGET, &dev->block_size) < 0) {
			log_sys_error("ioctl BLKBSZGET", name);
			return 0;
		}
#endif
		log_debug("%s: block size is %u bytes", name, dev->block_size);
	}

	*size = (unsigned int) dev->block_size;

	return 1;
}

/*
 * Widens a region to be an aligned region.
 */
static void _widen_region(unsigned int block_size, struct device_area *region,
			  struct device_area *result)
{
	uint64_t mask = block_size - 1, delta;
	memcpy(result, region, sizeof(*result));

	/* adjust the start */
	delta = result->start & mask;
	if (delta) {
		result->start -= delta;
		result->size += delta;
	}

	/* adjust the end */
	delta = (result->start + result->size) & mask;
	if (delta)
		result->size += block_size - delta;
}
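
/*
 * Worked example for _widen_region(): block_size = 512, region =
 * { start = 1030, size = 100 }.  Start: delta = 1030 & 511 = 6, so start
 * drops to 1024 and size grows to 106.  End: (1024 + 106) & 511 = 106,
 * so size grows by 512 - 106 to 512.  The result { 1024, 512 } covers
 * exactly one aligned block.
 */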

static int _aligned_io(struct device_area *where, void *buffer,
		       int should_write)
{
	void *bounce;
	unsigned int block_size = 0;
	uintptr_t mask;
	struct device_area widened;

	if (!(where->dev->flags & DEV_REGULAR) &&
	    !_get_block_size(where->dev, &block_size))
		return_0;

	if (!block_size)
		block_size = lvm_getpagesize();

	_widen_region(block_size, where, &widened);

	/* Do we need to use a bounce buffer? */
	mask = block_size - 1;
	if (!memcmp(where, &widened, sizeof(widened)) &&
	    !((uintptr_t) buffer & mask))
		return _io(where, buffer, should_write);

	/* Allocate a bounce buffer with an extra block */
	if (!(bounce = alloca((size_t) widened.size + block_size))) {
		log_error("Bounce buffer alloca failed");
		return 0;
	}

	/*
	 * Realign start of bounce buffer (using the extra sector)
	 */
	if (((uintptr_t) bounce) & mask)
		bounce = (void *) ((((uintptr_t) bounce) + mask) & ~mask);

	/* Channel the io through the bounce buffer: read the widened
	 * region first so an unaligned write preserves the surrounding
	 * bytes; if that read fails on a write we are extending the
	 * file, so fill the buffer with newlines instead. */
	if (!_io(&widened, bounce, 0)) {
		if (!should_write)
			return_0;
		/* FIXME pre-extend the file */
		memset(bounce, '\n', widened.size);
	}

	if (should_write) {
		memcpy(bounce + (where->start - widened.start), buffer,
		       (size_t) where->size);

		/* ... then we write */
		return _io(&widened, bounce, 1);
	}

	memcpy(buffer, bounce + (where->start - widened.start),
	       (size_t) where->size);

	return 1;
}
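
/*
 * The copy offset into the bounce buffer is where->start - widened.start:
 * continuing the example above, a caller asking for 100 bytes at offset
 * 1030 reads 512 bytes at 1024 into bounce and copies from bounce + 6.
 */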

static int _dev_get_size_file(const struct device *dev, uint64_t *size)
{
	const char *name = dev_name(dev);
	struct stat info;

	if (stat(name, &info)) {
		log_sys_error("stat", name);
		return 0;
	}

	*size = info.st_size;
	*size >>= SECTOR_SHIFT;	/* Convert to sectors */

	log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size);

	return 1;
}

static int _dev_get_size_dev(const struct device *dev, uint64_t *size)
{
	int fd;
	const char *name = dev_name(dev);
#ifdef __NetBSD__
	off_t off;
	struct disklabel	lab;
	struct dkwedge_info	dkw;
#endif

	if ((fd = open(name, O_RDONLY)) < 0) {
#ifndef __NetBSD__
		log_sys_error("open", name);
#endif
		return 0;
	}

#ifdef __NetBSD__
	/* lseek() returns off_t; *size is unsigned, so test the signed
	 * value for failure before converting. */
	if ((off = lseek(fd, 0, SEEK_END)) < 0) {
		log_sys_error("lseek SEEK_END", name);
		close(fd);
		return 0;
	}
	*size = (uint64_t) off;

	if (ioctl(fd, DIOCGDINFO, &lab) < 0) {
		if (ioctl(fd, DIOCGWEDGEINFO, &dkw) < 0) {
			log_sys_debug("ioctl DIOCGWEDGEINFO", name);
			close(fd);
			return 0;
		} else if (dkw.dkw_size)
			*size = dkw.dkw_size;
	} else if (lab.d_secsize)
		*size /= lab.d_secsize;
#else
	if (ioctl(fd, BLKGETSIZE64, size) < 0) {
		log_sys_error("ioctl BLKGETSIZE64", name);
		if (close(fd))
			log_sys_error("close", name);
		return 0;
	}

	*size >>= BLKSIZE_SHIFT;	/* Convert to sectors */
#endif
	if (close(fd))
		log_sys_error("close", name);

	log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size);

	return 1;
}

static int _dev_read_ahead_dev(struct device *dev, uint32_t *read_ahead)
{
#ifdef linux
	long read_ahead_long;

	if (dev->read_ahead != -1) {
		*read_ahead = (uint32_t) dev->read_ahead;
		return 1;
	}

	if (!dev_open(dev))
		return_0;

	if (ioctl(dev->fd, BLKRAGET, &read_ahead_long) < 0) {
		log_sys_error("ioctl BLKRAGET", dev_name(dev));
		if (!dev_close(dev))
			stack;
		return 0;
	}

	if (!dev_close(dev))
		stack;

	*read_ahead = (uint32_t) read_ahead_long;
	dev->read_ahead = read_ahead_long;

	log_very_verbose("%s: read_ahead is %u sectors",
			 dev_name(dev), *read_ahead);
#else
	/* No BLKRAGET equivalent wired up on this platform; report no
	 * read ahead rather than returning an uninitialised value. */
	*read_ahead = 0;
#endif
	return 1;
}

/*-----------------------------------------------------------------
 * Public functions
 *---------------------------------------------------------------*/

int dev_get_size(const struct device *dev, uint64_t *size)
{
	if (!dev)
		return 0;

	if (dev->flags & DEV_REGULAR)
		return _dev_get_size_file(dev, size);
	else
		return _dev_get_size_dev(dev, size);
}
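
/*
 * Usage sketch for dev_get_size(): the size comes back in sectors, e.g.
 *
 *	uint64_t sectors;
 *	if (dev_get_size(dev, &sectors))
 *		log_debug("%s: %" PRIu64 " sectors", dev_name(dev), sectors);
 */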

int dev_get_read_ahead(struct device *dev, uint32_t *read_ahead)
{
	if (!dev)
		return 0;

	if (dev->flags & DEV_REGULAR) {
		*read_ahead = 0;
		return 1;
	}

	return _dev_read_ahead_dev(dev, read_ahead);
}

/* FIXME Unused
int dev_get_sectsize(struct device *dev, uint32_t *size)
{
	int fd;
	int s;
	const char *name = dev_name(dev);

	if ((fd = open(name, O_RDONLY)) < 0) {
		log_sys_error("open", name);
		return 0;
	}

	if (ioctl(fd, BLKSSZGET, &s) < 0) {
		log_sys_error("ioctl BLKSSZGET", name);
		if (close(fd))
			log_sys_error("close", name);
		return 0;
	}

	if (close(fd))
		log_sys_error("close", name);

	*size = (uint32_t) s;

	log_very_verbose("%s: sector size is %" PRIu32 " bytes", name, *size);

	return 1;
}
*/
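
/*
 * dev_flush() tries the strongest mechanism first: on Linux BLKFLSBUF
 * flushes the block device's buffer cache; failing that, fsync() covers
 * regular files and other platforms; a whole-system sync() is the last
 * resort.
 */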

void dev_flush(struct device *dev)
{
#ifdef __linux__
	if (!(dev->flags & DEV_REGULAR) && ioctl(dev->fd, BLKFLSBUF, 0) >= 0)
		return;
#endif

	if (fsync(dev->fd) >= 0)
		return;

	sync();
}

int dev_open_flags(struct device *dev, int flags, int direct, int quiet)
{
	struct stat buf;
	const char *name;
	int need_excl = 0, need_rw = 0;

	if ((flags & O_ACCMODE) == O_RDWR)
		need_rw = 1;

	if (flags & O_EXCL)
		need_excl = 1;

	if (dev->fd >= 0) {
		if (((dev->flags & DEV_OPENED_RW) || !need_rw) &&
		    ((dev->flags & DEV_OPENED_EXCL) || !need_excl)) {
			dev->open_count++;
			return 1;
		}

		if (dev->open_count && !need_excl) {
			/* FIXME Ensure we never get here */
			log_debug("WARNING: %s already opened read-only",
				  dev_name(dev));
			dev->open_count++;
		}

		dev_close_immediate(dev);
	}

	if (memlock())
		log_error("WARNING: dev_open(%s) called while suspended",
			  dev_name(dev));

	if (dev->flags & DEV_REGULAR)
		name = dev_name(dev);
	else if (!(name = dev_name_confirmed(dev, quiet)))
		return_0;

	if (!(dev->flags & DEV_REGULAR)) {
		if (stat(name, &buf) < 0) {
			log_sys_error("stat", name);
			return 0;
		}
		if (buf.st_rdev != dev->dev) {
			log_error("%s: device changed", name);
			return 0;
		}
	}

#ifdef O_DIRECT_SUPPORT
	if (direct) {
		if (!(dev->flags & DEV_O_DIRECT_TESTED))
			dev->flags |= DEV_O_DIRECT;

		if (dev->flags & DEV_O_DIRECT)
			flags |= O_DIRECT;
	}
#endif

#ifdef O_NOATIME
	/* Don't update atime on device inodes */
	if (!(dev->flags & DEV_REGULAR))
		flags |= O_NOATIME;
#endif

	if ((dev->fd = open(name, flags, 0777)) < 0) {
#ifdef O_DIRECT_SUPPORT
		if (direct && !(dev->flags & DEV_O_DIRECT_TESTED)) {
			flags &= ~O_DIRECT;
			if ((dev->fd = open(name, flags, 0777)) >= 0) {
				dev->flags &= ~DEV_O_DIRECT;
				log_debug("%s: Not using O_DIRECT", name);
				goto opened;
			}
		}
#endif
		if (quiet)
			log_sys_debug("open", name);
		else
			log_sys_error("open", name);

		return 0;
	}

#ifdef O_DIRECT_SUPPORT
      opened:
	if (direct)
		dev->flags |= DEV_O_DIRECT_TESTED;
#endif
	dev->open_count++;
	dev->flags &= ~DEV_ACCESSED_W;

	if (need_rw)
		dev->flags |= DEV_OPENED_RW;
	else
		dev->flags &= ~DEV_OPENED_RW;

	if (need_excl)
		dev->flags |= DEV_OPENED_EXCL;
	else
		dev->flags &= ~DEV_OPENED_EXCL;

	if (!(dev->flags & DEV_REGULAR) &&
	    ((fstat(dev->fd, &buf) < 0) || (buf.st_rdev != dev->dev))) {
		log_error("%s: fstat failed: Has device name changed?", name);
		dev_close_immediate(dev);
		return 0;
	}

#ifndef O_DIRECT_SUPPORT
	if (!(dev->flags & DEV_REGULAR))
		dev_flush(dev);
#endif

	if ((flags & O_CREAT) && !(flags & O_TRUNC))
		dev->end = lseek(dev->fd, (off_t) 0, SEEK_END);

	dm_list_add(&_open_devices, &dev->open_list);

	log_debug("Opened %s %s%s%s", dev_name(dev),
		  dev->flags & DEV_OPENED_RW ? "RW" : "RO",
		  dev->flags & DEV_OPENED_EXCL ? " O_EXCL" : "",
		  dev->flags & DEV_O_DIRECT ? " O_DIRECT" : "");

	return 1;
}
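
/*
 * Usage sketch: opens are reference counted, so each successful dev_open()
 * is paired with a dev_close().  With buf a caller-supplied 512-byte
 * buffer:
 *
 *	if (dev_open(dev)) {
 *		if (!dev_read(dev, 0, sizeof(buf), buf))
 *			stack;
 *		if (!dev_close(dev))
 *			stack;
 *	}
 *
 * dev_open() picks O_RDWR or O_RDONLY according to vg_write_lock_held().
 */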

int dev_open_quiet(struct device *dev)
{
	int flags;

	flags = vg_write_lock_held() ? O_RDWR : O_RDONLY;

	return dev_open_flags(dev, flags, 1, 1);
}

int dev_open(struct device *dev)
{
	int flags;

	flags = vg_write_lock_held() ? O_RDWR : O_RDONLY;

	return dev_open_flags(dev, flags, 1, 0);
}

/* Probe whether the device can be opened O_EXCL, i.e. is not already in
 * use; the temporary open is closed again immediately. */
int dev_test_excl(struct device *dev)
{
	int flags;
	int r;

	flags = vg_write_lock_held() ? O_RDWR : O_RDONLY;
	flags |= O_EXCL;

	r = dev_open_flags(dev, flags, 1, 1);
	if (r)
		dev_close_immediate(dev);

	return r;
}

static void _close(struct device *dev)
{
	if (close(dev->fd))
		log_sys_error("close", dev_name(dev));
	dev->fd = -1;
	dev->block_size = -1;
	dm_list_del(&dev->open_list);

	log_debug("Closed %s", dev_name(dev));

	if (dev->flags & DEV_ALLOCED) {
		dm_free((void *) dm_list_item(dev->aliases.n, struct str_list)->
			 str);
		dm_free(dev->aliases.n);
		dm_free(dev);
	}
}

static int _dev_close(struct device *dev, int immediate)
{
	struct lvmcache_info *info;

	if (dev->fd < 0) {
		log_error("Attempt to close device '%s' "
			  "which is not open.", dev_name(dev));
		return 0;
	}

#ifndef O_DIRECT_SUPPORT
	if (dev->flags & DEV_ACCESSED_W)
		dev_flush(dev);
#endif

	if (dev->open_count > 0)
		dev->open_count--;

	if (immediate && dev->open_count)
		log_debug("%s: Immediate close attempt while still referenced",
			  dev_name(dev));

	/* Close unless device is known to belong to a locked VG */
	if (immediate ||
	    (dev->open_count < 1 &&
	     (!(info = info_from_pvid(dev->pvid, 0)) ||
	      !info->vginfo ||
	      !vgname_is_locked(info->vginfo->vgname))))
		_close(dev);

	return 1;
}

int dev_close(struct device *dev)
{
	return _dev_close(dev, 0);
}

int dev_close_immediate(struct device *dev)
{
	return _dev_close(dev, 1);
}

void dev_close_all(void)
{
	struct dm_list *doh, *doht;
	struct device *dev;

	dm_list_iterate_safe(doh, doht, &_open_devices) {
		dev = dm_list_struct_base(doh, struct device, open_list);
		if (dev->open_count < 1)
			_close(dev);
	}
}

int dev_read(struct device *dev, uint64_t offset, size_t len, void *buffer)
{
	struct device_area where;

	if (!dev->open_count)
		return_0;

	where.dev = dev;
	where.start = offset;
	where.size = len;

	return _aligned_io(&where, buffer, 0);
}

/*
 * Read from 'dev' into 'buf', possibly in 2 distinct regions, denoted
 * by (offset,len) and (offset2,len2).  Thus, the total size of
 * 'buf' should be len+len2.
 */
int dev_read_circular(struct device *dev, uint64_t offset, size_t len,
		      uint64_t offset2, size_t len2, void *buf)
{
	if (!dev_read(dev, offset, len, buf)) {
		log_error("Read from %s failed", dev_name(dev));
		return 0;
	}

	/*
	 * The second region is optional, and allows for
	 * a circular buffer on the device.
	 */
	if (!len2)
		return 1;

	if (!dev_read(dev, offset2, len2, buf + len)) {
		log_error("Circular read from %s failed",
			  dev_name(dev));
		return 0;
	}

	return 1;
}
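
/*
 * Example for dev_read_circular(): if a metadata entry of 3000 bytes
 * starts 1000 bytes before the end of a circular metadata area, it
 * wraps, so the caller reads (end - 1000, 1000) and (area_start, 2000)
 * into a single 3000-byte buffer.
 */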

/* FIXME If O_DIRECT can't extend file, dev_extend first; dev_truncate after.
 *       But fails if concurrent processes writing
 */

/* FIXME pre-extend the file */
int dev_append(struct device *dev, size_t len, void *buffer)
{
	int r;

	if (!dev->open_count)
		return_0;

	r = dev_write(dev, dev->end, len, buffer);
	dev->end += (uint64_t) len;

#ifndef O_DIRECT_SUPPORT
	dev_flush(dev);
#endif
	return r;
}

int dev_write(struct device *dev, uint64_t offset, size_t len, void *buffer)
{
	struct device_area where;

	if (!dev->open_count)
		return_0;

	where.dev = dev;
	where.start = offset;
	where.size = len;

	dev->flags |= DEV_ACCESSED_W;

	return _aligned_io(&where, buffer, 1);
}
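
/*
 * Usage sketch for dev_set() below: wiping the first 4KiB of a device,
 * e.g. to clear stale signatures, is dev_set(dev, 0, 4096, 0); the 4KiB
 * stamp buffer is written repeatedly until len is exhausted.
 */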

int dev_set(struct device *dev, uint64_t offset, size_t len, int value)
{
	size_t s;
	char buffer[4096] __attribute__((aligned(8)));

	if (!dev_open(dev))
		return_0;

	if ((offset % SECTOR_SIZE) || (len % SECTOR_SIZE))
		log_debug("Wiping %s at %" PRIu64 " length %" PRIsize_t,
			  dev_name(dev), offset, len);
	else
		log_debug("Wiping %s at sector %" PRIu64 " length %" PRIsize_t
			  " sectors", dev_name(dev), offset >> SECTOR_SHIFT,
			  len >> SECTOR_SHIFT);

	memset(buffer, value, sizeof(buffer));
	while (1) {
		s = len > sizeof(buffer) ? sizeof(buffer) : len;
		if (!dev_write(dev, offset, s, buffer))
			break;

		len -= s;
		if (!len)
			break;

		offset += s;
	}

	dev->flags |= DEV_ACCESSED_W;

	if (!dev_close(dev))
		stack;

	return (len == 0);
}
759