xref: /netbsd-src/external/gpl2/lvm2/dist/lib/device/dev-io.c (revision 274254cdae52594c1aa480a736aef78313d15c9c)
1 /*	$NetBSD: dev-io.c,v 1.4 2009/02/18 12:16:13 haad Exp $	*/
2 
3 /*
4  * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
5  * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6  *
7  * This file is part of LVM2.
8  *
9  * This copyrighted material is made available to anyone wishing to use,
10  * modify, copy, or redistribute it subject to the terms and conditions
11  * of the GNU Lesser General Public License v.2.1.
12  *
13  * You should have received a copy of the GNU Lesser General Public License
14  * along with this program; if not, write to the Free Software Foundation,
15  * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16  */
17 
18 #include "lib.h"
19 #include "lvm-types.h"
20 #include "device.h"
21 #include "metadata.h"
22 #include "lvmcache.h"
23 #include "memlock.h"
24 #include "locking.h"
25 
26 #include <limits.h>
27 #include <sys/stat.h>
28 #include <fcntl.h>
29 #include <unistd.h>
30 #include <sys/ioctl.h>
31 
/*
 * Per-platform block device ioctl definitions.
 *
 * The Linux branch accepts both the legacy 'linux' macro and
 * '__linux__': strict ISO compiler modes only predefine the latter,
 * and the rest of this file (dev_flush) already tests '__linux__'.
 * NetBSD uses its disklabel/wedge interfaces directly; any other
 * platform maps the BLK* names onto the DKIOC* ioctls from
 * <sys/disk.h>.
 */
#if defined(linux) || defined(__linux__)
#  define u64 uint64_t		/* Missing without __KERNEL__ */
#  undef WNOHANG		/* Avoid redefinition */
#  undef WUNTRACED		/* Avoid redefinition */
#  include <linux/fs.h>		/* For block ioctl definitions */
#  define BLKSIZE_SHIFT SECTOR_SHIFT
#  ifndef BLKGETSIZE64		/* fs.h out-of-date */
#    define BLKGETSIZE64 _IOR(0x12, 114, size_t)
#  endif /* BLKGETSIZE64 */
#elif defined(__NetBSD__)
#  include <sys/disk.h>
#  include <sys/disklabel.h>
#  include <sys/param.h>
#else
#  include <sys/disk.h>
#  define BLKBSZGET DKIOCGETBLOCKSIZE
#  define BLKSSZGET DKIOCGETBLOCKSIZE
#  define BLKGETSIZE64 DKIOCGETBLOCKCOUNT
#  define BLKFLSBUF DKIOCSYNCHRONIZECACHE
#  define BLKSIZE_SHIFT 0
#endif
53 
54 #ifdef O_DIRECT_SUPPORT
55 #  ifndef O_DIRECT
56 #    error O_DIRECT support configured but O_DIRECT definition not found in headers
57 #  endif
58 #endif
59 
/*
 * List of devices that currently hold an open file descriptor:
 * dev_open_flags() adds a device after a successful open() and
 * _close() removes it again.
 */
static DM_LIST_INIT(_open_devices);
61 
62 /*-----------------------------------------------------------------
63  * The standard io loop that keeps submitting an io until it's
64  * all gone.
65  *---------------------------------------------------------------*/
66 static int _io(struct device_area *where, void *buffer, int should_write)
67 {
68 	int fd = dev_fd(where->dev);
69 	ssize_t n = 0;
70 	size_t total = 0;
71 
72 	if (fd < 0) {
73 		log_error("Attempt to read an unopened device (%s).",
74 			  dev_name(where->dev));
75 		return 0;
76 	}
77 
78 	/*
79 	 * Skip all writes in test mode.
80 	 */
81 	if (should_write && test_mode())
82 		return 1;
83 
84 	if (where->size > SSIZE_MAX) {
85 		log_error("Read size too large: %" PRIu64, where->size);
86 		return 0;
87 	}
88 
89 	if (lseek(fd, (off_t) where->start, SEEK_SET) < 0) {
90 		log_error("%s: lseek %" PRIu64 " failed: %s",
91 			  dev_name(where->dev), (uint64_t) where->start,
92 			  strerror(errno));
93 		return 0;
94 	}
95 
96 	while (total < (size_t) where->size) {
97 		do
98 			n = should_write ?
99 			    write(fd, buffer, (size_t) where->size - total) :
100 			    read(fd, buffer, (size_t) where->size - total);
101 		while ((n < 0) && ((errno == EINTR) || (errno == EAGAIN)));
102 
103 		if (n < 0)
104 			log_error("%s: %s failed after %" PRIu64 " of %" PRIu64
105 				  " at %" PRIu64 ": %s", dev_name(where->dev),
106 				  should_write ? "write" : "read",
107 				  (uint64_t) total,
108 				  (uint64_t) where->size,
109 				  (uint64_t) where->start, strerror(errno));
110 
111 		if (n <= 0)
112 			break;
113 
114 		total += n;
115 		buffer += n;
116 	}
117 
118 	return (total == (size_t) where->size);
119 }
120 
/*-----------------------------------------------------------------
 * LVM2 uses O_DIRECT when performing metadata io, which requires
 * block size aligned accesses.  Any io that is not aligned has to
 * be performed via a bounce buffer, which is quite inefficient.
 *---------------------------------------------------------------*/
127 
128 /*
129  * Get the sector size from an _open_ device.
130  */
131 static int _get_block_size(struct device *dev, unsigned int *size)
132 {
133 	const char *name = dev_name(dev);
134 #ifdef __NetBSD__
135 	struct disklabel	lab;
136 #endif
137 
138 	if ((dev->block_size == -1)) {
139 #ifdef __NetBSD__
140 		if (ioctl(dev_fd(dev), DIOCGDINFO, &lab) < 0) {
141 			dev->block_size = DEV_BSIZE;
142 		} else
143 			dev->block_size = lab.d_secsize;
144 #else
145 		if (ioctl(dev_fd(dev), BLKBSZGET, &dev->block_size) < 0) {
146 			log_sys_error("ioctl BLKBSZGET", name);
147 			return 0;
148 		}
149 #endif
150 		log_debug("%s: block size is %u bytes", name, dev->block_size);
151 	}
152 
153 	*size = (unsigned int) dev->block_size;
154 
155 	return 1;
156 }
157 
158 /*
159  * Widens a region to be an aligned region.
160  */
161 static void _widen_region(unsigned int block_size, struct device_area *region,
162 			  struct device_area *result)
163 {
164 	uint64_t mask = block_size - 1, delta;
165 	memcpy(result, region, sizeof(*result));
166 
167 	/* adjust the start */
168 	delta = result->start & mask;
169 	if (delta) {
170 		result->start -= delta;
171 		result->size += delta;
172 	}
173 
174 	/* adjust the end */
175 	delta = (result->start + result->size) & mask;
176 	if (delta)
177 		result->size += block_size - delta;
178 }
179 
180 static int _aligned_io(struct device_area *where, void *buffer,
181 		       int should_write)
182 {
183 	void *bounce;
184 	unsigned int block_size = 0;
185 	uintptr_t mask;
186 	struct device_area widened;
187 
188 	if (!(where->dev->flags & DEV_REGULAR) &&
189 	    !_get_block_size(where->dev, &block_size))
190 		return_0;
191 
192 	if (!block_size)
193 		block_size = lvm_getpagesize();
194 
195 	_widen_region(block_size, where, &widened);
196 
197 	/* Do we need to use a bounce buffer? */
198 	mask = block_size - 1;
199 	if (!memcmp(where, &widened, sizeof(widened)) &&
200 	    !((uintptr_t) buffer & mask))
201 		return _io(where, buffer, should_write);
202 
203 	/* Allocate a bounce buffer with an extra block */
204 	if (!(bounce = alloca((size_t) widened.size + block_size))) {
205 		log_error("Bounce buffer alloca failed");
206 		return 0;
207 	}
208 
209 	/*
210 	 * Realign start of bounce buffer (using the extra sector)
211 	 */
212 	if (((uintptr_t) bounce) & mask)
213 		bounce = (void *) ((((uintptr_t) bounce) + mask) & ~mask);
214 
215 	/* channel the io through the bounce buffer */
216 	if (!_io(&widened, bounce, 0)) {
217 		if (!should_write)
218 			return_0;
219 		/* FIXME pre-extend the file */
220 		memset(bounce, '\n', widened.size);
221 	}
222 
223 	if (should_write) {
224 		memcpy(bounce + (where->start - widened.start), buffer,
225 		       (size_t) where->size);
226 
227 		/* ... then we write */
228 		return _io(&widened, bounce, 1);
229 	}
230 
231 	memcpy(buffer, bounce + (where->start - widened.start),
232 	       (size_t) where->size);
233 
234 	return 1;
235 }
236 
237 static int _dev_get_size_file(const struct device *dev, uint64_t *size)
238 {
239 	const char *name = dev_name(dev);
240 	struct stat info;
241 
242 	if (stat(name, &info)) {
243 		log_sys_error("stat", name);
244 		return 0;
245 	}
246 
247 	*size = info.st_size;
248 	*size >>= SECTOR_SHIFT;	/* Convert to sectors */
249 
250 	log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size);
251 
252 	return 1;
253 }
254 
255 static int _dev_get_size_dev(const struct device *dev, uint64_t *size)
256 {
257 	int fd;
258 	const char *name = dev_name(dev);
259 #ifdef __NetBSD__
260 	struct disklabel	lab;
261 	struct dkwedge_info     dkw;
262 #endif
263 
264 	if ((fd = open(name, O_RDONLY)) < 0) {
265 #ifndef __NetBSD__
266 		log_sys_error("open", name);
267 #endif
268 		return 0;
269 		}
270 
271 #ifdef __NetBSD__
272 	if ((*size = lseek (fd, 0, SEEK_END)) < 0) {
273 		log_sys_error("lseek SEEK_END", name);
274 		close(fd);
275 		return 0;
276 	}
277 
278 	if (ioctl(fd, DIOCGDINFO, &lab) < 0) {
279 		if (ioctl(fd, DIOCGWEDGEINFO, &dkw) < 0) {
280 			log_debug("ioctl DIOCGWEDGEINFO", name);
281 			close(fd);
282 			return 0;
283 		} else
284 			if (dkw.dkw_size)
285 				*size = dkw.dkw_size;
286 	} else
287 		if (lab.d_secsize)
288 			*size /= lab.d_secsize;
289 #else
290 	if (ioctl(fd, BLKGETSIZE64, size) < 0) {
291 		log_sys_error("ioctl BLKGETSIZE64", name);
292 		if (close(fd))
293 			log_sys_error("close", name);
294 		return 0;
295 	}
296 
297 	*size >>= BLKSIZE_SHIFT;	/* Convert to sectors */
298 #endif
299 	if (close(fd))
300 		log_sys_error("close", name);
301 
302 	log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size);
303 
304 	return 1;
305 }
306 
307 /*-----------------------------------------------------------------
308  * Public functions
309  *---------------------------------------------------------------*/
310 
311 int dev_get_size(const struct device *dev, uint64_t *size)
312 {
313 	if (!dev)
314 		return 0;
315 
316 	if ((dev->flags & DEV_REGULAR))
317 		return _dev_get_size_file(dev, size);
318 	else
319 		return _dev_get_size_dev(dev, size);
320 }
321 
322 /* FIXME Unused
323 int dev_get_sectsize(struct device *dev, uint32_t *size)
324 {
325 	int fd;
326 	int s;
327 	const char *name = dev_name(dev);
328 
329 	if ((fd = open(name, O_RDONLY)) < 0) {
330 		log_sys_error("open", name);
331 		return 0;
332 	}
333 
334 	if (ioctl(fd, BLKSSZGET, &s) < 0) {
335 		log_sys_error("ioctl BLKSSZGET", name);
336 		if (close(fd))
337 			log_sys_error("close", name);
338 		return 0;
339 	}
340 
341 	if (close(fd))
342 		log_sys_error("close", name);
343 
344 	*size = (uint32_t) s;
345 
346 	log_very_verbose("%s: sector size is %" PRIu32 " bytes", name, *size);
347 
348 	return 1;
349 }
350 */
351 
352 void dev_flush(struct device *dev)
353 {
354 #ifdef __linux__
355 	if (!(dev->flags & DEV_REGULAR) && ioctl(dev->fd, BLKFLSBUF, 0) >= 0)
356 		return;
357 #endif
358 
359 	if (fsync(dev->fd) >= 0)
360 		return;
361 
362 	sync();
363 }
364 
365 int dev_open_flags(struct device *dev, int flags, int direct, int quiet)
366 {
367 	struct stat buf;
368 	const char *name;
369 	int need_excl = 0, need_rw = 0;
370 
371 	if ((flags & O_ACCMODE) == O_RDWR)
372 		need_rw = 1;
373 
374 	if ((flags & O_EXCL))
375 		need_excl = 1;
376 
377 	if (dev->fd >= 0) {
378 		if (((dev->flags & DEV_OPENED_RW) || !need_rw) &&
379 		    ((dev->flags & DEV_OPENED_EXCL) || !need_excl)) {
380 			dev->open_count++;
381 			return 1;
382 		}
383 
384 		if (dev->open_count && !need_excl) {
385 			/* FIXME Ensure we never get here */
386 			log_debug("WARNING: %s already opened read-only",
387 				  dev_name(dev));
388 			dev->open_count++;
389 		}
390 
391 		dev_close_immediate(dev);
392 	}
393 
394 	if (memlock())
395 		log_error("WARNING: dev_open(%s) called while suspended",
396 			  dev_name(dev));
397 
398 	if (dev->flags & DEV_REGULAR)
399 		name = dev_name(dev);
400 	else if (!(name = dev_name_confirmed(dev, quiet)))
401 		return_0;
402 
403 	if (!(dev->flags & DEV_REGULAR)) {
404 		if (stat(name, &buf) < 0) {
405 			log_sys_error("%s: stat failed", name);
406 			return 0;
407 		}
408 		if (buf.st_rdev != dev->dev) {
409 			log_error("%s: device changed", name);
410 			return 0;
411 		}
412 	}
413 
414 #ifdef O_DIRECT_SUPPORT
415 	if (direct) {
416 		if (!(dev->flags & DEV_O_DIRECT_TESTED))
417 			dev->flags |= DEV_O_DIRECT;
418 
419 		if ((dev->flags & DEV_O_DIRECT))
420 			flags |= O_DIRECT;
421 	}
422 #endif
423 
424 #ifdef O_NOATIME
425 	/* Don't update atime on device inodes */
426 	if (!(dev->flags & DEV_REGULAR))
427 		flags |= O_NOATIME;
428 #endif
429 
430 	if ((dev->fd = open(name, flags, 0777)) < 0) {
431 #ifdef O_DIRECT_SUPPORT
432 		if (direct && !(dev->flags & DEV_O_DIRECT_TESTED)) {
433 			flags &= ~O_DIRECT;
434 			if ((dev->fd = open(name, flags, 0777)) >= 0) {
435 				dev->flags &= ~DEV_O_DIRECT;
436 				log_debug("%s: Not using O_DIRECT", name);
437 				goto opened;
438 			}
439 		}
440 #endif
441 		if (quiet)
442 			log_sys_debug("open", name);
443 		else
444 			log_sys_error("open", name);
445 
446 		return 0;
447 	}
448 
449 #ifdef O_DIRECT_SUPPORT
450       opened:
451 	if (direct)
452 		dev->flags |= DEV_O_DIRECT_TESTED;
453 #endif
454 	dev->open_count++;
455 	dev->flags &= ~DEV_ACCESSED_W;
456 
457 	if (need_rw)
458 		dev->flags |= DEV_OPENED_RW;
459 	else
460 		dev->flags &= ~DEV_OPENED_RW;
461 
462 	if (need_excl)
463 		dev->flags |= DEV_OPENED_EXCL;
464 	else
465 		dev->flags &= ~DEV_OPENED_EXCL;
466 
467 	if (!(dev->flags & DEV_REGULAR) &&
468 	    ((fstat(dev->fd, &buf) < 0) || (buf.st_rdev != dev->dev))) {
469 		log_error("%s: fstat failed: Has device name changed?", name);
470 		dev_close_immediate(dev);
471 		return 0;
472 	}
473 
474 #ifndef O_DIRECT_SUPPORT
475 	if (!(dev->flags & DEV_REGULAR))
476 		dev_flush(dev);
477 #endif
478 
479 	if ((flags & O_CREAT) && !(flags & O_TRUNC))
480 		dev->end = lseek(dev->fd, (off_t) 0, SEEK_END);
481 
482 	dm_list_add(&_open_devices, &dev->open_list);
483 
484 	log_debug("Opened %s %s%s%s", dev_name(dev),
485 		  dev->flags & DEV_OPENED_RW ? "RW" : "RO",
486 		  dev->flags & DEV_OPENED_EXCL ? " O_EXCL" : "",
487 		  dev->flags & DEV_O_DIRECT ? " O_DIRECT" : "");
488 
489 	return 1;
490 }
491 
/*
 * Open 'dev' with O_DIRECT requested, logging an open failure only at
 * debug level.  The device is opened read-write when
 * vg_write_lock_held() is true, read-only otherwise.
 *
 * Returns 1 on success, 0 on failure.
 */
int dev_open_quiet(struct device *dev)
{
	const int flags = vg_write_lock_held() ? O_RDWR : O_RDONLY;

	return dev_open_flags(dev, flags, 1, 1);
}
500 
/*
 * Open 'dev' with O_DIRECT requested, reporting an open failure as an
 * error.  The device is opened read-write when vg_write_lock_held()
 * is true, read-only otherwise.
 *
 * Returns 1 on success, 0 on failure.
 */
int dev_open(struct device *dev)
{
	const int flags = vg_write_lock_held() ? O_RDWR : O_RDONLY;

	return dev_open_flags(dev, flags, 1, 0);
}
509 
/*
 * Test whether 'dev' can be opened with O_EXCL.
 *
 * The device is opened quietly with O_EXCL added and, if that
 * succeeds, closed again immediately.
 *
 * Returns the result of the open attempt: 1 on success, 0 on failure.
 */
int dev_test_excl(struct device *dev)
{
	const int flags = (vg_write_lock_held() ? O_RDWR : O_RDONLY) | O_EXCL;
	const int opened = dev_open_flags(dev, flags, 1, 1);

	if (opened)
		dev_close_immediate(dev);

	return opened;
}
524 
525 static void _close(struct device *dev)
526 {
527 	if (close(dev->fd))
528 		log_sys_error("close", dev_name(dev));
529 	dev->fd = -1;
530 	dev->block_size = -1;
531 	dm_list_del(&dev->open_list);
532 
533 	log_debug("Closed %s", dev_name(dev));
534 
535 	if (dev->flags & DEV_ALLOCED) {
536 		dm_free((void *) dm_list_item(dev->aliases.n, struct str_list)->
537 			 str);
538 		dm_free(dev->aliases.n);
539 		dm_free(dev);
540 	}
541 }
542 
543 static int _dev_close(struct device *dev, int immediate)
544 {
545 	struct lvmcache_info *info;
546 
547 	if (dev->fd < 0) {
548 		log_error("Attempt to close device '%s' "
549 			  "which is not open.", dev_name(dev));
550 		return 0;
551 	}
552 
553 #ifndef O_DIRECT_SUPPORT
554 	if (dev->flags & DEV_ACCESSED_W)
555 		dev_flush(dev);
556 #endif
557 
558 	if (dev->open_count > 0)
559 		dev->open_count--;
560 
561 	if (immediate && dev->open_count)
562 		log_debug("%s: Immediate close attempt while still referenced",
563 			  dev_name(dev));
564 
565 	/* Close unless device is known to belong to a locked VG */
566 	if (immediate ||
567 	    (dev->open_count < 1 &&
568 	     (!(info = info_from_pvid(dev->pvid, 0)) ||
569 	      !info->vginfo ||
570 	      !vgname_is_locked(info->vginfo->vgname))))
571 		_close(dev);
572 
573 	return 1;
574 }
575 
/*
 * Release one reference on 'dev'; the descriptor itself is closed
 * only when _dev_close() decides it is no longer needed.
 *
 * Returns 1 on success, 0 if the device was not open.
 */
int dev_close(struct device *dev)
{
	const int immediate = 0;

	return _dev_close(dev, immediate);
}
580 
/*
 * Release one reference on 'dev' and close its descriptor right away,
 * even if other references remain.
 *
 * Returns 1 on success, 0 if the device was not open.
 */
int dev_close_immediate(struct device *dev)
{
	const int immediate = 1;

	return _dev_close(dev, immediate);
}
585 
586 void dev_close_all(void)
587 {
588 	struct dm_list *doh, *doht;
589 	struct device *dev;
590 
591 	dm_list_iterate_safe(doh, doht, &_open_devices) {
592 		dev = dm_list_struct_base(doh, struct device, open_list);
593 		if (dev->open_count < 1)
594 			_close(dev);
595 	}
596 }
597 
598 int dev_read(struct device *dev, uint64_t offset, size_t len, void *buffer)
599 {
600 	struct device_area where;
601 
602 	if (!dev->open_count)
603 		return_0;
604 
605 	where.dev = dev;
606 	where.start = offset;
607 	where.size = len;
608 
609 	return _aligned_io(&where, buffer, 0);
610 }
611 
/*
 * Read from 'dev' into 'buf', possibly in 2 distinct regions, denoted
 * by (offset,len) and (offset2,len2).  Thus, the total size of
 * 'buf' should be len+len2.
 *
 * The second region is stored directly after the first one in 'buf'.
 * Returns 1 on success, 0 if either read failed.
 */
int dev_read_circular(struct device *dev, uint64_t offset, size_t len,
		      uint64_t offset2, size_t len2, void *buf)
{
	if (!dev_read(dev, offset, len, buf)) {
		log_error("Read from %s failed", dev_name(dev));
		return 0;
	}

	/*
	 * The second region is optional, and allows for
	 * a circular buffer on the device.
	 */
	if (!len2)
		return 1;

	/* Cast to char *: arithmetic on a void * is a GNU extension */
	if (!dev_read(dev, offset2, len2, (char *) buf + len)) {
		log_error("Circular read from %s failed",
			  dev_name(dev));
		return 0;
	}

	return 1;
}
640 
641 /* FIXME If O_DIRECT can't extend file, dev_extend first; dev_truncate after.
642  *       But fails if concurrent processes writing
643  */
644 
645 /* FIXME pre-extend the file */
646 int dev_append(struct device *dev, size_t len, void *buffer)
647 {
648 	int r;
649 
650 	if (!dev->open_count)
651 		return_0;
652 
653 	r = dev_write(dev, dev->end, len, buffer);
654 	dev->end += (uint64_t) len;
655 
656 #ifndef O_DIRECT_SUPPORT
657 	dev_flush(dev);
658 #endif
659 	return r;
660 }
661 
662 int dev_write(struct device *dev, uint64_t offset, size_t len, void *buffer)
663 {
664 	struct device_area where;
665 
666 	if (!dev->open_count)
667 		return_0;
668 
669 	where.dev = dev;
670 	where.start = offset;
671 	where.size = len;
672 
673 	dev->flags |= DEV_ACCESSED_W;
674 
675 	return _aligned_io(&where, buffer, 1);
676 }
677 
678 int dev_set(struct device *dev, uint64_t offset, size_t len, int value)
679 {
680 	size_t s;
681 	char buffer[4096] __attribute((aligned(8)));
682 
683 	if (!dev_open(dev))
684 		return_0;
685 
686 	if ((offset % SECTOR_SIZE) || (len % SECTOR_SIZE))
687 		log_debug("Wiping %s at %" PRIu64 " length %" PRIsize_t,
688 			  dev_name(dev), offset, len);
689 	else
690 		log_debug("Wiping %s at sector %" PRIu64 " length %" PRIsize_t
691 			  " sectors", dev_name(dev), offset >> SECTOR_SHIFT,
692 			  len >> SECTOR_SHIFT);
693 
694 	memset(buffer, value, sizeof(buffer));
695 	while (1) {
696 		s = len > sizeof(buffer) ? sizeof(buffer) : len;
697 		if (!dev_write(dev, offset, s, buffer))
698 			break;
699 
700 		len -= s;
701 		if (!len)
702 			break;
703 
704 		offset += s;
705 	}
706 
707 	dev->flags |= DEV_ACCESSED_W;
708 
709 	if (!dev_close(dev))
710 		stack;
711 
712 	return (len == 0);
713 }
714