xref: /netbsd-src/external/gpl2/lvm2/dist/lib/device/dev-io.c (revision 404fbe5fb94ca1e054339640cabb2801ce52dd30)
1 /*	$NetBSD: dev-io.c,v 1.3 2009/01/06 23:21:16 haad Exp $	*/
2 
3 /*
4  * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
5  * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6  *
7  * This file is part of LVM2.
8  *
9  * This copyrighted material is made available to anyone wishing to use,
10  * modify, copy, or redistribute it subject to the terms and conditions
11  * of the GNU Lesser General Public License v.2.1.
12  *
13  * You should have received a copy of the GNU Lesser General Public License
14  * along with this program; if not, write to the Free Software Foundation,
15  * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16  */
17 
18 #include "lib.h"
19 #include "lvm-types.h"
20 #include "device.h"
21 #include "metadata.h"
22 #include "lvmcache.h"
23 #include "memlock.h"
24 #include "locking.h"
25 
#include <limits.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
31 
32 #ifdef linux
33 #  define u64 uint64_t		/* Missing without __KERNEL__ */
34 #  undef WNOHANG		/* Avoid redefinition */
35 #  undef WUNTRACED		/* Avoid redefinition */
36 #  include <linux/fs.h>		/* For block ioctl definitions */
37 #  define BLKSIZE_SHIFT SECTOR_SHIFT
38 #  ifndef BLKGETSIZE64		/* fs.h out-of-date */
39 #    define BLKGETSIZE64 _IOR(0x12, 114, size_t)
40 #  endif /* BLKGETSIZE64 */
41 #elif __NetBSD__
42 #  include <sys/disk.h>
43 #  include <sys/disklabel.h>
44 #  include <sys/param.h>
45 #else
46 #  include <sys/disk.h>
47 #  define BLKBSZGET DKIOCGETBLOCKSIZE
48 #  define BLKSSZGET DKIOCGETBLOCKSIZE
49 #  define BLKGETSIZE64 DKIOCGETBLOCKCOUNT
50 #  define BLKFLSBUF DKIOCSYNCHRONIZECACHE
51 #  define BLKSIZE_SHIFT 0
52 #endif
53 
54 #ifdef O_DIRECT_SUPPORT
55 #  ifndef O_DIRECT
56 #    error O_DIRECT support configured but O_DIRECT definition not found in headers
57 #  endif
58 #endif
59 
60 static DM_LIST_INIT(_open_devices);
61 
62 /*-----------------------------------------------------------------
63  * The standard io loop that keeps submitting an io until it's
64  * all gone.
65  *---------------------------------------------------------------*/
66 static int _io(struct device_area *where, void *buffer, int should_write)
67 {
68 	int fd = dev_fd(where->dev);
69 	ssize_t n = 0;
70 	size_t total = 0;
71 
72 	if (fd < 0) {
73 		log_error("Attempt to read an unopened device (%s).",
74 			  dev_name(where->dev));
75 		return 0;
76 	}
77 
78 	/*
79 	 * Skip all writes in test mode.
80 	 */
81 	if (should_write && test_mode())
82 		return 1;
83 
84 	if (where->size > SSIZE_MAX) {
85 		log_error("Read size too large: %" PRIu64, where->size);
86 		return 0;
87 	}
88 
89 	if (lseek(fd, (off_t) where->start, SEEK_SET) < 0) {
90 		log_error("%s: lseek %" PRIu64 " failed: %s",
91 			  dev_name(where->dev), (uint64_t) where->start,
92 			  strerror(errno));
93 		return 0;
94 	}
95 
96 	while (total < (size_t) where->size) {
97 		do
98 			n = should_write ?
99 			    write(fd, buffer, (size_t) where->size - total) :
100 			    read(fd, buffer, (size_t) where->size - total);
101 		while ((n < 0) && ((errno == EINTR) || (errno == EAGAIN)));
102 
103 		if (n < 0)
104 			log_error("%s: %s failed after %" PRIu64 " of %" PRIu64
105 				  " at %" PRIu64 ": %s", dev_name(where->dev),
106 				  should_write ? "write" : "read",
107 				  (uint64_t) total,
108 				  (uint64_t) where->size,
109 				  (uint64_t) where->start, strerror(errno));
110 
111 		if (n <= 0)
112 			break;
113 
114 		total += n;
115 		buffer += n;
116 	}
117 
118 	return (total == (size_t) where->size);
119 }
120 
/*-----------------------------------------------------------------
 * LVM2 uses O_DIRECT when performing metadata io, which requires
 * block size aligned accesses.  If any io is not aligned we have
 * to perform the io via a bounce buffer, which is obviously quite
 * inefficient.
 *---------------------------------------------------------------*/
127 
128 /*
129  * Get the sector size from an _open_ device.
130  */
131 static int _get_block_size(struct device *dev, unsigned int *size)
132 {
133 	const char *name = dev_name(dev);
134 #ifdef __NetBSD__
135 	struct disklabel	lab;
136 #endif
137 
138 	if ((dev->block_size == -1)) {
139 #ifdef __NetBSD__
140 		if (ioctl(dev_fd(dev), DIOCGDINFO, &lab) < 0) {
141 			dev->block_size = DEV_BSIZE;
142 		} else
143 			dev->block_size = lab.d_secsize;
144 #else
145 		if (ioctl(dev_fd(dev), BLKBSZGET, &dev->block_size) < 0) {
146 			log_sys_error("ioctl BLKBSZGET", name);
147 			return 0;
148 		}
149 #endif
150 		log_debug("%s: block size is %u bytes", name, dev->block_size);
151 	}
152 
153 	*size = (unsigned int) dev->block_size;
154 
155 	return 1;
156 }
157 
158 /*
159  * Widens a region to be an aligned region.
160  */
161 static void _widen_region(unsigned int block_size, struct device_area *region,
162 			  struct device_area *result)
163 {
164 	uint64_t mask = block_size - 1, delta;
165 	memcpy(result, region, sizeof(*result));
166 
167 	/* adjust the start */
168 	delta = result->start & mask;
169 	if (delta) {
170 		result->start -= delta;
171 		result->size += delta;
172 	}
173 
174 	/* adjust the end */
175 	delta = (result->start + result->size) & mask;
176 	if (delta)
177 		result->size += block_size - delta;
178 }
179 
180 static int _aligned_io(struct device_area *where, void *buffer,
181 		       int should_write)
182 {
183 	void *bounce;
184 	unsigned int block_size = 0;
185 	uintptr_t mask;
186 	struct device_area widened;
187 
188 	if (!(where->dev->flags & DEV_REGULAR) &&
189 	    !_get_block_size(where->dev, &block_size))
190 		return_0;
191 
192 	if (!block_size)
193 		block_size = lvm_getpagesize();
194 
195 	_widen_region(block_size, where, &widened);
196 
197 	/* Do we need to use a bounce buffer? */
198 	mask = block_size - 1;
199 	if (!memcmp(where, &widened, sizeof(widened)) &&
200 	    !((uintptr_t) buffer & mask))
201 		return _io(where, buffer, should_write);
202 
203 	/* Allocate a bounce buffer with an extra block */
204 	if (!(bounce = alloca((size_t) widened.size + block_size))) {
205 		log_error("Bounce buffer alloca failed");
206 		return 0;
207 	}
208 
209 	/*
210 	 * Realign start of bounce buffer (using the extra sector)
211 	 */
212 	if (((uintptr_t) bounce) & mask)
213 		bounce = (void *) ((((uintptr_t) bounce) + mask) & ~mask);
214 
215 	/* channel the io through the bounce buffer */
216 	if (!_io(&widened, bounce, 0)) {
217 		if (!should_write)
218 			return_0;
219 		/* FIXME pre-extend the file */
220 		memset(bounce, '\n', widened.size);
221 	}
222 
223 	if (should_write) {
224 		memcpy(bounce + (where->start - widened.start), buffer,
225 		       (size_t) where->size);
226 
227 		/* ... then we write */
228 		return _io(&widened, bounce, 1);
229 	}
230 
231 	memcpy(buffer, bounce + (where->start - widened.start),
232 	       (size_t) where->size);
233 
234 	return 1;
235 }
236 
237 static int _dev_get_size_file(const struct device *dev, uint64_t *size)
238 {
239 	const char *name = dev_name(dev);
240 	struct stat info;
241 
242 	if (stat(name, &info)) {
243 		log_sys_error("stat", name);
244 		return 0;
245 	}
246 
247 	*size = info.st_size;
248 	*size >>= SECTOR_SHIFT;	/* Convert to sectors */
249 
250 	log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size);
251 
252 	return 1;
253 }
254 
255 static int _dev_get_size_dev(const struct device *dev, uint64_t *size)
256 {
257 	int fd;
258 	const char *name = dev_name(dev);
259 #ifdef __NetBSD__
260 	struct disklabel	lab;
261 	struct dkwedge_info     dkw;
262 #endif
263 
264 	if ((fd = open(name, O_RDONLY)) < 0) {
265 #ifndef __NetBSD__
266 		log_sys_error("open", name);
267 #endif
268 		return 0;
269 		}
270 
271 #ifdef __NetBSD__
272 	if ((*size = lseek (fd, 0, SEEK_END)) < 0) {
273 		log_sys_error("lseek SEEK_END", name);
274 		close(fd);
275 		return 0;
276 	}
277 
278 	if (ioctl(fd, DIOCGDINFO, &lab) < 0) {
279 		if (ioctl(fd, DIOCGWEDGEINFO, &dkw) < 0) {
280 			log_debug("ioctl DIOCGWEDGEINFO", name);
281 			close(fd);
282 			return 0;
283 		} else
284 			if (dkw.dkw_size)
285 				*size = dkw.dkw_size;
286 	} else
287 		if (lab.d_secsize)
288 			*size /= lab.d_secsize;
289 #else
290 	if (ioctl(fd, BLKGETSIZE64, size) < 0) {
291 		log_sys_error("ioctl BLKGETSIZE64", name);
292 		if (close(fd))
293 			log_sys_error("close", name);
294 		return 0;
295 	}
296 
297 	*size >>= BLKSIZE_SHIFT;	/* Convert to sectors */
298 #endif
299 	if (close(fd))
300 		log_sys_error("close", name);
301 
302 	log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size);
303 
304 	return 1;
305 }
306 
307 /*-----------------------------------------------------------------
308  * Public functions
309  *---------------------------------------------------------------*/
310 
311 int dev_get_size(const struct device *dev, uint64_t *size)
312 {
313 	if ((dev->flags & DEV_REGULAR))
314 		return _dev_get_size_file(dev, size);
315 	else
316 		return _dev_get_size_dev(dev, size);
317 }
318 
319 /* FIXME Unused
320 int dev_get_sectsize(struct device *dev, uint32_t *size)
321 {
322 	int fd;
323 	int s;
324 	const char *name = dev_name(dev);
325 
326 	if ((fd = open(name, O_RDONLY)) < 0) {
327 		log_sys_error("open", name);
328 		return 0;
329 	}
330 
331 	if (ioctl(fd, BLKSSZGET, &s) < 0) {
332 		log_sys_error("ioctl BLKSSZGET", name);
333 		if (close(fd))
334 			log_sys_error("close", name);
335 		return 0;
336 	}
337 
338 	if (close(fd))
339 		log_sys_error("close", name);
340 
341 	*size = (uint32_t) s;
342 
343 	log_very_verbose("%s: sector size is %" PRIu32 " bytes", name, *size);
344 
345 	return 1;
346 }
347 */
348 
349 void dev_flush(struct device *dev)
350 {
351 #ifdef __linux__
352 	if (!(dev->flags & DEV_REGULAR) && ioctl(dev->fd, BLKFLSBUF, 0) >= 0)
353 		return;
354 #endif
355 
356 	if (fsync(dev->fd) >= 0)
357 		return;
358 
359 	sync();
360 }
361 
362 int dev_open_flags(struct device *dev, int flags, int direct, int quiet)
363 {
364 	struct stat buf;
365 	const char *name;
366 	int need_excl = 0, need_rw = 0;
367 
368 	if ((flags & O_ACCMODE) == O_RDWR)
369 		need_rw = 1;
370 
371 	if ((flags & O_EXCL))
372 		need_excl = 1;
373 
374 	if (dev->fd >= 0) {
375 		if (((dev->flags & DEV_OPENED_RW) || !need_rw) &&
376 		    ((dev->flags & DEV_OPENED_EXCL) || !need_excl)) {
377 			dev->open_count++;
378 			return 1;
379 		}
380 
381 		if (dev->open_count && !need_excl) {
382 			/* FIXME Ensure we never get here */
383 			log_debug("WARNING: %s already opened read-only",
384 				  dev_name(dev));
385 			dev->open_count++;
386 		}
387 
388 		dev_close_immediate(dev);
389 	}
390 
391 	if (memlock())
392 		log_error("WARNING: dev_open(%s) called while suspended",
393 			  dev_name(dev));
394 
395 	if (dev->flags & DEV_REGULAR)
396 		name = dev_name(dev);
397 	else if (!(name = dev_name_confirmed(dev, quiet)))
398 		return_0;
399 
400 	if (!(dev->flags & DEV_REGULAR)) {
401 		if (stat(name, &buf) < 0) {
402 			log_sys_error("%s: stat failed", name);
403 			return 0;
404 		}
405 		if (buf.st_rdev != dev->dev) {
406 			log_error("%s: device changed", name);
407 			return 0;
408 		}
409 	}
410 
411 #ifdef O_DIRECT_SUPPORT
412 	if (direct) {
413 		if (!(dev->flags & DEV_O_DIRECT_TESTED))
414 			dev->flags |= DEV_O_DIRECT;
415 
416 		if ((dev->flags & DEV_O_DIRECT))
417 			flags |= O_DIRECT;
418 	}
419 #endif
420 
421 #ifdef O_NOATIME
422 	/* Don't update atime on device inodes */
423 	if (!(dev->flags & DEV_REGULAR))
424 		flags |= O_NOATIME;
425 #endif
426 
427 	if ((dev->fd = open(name, flags, 0777)) < 0) {
428 #ifdef O_DIRECT_SUPPORT
429 		if (direct && !(dev->flags & DEV_O_DIRECT_TESTED)) {
430 			flags &= ~O_DIRECT;
431 			if ((dev->fd = open(name, flags, 0777)) >= 0) {
432 				dev->flags &= ~DEV_O_DIRECT;
433 				log_debug("%s: Not using O_DIRECT", name);
434 				goto opened;
435 			}
436 		}
437 #endif
438 		if (quiet)
439 			log_sys_debug("open", name);
440 		else
441 			log_sys_error("open", name);
442 
443 		return 0;
444 	}
445 
446 #ifdef O_DIRECT_SUPPORT
447       opened:
448 	if (direct)
449 		dev->flags |= DEV_O_DIRECT_TESTED;
450 #endif
451 	dev->open_count++;
452 	dev->flags &= ~DEV_ACCESSED_W;
453 
454 	if (need_rw)
455 		dev->flags |= DEV_OPENED_RW;
456 	else
457 		dev->flags &= ~DEV_OPENED_RW;
458 
459 	if (need_excl)
460 		dev->flags |= DEV_OPENED_EXCL;
461 	else
462 		dev->flags &= ~DEV_OPENED_EXCL;
463 
464 	if (!(dev->flags & DEV_REGULAR) &&
465 	    ((fstat(dev->fd, &buf) < 0) || (buf.st_rdev != dev->dev))) {
466 		log_error("%s: fstat failed: Has device name changed?", name);
467 		dev_close_immediate(dev);
468 		return 0;
469 	}
470 
471 #ifndef O_DIRECT_SUPPORT
472 	if (!(dev->flags & DEV_REGULAR))
473 		dev_flush(dev);
474 #endif
475 
476 	if ((flags & O_CREAT) && !(flags & O_TRUNC))
477 		dev->end = lseek(dev->fd, (off_t) 0, SEEK_END);
478 
479 	dm_list_add(&_open_devices, &dev->open_list);
480 
481 	log_debug("Opened %s %s%s%s", dev_name(dev),
482 		  dev->flags & DEV_OPENED_RW ? "RW" : "RO",
483 		  dev->flags & DEV_OPENED_EXCL ? " O_EXCL" : "",
484 		  dev->flags & DEV_O_DIRECT ? " O_DIRECT" : "");
485 
486 	return 1;
487 }
488 
/*
 * Open 'dev' with O_DIRECT requested and the quiet flag set.  The
 * device is opened read-write when vg_write_lock_held() is true and
 * read-only otherwise.
 *
 * Returns what dev_open_flags() returns.
 */
int dev_open_quiet(struct device *dev)
{
	const int mode = vg_write_lock_held() ? O_RDWR : O_RDONLY;

	return dev_open_flags(dev, mode, 1, 1);
}
497 
/*
 * Open 'dev' with O_DIRECT requested and the quiet flag clear.  The
 * device is opened read-write when vg_write_lock_held() is true and
 * read-only otherwise.
 *
 * Returns what dev_open_flags() returns.
 */
int dev_open(struct device *dev)
{
	const int mode = vg_write_lock_held() ? O_RDWR : O_RDONLY;

	return dev_open_flags(dev, mode, 1, 0);
}
506 
/*
 * Test whether 'dev' can be opened with O_EXCL: quietly try such an
 * open and, if it works, close the device again straight away.
 *
 * Returns the result of the open attempt.
 */
int dev_test_excl(struct device *dev)
{
	int mode = O_EXCL | (vg_write_lock_held() ? O_RDWR : O_RDONLY);
	int opened = dev_open_flags(dev, mode, 1, 1);

	if (opened)
		dev_close_immediate(dev);

	return opened;
}
521 
522 static void _close(struct device *dev)
523 {
524 	if (close(dev->fd))
525 		log_sys_error("close", dev_name(dev));
526 	dev->fd = -1;
527 	dev->block_size = -1;
528 	dm_list_del(&dev->open_list);
529 
530 	log_debug("Closed %s", dev_name(dev));
531 
532 	if (dev->flags & DEV_ALLOCED) {
533 		dm_free((void *) dm_list_item(dev->aliases.n, struct str_list)->
534 			 str);
535 		dm_free(dev->aliases.n);
536 		dm_free(dev);
537 	}
538 }
539 
540 static int _dev_close(struct device *dev, int immediate)
541 {
542 	struct lvmcache_info *info;
543 
544 	if (dev->fd < 0) {
545 		log_error("Attempt to close device '%s' "
546 			  "which is not open.", dev_name(dev));
547 		return 0;
548 	}
549 
550 #ifndef O_DIRECT_SUPPORT
551 	if (dev->flags & DEV_ACCESSED_W)
552 		dev_flush(dev);
553 #endif
554 
555 	if (dev->open_count > 0)
556 		dev->open_count--;
557 
558 	if (immediate && dev->open_count)
559 		log_debug("%s: Immediate close attempt while still referenced",
560 			  dev_name(dev));
561 
562 	/* Close unless device is known to belong to a locked VG */
563 	if (immediate ||
564 	    (dev->open_count < 1 &&
565 	     (!(info = info_from_pvid(dev->pvid, 0)) ||
566 	      !info->vginfo ||
567 	      !vgname_is_locked(info->vginfo->vgname))))
568 		_close(dev);
569 
570 	return 1;
571 }
572 
/*
 * Release one reference on 'dev'; the descriptor may stay open while
 * it is still referenced or its VG is locked.
 *
 * Returns 0 if the device was not open, 1 otherwise.
 */
int dev_close(struct device *dev)
{
	const int immediate = 0;

	return _dev_close(dev, immediate);
}
577 
/*
 * Release one reference on 'dev' and close its descriptor now, even if
 * other references remain.
 *
 * Returns 0 if the device was not open, 1 otherwise.
 */
int dev_close_immediate(struct device *dev)
{
	const int immediate = 1;

	return _dev_close(dev, immediate);
}
582 
583 void dev_close_all(void)
584 {
585 	struct dm_list *doh, *doht;
586 	struct device *dev;
587 
588 	dm_list_iterate_safe(doh, doht, &_open_devices) {
589 		dev = dm_list_struct_base(doh, struct device, open_list);
590 		if (dev->open_count < 1)
591 			_close(dev);
592 	}
593 }
594 
595 int dev_read(struct device *dev, uint64_t offset, size_t len, void *buffer)
596 {
597 	struct device_area where;
598 
599 	if (!dev->open_count)
600 		return_0;
601 
602 	where.dev = dev;
603 	where.start = offset;
604 	where.size = len;
605 
606 	return _aligned_io(&where, buffer, 0);
607 }
608 
/*
 * Read from 'dev' into 'buf', possibly in 2 distinct regions, denoted
 * by (offset,len) and (offset2,len2).  Thus, the total size of
 * 'buf' should be len+len2.
 *
 * The second region is skipped when len2 is zero.
 *
 * Returns 1 on success, 0 if either read fails.
 */
int dev_read_circular(struct device *dev, uint64_t offset, size_t len,
		      uint64_t offset2, size_t len2, void *buf)
{
	if (!dev_read(dev, offset, len, buf)) {
		log_error("Read from %s failed", dev_name(dev));
		return 0;
	}

	/*
	 * The second region is optional, and allows for
	 * a circular buffer on the device.
	 */
	if (!len2)
		return 1;

	/* Step in bytes via char *: void * arithmetic is not ISO C. */
	if (!dev_read(dev, offset2, len2, (char *) buf + len)) {
		log_error("Circular read from %s failed",
			  dev_name(dev));
		return 0;
	}

	return 1;
}
637 
638 /* FIXME If O_DIRECT can't extend file, dev_extend first; dev_truncate after.
639  *       But fails if concurrent processes writing
640  */
641 
642 /* FIXME pre-extend the file */
643 int dev_append(struct device *dev, size_t len, void *buffer)
644 {
645 	int r;
646 
647 	if (!dev->open_count)
648 		return_0;
649 
650 	r = dev_write(dev, dev->end, len, buffer);
651 	dev->end += (uint64_t) len;
652 
653 #ifndef O_DIRECT_SUPPORT
654 	dev_flush(dev);
655 #endif
656 	return r;
657 }
658 
659 int dev_write(struct device *dev, uint64_t offset, size_t len, void *buffer)
660 {
661 	struct device_area where;
662 
663 	if (!dev->open_count)
664 		return_0;
665 
666 	where.dev = dev;
667 	where.start = offset;
668 	where.size = len;
669 
670 	dev->flags |= DEV_ACCESSED_W;
671 
672 	return _aligned_io(&where, buffer, 1);
673 }
674 
675 int dev_set(struct device *dev, uint64_t offset, size_t len, int value)
676 {
677 	size_t s;
678 	char buffer[4096] __attribute((aligned(8)));
679 
680 	if (!dev_open(dev))
681 		return_0;
682 
683 	if ((offset % SECTOR_SIZE) || (len % SECTOR_SIZE))
684 		log_debug("Wiping %s at %" PRIu64 " length %" PRIsize_t,
685 			  dev_name(dev), offset, len);
686 	else
687 		log_debug("Wiping %s at sector %" PRIu64 " length %" PRIsize_t
688 			  " sectors", dev_name(dev), offset >> SECTOR_SHIFT,
689 			  len >> SECTOR_SHIFT);
690 
691 	memset(buffer, value, sizeof(buffer));
692 	while (1) {
693 		s = len > sizeof(buffer) ? sizeof(buffer) : len;
694 		if (!dev_write(dev, offset, s, buffer))
695 			break;
696 
697 		len -= s;
698 		if (!len)
699 			break;
700 
701 		offset += s;
702 	}
703 
704 	dev->flags |= DEV_ACCESSED_W;
705 
706 	if (!dev_close(dev))
707 		stack;
708 
709 	return (len == 0);
710 }
711