xref: /netbsd-src/sys/ufs/ufs/ufs_readwrite.c (revision 63d4abf06d37aace2f9e41a494102a64fe3abddb)
1 /*	$NetBSD: ufs_readwrite.c,v 1.94 2009/02/22 20:28:07 ad Exp $	*/
2 
3 /*-
4  * Copyright (c) 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  *	@(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
32  */
33 
34 #include <sys/cdefs.h>
35 __KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.94 2009/02/22 20:28:07 ad Exp $");
36 
/*
 * The macros below select the type, inode field and entry-point names
 * that the READ/WRITE bodies in this file expand to: the lfs_* names
 * when LFS_READWRITE is defined, the ffs_* names otherwise.
 */
37 #ifdef LFS_READWRITE
38 #define	FS			struct lfs
39 #define	I_FS			i_lfs
40 #define	READ			lfs_read
41 #define	READ_S			"lfs_read"
42 #define	WRITE			lfs_write
43 #define	WRITE_S			"lfs_write"
/* Map the fs_* field names used in the shared code onto the lfs_* ones. */
44 #define	fs_bsize		lfs_bsize
45 #define	fs_bmask		lfs_bmask
/*
 * In the LFS build the WAPBL hooks are stubbed out here:
 * UFS_WAPBL_BEGIN() evaluates to 0 and the other two expand to empty
 * statements.
 */
46 #define	UFS_WAPBL_BEGIN(mp)	0
47 #define	UFS_WAPBL_END(mp)	do { } while (0)
48 #define	UFS_WAPBL_UPDATE(vp, access, modify, flags)	do { } while (0)
49 #else
/*
 * NOTE(review): the UFS_WAPBL_* macros are not defined in this branch;
 * presumably they come from a header supplied by the including file --
 * confirm.
 */
50 #define	FS			struct fs
51 #define	I_FS			i_fs
52 #define	READ			ffs_read
53 #define	READ_S			"ffs_read"
54 #define	WRITE			ffs_write
55 #define	WRITE_S			"ffs_write"
56 #endif
57 
58 /*
59  * Vnode op for reading.
60  */
61 /* ARGSUSED */
62 int
63 READ(void *v)
64 {
65 	struct vop_read_args /* {
66 		struct vnode *a_vp;
67 		struct uio *a_uio;
68 		int a_ioflag;
69 		kauth_cred_t a_cred;
70 	} */ *ap = v;
71 	struct vnode *vp;
72 	struct inode *ip;
73 	struct uio *uio;
74 	struct ufsmount *ump;
75 	struct buf *bp;
76 	FS *fs;
77 	vsize_t bytelen;
78 	daddr_t lbn, nextlbn;
79 	off_t bytesinfile;
80 	long size, xfersize, blkoffset;
81 	int error, ioflag;
82 	bool usepc = false;
83 
84 	vp = ap->a_vp;
85 	ip = VTOI(vp);
86 	ump = ip->i_ump;
87 	uio = ap->a_uio;
88 	ioflag = ap->a_ioflag;
89 	error = 0;
90 
91 #ifdef DIAGNOSTIC
92 	if (uio->uio_rw != UIO_READ)
93 		panic("%s: mode", READ_S);
94 
95 	if (vp->v_type == VLNK) {
96 		if (ip->i_size < ump->um_maxsymlinklen ||
97 		    (ump->um_maxsymlinklen == 0 && DIP(ip, blocks) == 0))
98 			panic("%s: short symlink", READ_S);
99 	} else if (vp->v_type != VREG && vp->v_type != VDIR)
100 		panic("%s: type %d", READ_S, vp->v_type);
101 #endif
102 	fs = ip->I_FS;
103 	if ((u_int64_t)uio->uio_offset > ump->um_maxfilesize)
104 		return (EFBIG);
105 	if (uio->uio_resid == 0)
106 		return (0);
107 
108 #ifndef LFS_READWRITE
109 	if ((ip->i_flags & SF_SNAPSHOT))
110 		return ffs_snapshot_read(vp, uio, ioflag);
111 #endif /* !LFS_READWRITE */
112 
113 	fstrans_start(vp->v_mount, FSTRANS_SHARED);
114 
115 	if (uio->uio_offset >= ip->i_size)
116 		goto out;
117 
118 #ifdef LFS_READWRITE
119 	usepc = (vp->v_type == VREG && ip->i_number != LFS_IFILE_INUM);
120 #else /* !LFS_READWRITE */
121 	usepc = vp->v_type == VREG;
122 #endif /* !LFS_READWRITE */
123 	if (usepc) {
124 		const int advice = IO_ADV_DECODE(ap->a_ioflag);
125 
126 		while (uio->uio_resid > 0) {
127 			if (ioflag & IO_DIRECT) {
128 				genfs_directio(vp, uio, ioflag);
129 			}
130 			bytelen = MIN(ip->i_size - uio->uio_offset,
131 			    uio->uio_resid);
132 			if (bytelen == 0)
133 				break;
134 			error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice,
135 			    UBC_READ | UBC_PARTIALOK |
136 			    (UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0));
137 			if (error)
138 				break;
139 		}
140 		goto out;
141 	}
142 
143 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
144 		bytesinfile = ip->i_size - uio->uio_offset;
145 		if (bytesinfile <= 0)
146 			break;
147 		lbn = lblkno(fs, uio->uio_offset);
148 		nextlbn = lbn + 1;
149 		size = blksize(fs, ip, lbn);
150 		blkoffset = blkoff(fs, uio->uio_offset);
151 		xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
152 		    bytesinfile);
153 
154 		if (lblktosize(fs, nextlbn) >= ip->i_size)
155 			error = bread(vp, lbn, size, NOCRED, 0, &bp);
156 		else {
157 			int nextsize = blksize(fs, ip, nextlbn);
158 			error = breadn(vp, lbn,
159 			    size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
160 		}
161 		if (error)
162 			break;
163 
164 		/*
165 		 * We should only get non-zero b_resid when an I/O error
166 		 * has occurred, which should cause us to break above.
167 		 * However, if the short read did not cause an error,
168 		 * then we want to ensure that we do not uiomove bad
169 		 * or uninitialized data.
170 		 */
171 		size -= bp->b_resid;
172 		if (size < xfersize) {
173 			if (size == 0)
174 				break;
175 			xfersize = size;
176 		}
177 		error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
178 		if (error)
179 			break;
180 		brelse(bp, 0);
181 	}
182 	if (bp != NULL)
183 		brelse(bp, 0);
184 
185  out:
186 	if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) {
187 		ip->i_flag |= IN_ACCESS;
188 		if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) {
189 			error = UFS_WAPBL_BEGIN(vp->v_mount);
190 			if (error) {
191 				fstrans_done(vp->v_mount);
192 				return error;
193 			}
194 			error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
195 			UFS_WAPBL_END(vp->v_mount);
196 		}
197 	}
198 
199 	fstrans_done(vp->v_mount);
200 	return (error);
201 }
202 
203 /*
 * Vnode op for writing.
 *
 * v points to a struct vop_write_args (a_vp, a_uio, a_ioflag, a_cred).
 *
 * Returns 0 on success or an errno value, in particular:
 *  - EPERM if a regular file has the APPEND inode flag and the write
 *    does not start at the current end of file (and, for LFS, if the
 *    target is the Ifile vnode);
 *  - EFBIG if the offset is negative, if offset + length exceeds the
 *    mount's maximum file size, or if it exceeds the process's
 *    RLIMIT_FSIZE (SIGXFSZ is posted in that last case).
 *
 * Regular files are written through ubc_uiomove(); other vnode types
 * go through the buffer cache (label bcache).  If any error occurs
 * after the transfer started, the file is truncated back to its
 * original size and the uio offset/residual are restored to their
 * values on entry.
 */
206 int
207 WRITE(void *v)
208 {
209 	struct vop_write_args /* {
210 		struct vnode *a_vp;
211 		struct uio *a_uio;
212 		int a_ioflag;
213 		kauth_cred_t a_cred;
214 	} */ *ap = v;
215 	struct vnode *vp;
216 	struct uio *uio;
217 	struct inode *ip;
218 	FS *fs;
219 	struct buf *bp;
220 	struct lwp *l;
221 	kauth_cred_t cred;
222 	daddr_t lbn;
223 	off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize;
224 	int blkoffset, error, flags, ioflag, resid, size, xfersize;
225 	int aflag;
226 	int extended=0;
227 	vsize_t bytelen;
228 	bool async;
229 	bool usepc = false;
230 #ifdef LFS_READWRITE
231 	bool need_unreserve = false;
232 #endif
233 	struct ufsmount *ump;
234 
235 	cred = ap->a_cred;
236 	ioflag = ap->a_ioflag;
237 	uio = ap->a_uio;
238 	vp = ap->a_vp;
239 	ip = VTOI(vp);
240 	ump = ip->i_ump;
241 
242 	KASSERT(vp->v_size == ip->i_size);
243 #ifdef DIAGNOSTIC
244 	if (uio->uio_rw != UIO_WRITE)
245 		panic("%s: mode", WRITE_S);
246 #endif
247 
	/*
	 * Per-type preconditions: IO_APPEND repositions a regular-file
	 * write to the current end of file, an APPEND-flagged inode only
	 * accepts writes at end of file, and directory writes must be
	 * synchronous.
	 */
248 	switch (vp->v_type) {
249 	case VREG:
250 		if (ioflag & IO_APPEND)
251 			uio->uio_offset = ip->i_size;
252 		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
253 			return (EPERM);
254 		/* FALLTHROUGH */
255 	case VLNK:
256 		break;
257 	case VDIR:
258 		if ((ioflag & IO_SYNC) == 0)
259 			panic("%s: nonsync dir write", WRITE_S);
260 		break;
261 	default:
262 		panic("%s: type", WRITE_S);
263 	}
264 
265 	fs = ip->I_FS;
266 	if (uio->uio_offset < 0 ||
267 	    (u_int64_t)uio->uio_offset + uio->uio_resid > ump->um_maxfilesize)
268 		return (EFBIG);
269 #ifdef LFS_READWRITE
270 	/* Disallow writes to the Ifile, even if noschg flag is removed */
271 	/* XXX can this go away when the Ifile is no longer in the namespace? */
272 	if (vp == fs->lfs_ivnode)
273 		return (EPERM);
274 #endif
275 	/*
276 	 * Maybe this should be above the vnode op call, but so long as
277 	 * file servers have no limits, I don't think it matters.
278 	 */
279 	l = curlwp;
280 	if (vp->v_type == VREG && l &&
281 	    uio->uio_offset + uio->uio_resid >
282 	    l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
283 		mutex_enter(proc_lock);
284 		psignal(l->l_proc, SIGXFSZ);
285 		mutex_exit(proc_lock);
286 		return (EFBIG);
287 	}
288 	if (uio->uio_resid == 0)
289 		return (0);
290 
291 	fstrans_start(vp->v_mount, FSTRANS_SHARED);
292 
	/*
	 * Remember the starting offset, residual and file size: the out:
	 * path compares against them to tell whether anything was written
	 * and uses them to undo a failed write.
	 */
293 	flags = ioflag & IO_SYNC ? B_SYNC : 0;
294 	async = vp->v_mount->mnt_flag & MNT_ASYNC;
295 	origoff = uio->uio_offset;
296 	resid = uio->uio_resid;
297 	osize = ip->i_size;
298 	error = 0;
299 
	/* Only regular files take the ubc path; see the !usepc test below. */
300 	usepc = vp->v_type == VREG;
301 
	/*
	 * Call UFS_WAPBL_BEGIN() unless IO_JOURNALLOCKED is set; the
	 * matching UFS_WAPBL_END() at out: is guarded by the same test.
	 */
302 	if ((ioflag & IO_JOURNALLOCKED) == 0) {
303 		error = UFS_WAPBL_BEGIN(vp->v_mount);
304 		if (error) {
305 			fstrans_done(vp->v_mount);
306 			return error;
307 		}
308 	}
309 
310 #ifdef LFS_READWRITE
311 	async = true;
312 	lfs_check(vp, LFS_UNUSED_LBN, 0);
313 #endif /* LFS_READWRITE */
314 	if (!usepc)
315 		goto bcache;
316 
	/*
	 * [preallocoff, endallocoff) is the range in which the copy loop
	 * below treats a chunk as an "overwrite": it starts at the
	 * page-rounded block boundary at or after max(old size, start
	 * offset) and ends at the start of the block containing the new
	 * end of file.
	 */
317 	preallocoff = round_page(blkroundup(fs, MAX(osize, uio->uio_offset)));
318 	aflag = ioflag & IO_SYNC ? B_SYNC : 0;
319 	nsize = MAX(osize, uio->uio_offset + uio->uio_resid);
320 	endallocoff = nsize - blkoff(fs, nsize);
321 
322 	/*
323 	 * if we're increasing the file size, deal with expanding
324 	 * the fragment if there is one.
325 	 */
326 
327 	if (nsize > osize && lblkno(fs, osize) < NDADDR &&
328 	    lblkno(fs, osize) != lblkno(fs, nsize) &&
329 	    blkroundup(fs, osize) != osize) {
330 		off_t eob;
331 
332 		eob = blkroundup(fs, osize);
333 		uvm_vnp_setwritesize(vp, eob);
334 		error = ufs_balloc_range(vp, osize, eob - osize, cred, aflag);
335 		if (error)
336 			goto out;
337 		if (flags & B_SYNC) {
			/*
			 * NOTE(review): v_interlock is taken here and never
			 * released in this function; presumably
			 * VOP_PUTPAGES() drops it -- confirm.  The return
			 * value of this flush is ignored.
			 */
338 			mutex_enter(&vp->v_interlock);
339 			VOP_PUTPAGES(vp, trunc_page(osize & fs->fs_bmask),
340 			    round_page(eob),
341 			    PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED);
342 		}
343 	}
344 
	/* Copy loop: at most one file system block per iteration. */
345 	while (uio->uio_resid > 0) {
346 		int ubc_flags = UBC_WRITE;
347 		bool overwrite; /* true if we're overwriting a whole block */
348 		off_t newoff;
349 
350 		if (ioflag & IO_DIRECT) {
351 			genfs_directio(vp, uio, ioflag | IO_JOURNALLOCKED);
352 		}
353 
354 		oldoff = uio->uio_offset;
355 		blkoffset = blkoff(fs, uio->uio_offset);
356 		bytelen = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);
357 		if (bytelen == 0) {
358 			break;
359 		}
360 
361 		/*
362 		 * if we're filling in a hole, allocate the blocks now and
363 		 * initialize the pages first.  if we're extending the file,
364 		 * we can safely allocate blocks without initializing pages
365 		 * since the new blocks will be inaccessible until the write
366 		 * is complete.
367 		 */
368 		overwrite = uio->uio_offset >= preallocoff &&
369 		    uio->uio_offset < endallocoff;
		/*
		 * Outside that range, a chunk can still be treated as an
		 * overwrite when the vnode is not VV_MAPPED and the offset
		 * is both block- and page-aligned; bytelen is then trimmed
		 * to a page- and block-aligned length.
		 */
370 		if (!overwrite && (vp->v_vflag & VV_MAPPED) == 0 &&
371 		    blkoff(fs, uio->uio_offset) == 0 &&
372 		    (uio->uio_offset & PAGE_MASK) == 0) {
373 			vsize_t len;
374 
375 			len = trunc_page(bytelen);
376 			len -= blkoff(fs, len);
377 			if (len > 0) {
378 				overwrite = true;
379 				bytelen = len;
380 			}
381 		}
382 
383 		newoff = oldoff + bytelen;
384 		if (vp->v_size < newoff) {
385 			uvm_vnp_setwritesize(vp, newoff);
386 		}
387 
388 		if (!overwrite) {
389 			error = ufs_balloc_range(vp, uio->uio_offset, bytelen,
390 			    cred, aflag);
391 			if (error)
392 				break;
393 		} else {
394 			genfs_node_wrlock(vp);
395 			error = GOP_ALLOC(vp, uio->uio_offset, bytelen,
396 			    aflag, cred);
397 			genfs_node_unlock(vp);
398 			if (error)
399 				break;
400 			ubc_flags |= UBC_FAULTBUSY;
401 		}
402 
403 		/*
404 		 * copy the data.
405 		 */
406 
407 		ubc_flags |= UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0;
408 		error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
409 		    IO_ADV_DECODE(ioflag), ubc_flags);
410 
411 		/*
412 		 * update UVM's notion of the size now that we've
413 		 * copied the data into the vnode's pages.
414 		 *
415 		 * we should update the size even when uiomove failed.
416 		 */
417 
418 		if (vp->v_size < newoff) {
419 			uvm_vnp_setsize(vp, newoff);
420 			extended = 1;
421 		}
422 
423 		if (error)
424 			break;
425 
426 		/*
427 		 * flush what we just wrote if necessary.
428 		 * XXXUBC simplistic async flushing.
429 		 */
430 
431 #ifndef LFS_READWRITE
		/*
		 * Unless the mount is MNT_ASYNC, start a flush each time
		 * the write crosses a 64KB (1 << 16) boundary.
		 */
432 		if (!async && oldoff >> 16 != uio->uio_offset >> 16) {
433 			mutex_enter(&vp->v_interlock);
434 			error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16,
435 			    (uio->uio_offset >> 16) << 16,
436 			    PGO_CLEANIT | PGO_JOURNALLOCKED);
437 			if (error)
438 				break;
439 		}
440 #endif
441 	}
	/* For IO_SYNC, push the whole written range out synchronously. */
442 	if (error == 0 && ioflag & IO_SYNC) {
443 		mutex_enter(&vp->v_interlock);
444 		error = VOP_PUTPAGES(vp, trunc_page(origoff & fs->fs_bmask),
445 		    round_page(blkroundup(fs, uio->uio_offset)),
446 		    PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED);
447 	}
448 	goto out;
449 
	/*
	 * Buffer cache path for everything that is not a regular file.
	 * The pages covering the target range are first cleaned and freed
	 * (PGO_CLEANIT | PGO_FREE); the result of that call is ignored.
	 */
450  bcache:
451 	mutex_enter(&vp->v_interlock);
452 	VOP_PUTPAGES(vp, trunc_page(origoff), round_page(origoff + resid),
453 	    PGO_CLEANIT | PGO_FREE | PGO_SYNCIO | PGO_JOURNALLOCKED);
454 	while (uio->uio_resid > 0) {
455 		lbn = lblkno(fs, uio->uio_offset);
456 		blkoffset = blkoff(fs, uio->uio_offset);
457 		xfersize = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);
		/* Request B_CLRBUF only for a partial-block transfer. */
458 		if (fs->fs_bsize > xfersize)
459 			flags |= B_CLRBUF;
460 		else
461 			flags &= ~B_CLRBUF;
462 
463 #ifdef LFS_READWRITE
464 		error = lfs_reserve(fs, vp, NULL,
465 		    btofsb(fs, (NIADDR + 1) << fs->lfs_bshift));
466 		if (error)
467 			break;
468 		need_unreserve = true;
469 #endif
470 		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
471 		    ap->a_cred, flags, &bp);
472 
473 		if (error)
474 			break;
		/* Grow the inode and UVM sizes before copying the data in. */
475 		if (uio->uio_offset + xfersize > ip->i_size) {
476 			ip->i_size = uio->uio_offset + xfersize;
477 			DIP_ASSIGN(ip, size, ip->i_size);
478 			uvm_vnp_setsize(vp, ip->i_size);
479 			extended = 1;
480 		}
481 		size = blksize(fs, ip, lbn) - bp->b_resid;
482 		if (xfersize > size)
483 			xfersize = size;
484 
485 		error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
486 
487 		/*
488 		 * if we didn't clear the block and the uiomove failed,
489 		 * the buf will now contain part of some other file,
490 		 * so we need to invalidate it.
491 		 */
492 		if (error && (flags & B_CLRBUF) == 0) {
493 			brelse(bp, BC_INVAL);
494 			break;
495 		}
496 #ifdef LFS_READWRITE
497 		(void)VOP_BWRITE(bp);
498 		lfs_reserve(fs, vp, NULL,
499 		    -btofsb(fs, (NIADDR + 1) << fs->lfs_bshift));
500 		need_unreserve = false;
501 #else
		/*
		 * IO_SYNC writes use bwrite(), a chunk that ends exactly
		 * at the block boundary uses bawrite(), anything else
		 * uses bdwrite().
		 */
502 		if (ioflag & IO_SYNC)
503 			(void)bwrite(bp);
504 		else if (xfersize + blkoffset == fs->fs_bsize)
505 			bawrite(bp);
506 		else
507 			bdwrite(bp);
508 #endif
509 		if (error || xfersize == 0)
510 			break;
511 	}
512 #ifdef LFS_READWRITE
	/* Give back the reservation if the loop exited while holding one. */
513 	if (need_unreserve) {
514 		lfs_reserve(fs, vp, NULL,
515 		    -btofsb(fs, (NIADDR + 1) << fs->lfs_bshift));
516 	}
517 #endif
518 
519 	/*
520 	 * If we successfully wrote any data, and we are not the superuser
521 	 * we clear the setuid and setgid bits as a precaution against
522 	 * tampering.
523 	 */
524 out:
	/* resid > uio->uio_resid means at least one byte was transferred. */
525 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * NOTE(review): a non-zero kauth_authorize_generic() result is
	 * presumably "not the superuser", per the comment above -- confirm.
	 */
526 	if (resid > uio->uio_resid && ap->a_cred &&
527 	    kauth_authorize_generic(ap->a_cred, KAUTH_GENERIC_ISSUSER, NULL)) {
528 		ip->i_mode &= ~(ISUID | ISGID);
529 		DIP_ASSIGN(ip, mode, ip->i_mode);
530 	}
531 	if (resid > uio->uio_resid)
532 		VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
533 	if (error) {
		/*
		 * Undo a failed write: truncate back to the original size
		 * and restore the uio offset and residual to their values
		 * on entry.
		 */
534 		(void) UFS_TRUNCATE(vp, osize, ioflag & IO_SYNC, ap->a_cred);
535 		uio->uio_offset -= resid - uio->uio_resid;
536 		uio->uio_resid = resid;
537 	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC)
538 		error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
539 	else
540 		UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
541 	KASSERT(vp->v_size == ip->i_size);
542 	if ((ioflag & IO_JOURNALLOCKED) == 0)
543 		UFS_WAPBL_END(vp->v_mount);
544 	fstrans_done(vp->v_mount);
545 
546 	return (error);
547 }
547 }
548