/*	$NetBSD: ufs_readwrite.c,v 1.67 2006/03/01 12:38:33 yamt Exp $	*/

/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_readwrite.c,v 1.67 2006/03/01 12:38:33 yamt Exp $");

#ifdef LFS_READWRITE
#define	BLKSIZE(a, b, c)	blksize(a, b, c)
#define	FS			struct lfs
#define	I_FS			i_lfs
#define	READ			lfs_read
#define	READ_S			"lfs_read"
#define	WRITE			lfs_write
#define	WRITE_S			"lfs_write"
#define	fs_bsize		lfs_bsize
#define	fs_bmask		lfs_bmask
#else
#define	BLKSIZE(a, b, c)	blksize(a, b, c)
#define	FS			struct fs
#define	I_FS			i_fs
#define	READ			ffs_read
#define	READ_S			"ffs_read"
#define	WRITE			ffs_write
#define	WRITE_S			"ffs_write"
#endif
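
/*
 * This file is not compiled standalone; the FFS and LFS vnode-operation
 * sources each include it, so the code below is built twice: once as
 * ffs_read()/ffs_write() operating on "struct fs", and once, with
 * LFS_READWRITE defined, as lfs_read()/lfs_write() operating on
 * "struct lfs".  A minimal sketch of the expansion when LFS_READWRITE
 * is defined:
 *
 *	int READ(void *v)	->	int lfs_read(void *v)
 *	fs = ip->I_FS;		->	fs = ip->i_lfs;
 */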

/*
 * Vnode op for reading.
 */
/* ARGSUSED */
int
READ(void *v)
{
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap = v;
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct ufsmount *ump;
	struct buf *bp;
	FS *fs;
	void *win;
	vsize_t bytelen;
	daddr_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, flags;
	boolean_t usepc = FALSE;

	vp = ap->a_vp;
	ip = VTOI(vp);
	ump = ip->i_ump;
	uio = ap->a_uio;
	error = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("%s: mode", READ_S);

	if (vp->v_type == VLNK) {
		if (ip->i_size < ump->um_maxsymlinklen ||
		    (ump->um_maxsymlinklen == 0 && DIP(ip, blocks) == 0))
			panic("%s: short symlink", READ_S);
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("%s: type %d", READ_S, vp->v_type);
#endif
	fs = ip->I_FS;
	if ((u_int64_t)uio->uio_offset > ump->um_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset >= ip->i_size)
		goto out;

#ifdef LFS_READWRITE
	usepc = (vp->v_type == VREG && ip->i_number != LFS_IFILE_INUM);
#else /* !LFS_READWRITE */
	usepc = vp->v_type == VREG;
#endif /* !LFS_READWRITE */
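
	/*
	 * Regular files are read through the unified buffer cache: map a
	 * window over the file's pages, copy out of it, release it,
	 * repeat.  Everything else (directories, long symlinks, and under
	 * LFS the ifile) takes the traditional buffer-cache loop below.
	 */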
	if (usepc) {
		const int advice = IO_ADV_DECODE(ap->a_ioflag);

		while (uio->uio_resid > 0) {
			bytelen = MIN(ip->i_size - uio->uio_offset,
			    uio->uio_resid);
			if (bytelen == 0)
				break;

			win = ubc_alloc(&vp->v_uobj, uio->uio_offset,
			    &bytelen, advice, UBC_READ);
			error = uiomove(win, bytelen, uio);
			flags = UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0;
			ubc_release(win, flags);
			if (error)
				break;
		}
		goto out;
	}

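	/*
	 * Buffer-cache path, one logical block per iteration.  While more
	 * of the file remains, breadn() starts an asynchronous read-ahead
	 * of the next block alongside the synchronous read of this one.
	 * Illustrative numbers for an 8KB block size at uio_offset 12288:
	 * lbn = 1, blkoffset = 4096, and at most 4096 bytes are copied
	 * from this block.
	 */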
	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		bytesinfile = ip->i_size - uio->uio_offset;
		if (bytesinfile <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;
		size = BLKSIZE(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
		    bytesinfile);

		if (lblktosize(fs, nextlbn) >= ip->i_size)
			error = bread(vp, lbn, size, NOCRED, &bp);
		else {
			int nextsize = BLKSIZE(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error)
			break;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}
		error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
		if (error)
			break;
		brelse(bp);
	}
	if (bp != NULL)
		brelse(bp);

 out:
	if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) {
		ip->i_flag |= IN_ACCESS;
		if ((ap->a_ioflag & IO_SYNC) == IO_SYNC)
			error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
	}
	return (error);
}

/*
 * Vnode op for writing.
 */
int
WRITE(void *v)
{
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap = v;
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct genfs_node *gp;
	FS *fs;
	struct buf *bp;
	struct lwp *l;
	struct ucred *cred;
	daddr_t lbn;
	off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize;
	int blkoffset, error, flags, ioflag, resid, size, xfersize;
	int aflag;
	int ubc_alloc_flags, ubc_release_flags;
	int extended = 0;
	void *win;
	vsize_t bytelen;
	boolean_t async;
	boolean_t usepc = FALSE;
#ifdef LFS_READWRITE
	boolean_t need_unreserve = FALSE;
#endif
	struct ufsmount *ump;

	cred = ap->a_cred;
	ioflag = ap->a_ioflag;
	uio = ap->a_uio;
	vp = ap->a_vp;
	ip = VTOI(vp);
	gp = VTOG(vp);
	ump = ip->i_ump;

	KASSERT(vp->v_size == ip->i_size);
#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("%s: mode", WRITE_S);
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		if ((ioflag & IO_SYNC) == 0)
			panic("%s: nonsync dir write", WRITE_S);
		break;
	default:
		panic("%s: type", WRITE_S);
	}

	fs = ip->I_FS;
	if (uio->uio_offset < 0 ||
	    (u_int64_t)uio->uio_offset + uio->uio_resid > ump->um_maxfilesize)
		return (EFBIG);
#ifdef LFS_READWRITE
	/* Disallow writes to the Ifile, even if noschg flag is removed */
	/* XXX can this go away when the Ifile is no longer in the namespace? */
	if (vp == fs->lfs_ivnode)
		return (EPERM);
#endif
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	l = curlwp;
	if (vp->v_type == VREG && l &&
	    uio->uio_offset + uio->uio_resid >
	    l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(l->l_proc, SIGXFSZ);
		return (EFBIG);
	}
	if (uio->uio_resid == 0)
		return (0);

	flags = ioflag & IO_SYNC ? B_SYNC : 0;
	async = vp->v_mount->mnt_flag & MNT_ASYNC;
	origoff = uio->uio_offset;
	resid = uio->uio_resid;
	osize = ip->i_size;
	error = 0;

	usepc = vp->v_type == VREG;
#ifdef LFS_READWRITE
	async = TRUE;
	lfs_check(vp, LFS_UNUSED_LBN, 0);
#endif /* LFS_READWRITE */
	if (!usepc)
		goto bcache;

	preallocoff = round_page(blkroundup(fs, MAX(osize, uio->uio_offset)));
	aflag = ioflag & IO_SYNC ? B_SYNC : 0;
	nsize = MAX(osize, uio->uio_offset + uio->uio_resid);
	endallocoff = nsize - blkoff(fs, nsize);
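
	/*
	 * Illustrative numbers, assuming 8KB blocks and 4KB pages: with
	 * osize = uio_offset = 5000 and uio_resid = 20000,
	 * preallocoff = round_page(blkroundup(fs, 5000)) = 8192,
	 * nsize = 25000, and endallocoff = 25000 - blkoff(fs, 25000) =
	 * 24576.  Offsets in [8192, 24576) touch only whole new blocks,
	 * which can be allocated without first initializing their pages.
	 */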

	/*
	 * if we're increasing the file size, deal with expanding
	 * the fragment if there is one.
	 */

	if (nsize > osize && lblkno(fs, osize) < NDADDR &&
	    lblkno(fs, osize) != lblkno(fs, nsize) &&
	    blkroundup(fs, osize) != osize) {
		off_t eob;

		eob = blkroundup(fs, osize);
		error = ufs_balloc_range(vp, osize, eob - osize, cred, aflag);
		if (error)
			goto out;
		if (flags & B_SYNC) {
			vp->v_size = eob;
			simple_lock(&vp->v_interlock);
			VOP_PUTPAGES(vp, trunc_page(osize & fs->fs_bmask),
			    round_page(eob), PGO_CLEANIT | PGO_SYNCIO);
		}
	}
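
	/*
	 * Worked example of the fragment case, assuming bsize = 8192 and
	 * fsize = 1024: a file with osize = 5000 ends in a five-fragment
	 * partial block, so before the file grows into further blocks the
	 * fragment is expanded to a full block (eob = blkroundup(fs, 5000)
	 * = 8192) and the pages over [osize, eob) are initialized.
	 */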

	ubc_alloc_flags = UBC_WRITE;
	while (uio->uio_resid > 0) {
		boolean_t extending; /* if we're extending a whole block */
		off_t newoff;

		oldoff = uio->uio_offset;
		blkoffset = blkoff(fs, uio->uio_offset);
		bytelen = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);

		/*
		 * if we're filling in a hole, allocate the blocks now and
		 * initialize the pages first.  if we're extending the file,
		 * we can safely allocate blocks without initializing pages
		 * since the new blocks will be inaccessible until the write
		 * is complete.
		 */
		extending = uio->uio_offset >= preallocoff &&
		    uio->uio_offset < endallocoff;

		if (!extending) {
			error = ufs_balloc_range(vp, uio->uio_offset, bytelen,
			    cred, aflag);
			if (error)
				break;
			ubc_alloc_flags &= ~UBC_FAULTBUSY;
		} else {
			lockmgr(&gp->g_glock, LK_EXCLUSIVE, NULL);
			error = GOP_ALLOC(vp, uio->uio_offset, bytelen,
			    aflag, cred);
			lockmgr(&gp->g_glock, LK_RELEASE, NULL);
			if (error)
				break;
			ubc_alloc_flags |= UBC_FAULTBUSY;
		}
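
		/*
		 * A note on UBC_FAULTBUSY: roughly, it makes ubc_alloc()
		 * fault the pages in up front, with new pages arriving
		 * zero-filled, instead of letting uiomove() fault them
		 * lazily, which could read the just-allocated but still
		 * uninitialized disk blocks.  It also guarantees the
		 * window is mapped, which the error-path memset() below
		 * relies on.
		 */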

		/*
		 * copy the data.
		 */

		win = ubc_alloc(&vp->v_uobj, uio->uio_offset, &bytelen,
		    UVM_ADV_NORMAL, ubc_alloc_flags);
		error = uiomove(win, bytelen, uio);
		if (error && extending) {
			/*
			 * if we haven't initialized the pages yet,
			 * do it now.  it's safe to use memset here
			 * because we just mapped the pages above.
			 */
			memset(win, 0, bytelen);
		}
		ubc_release_flags = UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0;
		ubc_release(win, ubc_release_flags);

		/*
		 * update UVM's notion of the size now that we've
		 * copied the data into the vnode's pages.
		 *
		 * we should update the size even when uiomove failed.
		 * otherwise ffs_truncate can't flush soft update states.
		 */

		newoff = oldoff + bytelen;
		if (vp->v_size < newoff) {
			uvm_vnp_setsize(vp, newoff);
			extended = 1;
		}

		if (error)
			break;

		/*
		 * flush what we just wrote if necessary.
		 * XXXUBC simplistic async flushing: each time the write
		 * crosses a 64KB boundary (oldoff >> 16 changes), clean
		 * the 64KB chunks completed so far.
		 */

		if (!async && oldoff >> 16 != uio->uio_offset >> 16) {
			simple_lock(&vp->v_interlock);
			error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16,
			    (uio->uio_offset >> 16) << 16, PGO_CLEANIT);
			if (error)
				break;
		}
	}
	if (error == 0 && ioflag & IO_SYNC) {
		simple_lock(&vp->v_interlock);
		error = VOP_PUTPAGES(vp, trunc_page(origoff & fs->fs_bmask),
		    round_page(blkroundup(fs, uio->uio_offset)),
		    PGO_CLEANIT | PGO_SYNCIO);
	}
	goto out;

 bcache:
	simple_lock(&vp->v_interlock);
	VOP_PUTPAGES(vp, trunc_page(origoff), round_page(origoff + resid),
	    PGO_CLEANIT | PGO_FREE | PGO_SYNCIO);
	while (uio->uio_resid > 0) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);
		if (fs->fs_bsize > xfersize)
			flags |= B_CLRBUF;
		else
			flags &= ~B_CLRBUF;
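
		/*
		 * B_CLRBUF matters only for partial-block writes: when
		 * just part of a block is written, UFS_BALLOC must read
		 * in (or zero, for a freshly allocated block) the rest of
		 * the buffer so that valid data survives around the
		 * write.  A full-block write overwrites the whole buffer,
		 * so that work can be skipped.
		 */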

#ifdef LFS_READWRITE
		error = lfs_reserve(fs, vp, NULL,
		    btofsb(fs, (NIADDR + 1) << fs->lfs_bshift));
		if (error)
			break;
		need_unreserve = TRUE;
#endif
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);

		if (error)
			break;
		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_ASSIGN(ip, size, ip->i_size);
			uvm_vnp_setsize(vp, ip->i_size);
			extended = 1;
		}
		size = BLKSIZE(fs, ip, lbn) - bp->b_resid;
		if (xfersize > size)
			xfersize = size;

		error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);

		/*
		 * if we didn't clear the block and the uiomove failed,
		 * the buf will now contain part of some other file,
		 * so we need to invalidate it.
		 */
		if (error && (flags & B_CLRBUF) == 0) {
			bp->b_flags |= B_INVAL;
			brelse(bp);
			break;
		}
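		/*
		 * Push the buffer out.  LFS always uses VOP_BWRITE() so
		 * that the block is accounted to the current segment.  In
		 * the FFS case: synchronous writes go to disk immediately
		 * via bwrite(); a completed full block goes out
		 * asynchronously via bawrite(), since it will not be
		 * written again soon; and a partial block is delayed via
		 * bdwrite() in the hope that the rest of it will be
		 * filled in shortly.
		 */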
#ifdef LFS_READWRITE
		(void)VOP_BWRITE(bp);
		lfs_reserve(fs, vp, NULL,
		    -btofsb(fs, (NIADDR + 1) << fs->lfs_bshift));
		need_unreserve = FALSE;
#else
		if (ioflag & IO_SYNC)
			(void)bwrite(bp);
		else if (xfersize + blkoffset == fs->fs_bsize)
			bawrite(bp);
		else
			bdwrite(bp);
#endif
		if (error || xfersize == 0)
			break;
	}
#ifdef LFS_READWRITE
	if (need_unreserve) {
		lfs_reserve(fs, vp, NULL,
		    -btofsb(fs, (NIADDR + 1) << fs->lfs_bshift));
	}
#endif

	/*
	 * If we successfully wrote any data and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
out:
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0) {
		ip->i_mode &= ~(ISUID | ISGID);
		DIP_ASSIGN(ip, mode, ip->i_mode);
	}
	if (resid > uio->uio_resid)
		VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
	if (error) {
		(void) UFS_TRUNCATE(vp, osize, ioflag & IO_SYNC, ap->a_cred,
		    curlwp);
		uio->uio_offset -= resid - uio->uio_resid;
		uio->uio_resid = resid;
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC)
		error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
	KASSERT(vp->v_size == ip->i_size);
	return (error);
}