1 /* $NetBSD: ulfs_readwrite.c,v 1.28 2021/10/20 03:08:19 thorpej Exp $ */
2 /* from NetBSD: ufs_readwrite.c,v 1.120 2015/04/12 22:48:38 riastradh Exp */
3
4 /*-
5 * Copyright (c) 1993
6 * The Regents of the University of California. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95
33 */
34
35 #include <sys/cdefs.h>
36 __KERNEL_RCSID(1, "$NetBSD: ulfs_readwrite.c,v 1.28 2021/10/20 03:08:19 thorpej Exp $");
37
/*
 * Generic-to-LFS name mapping.  The routines in this file are written
 * against these spellings (the file derives from ufs_readwrite.c), so
 * each one is bound to its lfs counterpart here.
 */

/* The file system structure, and the inode member pointing at it. */
#define	FS			struct lfs
#define	I_FS			i_lfs

/* Entry points: page-cache read/write and buffer-cache read/write. */
#define	READ			lfs_read
#define	WRITE			lfs_write
#define	BUFRD			lfs_bufrd
#define	BUFWR			lfs_bufwr

/* Printable names of the page-cache entry points. */
#define	READ_S			"lfs_read"
#define	WRITE_S			"lfs_write"

/* Superblock accessors. */
#define	fs_sb_getbsize(fs)	lfs_sb_getbsize(fs)
#define	fs_bmask		lfs_bmask
48
49 static int ulfs_post_read_update(struct vnode *, int, int);
50 static int ulfs_post_write_update(struct vnode *, struct uio *, int,
51 kauth_cred_t, off_t, int, int);
52
53 /*
54 * Vnode op for reading.
55 */
56 /* ARGSUSED */
57 int
READ(void * v)58 READ(void *v)
59 {
60 struct vop_read_args /* {
61 struct vnode *a_vp;
62 struct uio *a_uio;
63 int a_ioflag;
64 kauth_cred_t a_cred;
65 } */ *ap = v;
66 struct vnode *vp;
67 struct inode *ip;
68 struct uio *uio;
69 FS *fs;
70 vsize_t bytelen;
71 int error, ioflag, advice;
72
73 vp = ap->a_vp;
74 ip = VTOI(vp);
75 fs = ip->I_FS;
76 uio = ap->a_uio;
77 ioflag = ap->a_ioflag;
78 error = 0;
79
80 KASSERT(uio->uio_rw == UIO_READ);
81 KASSERT(vp->v_type == VREG || vp->v_type == VDIR);
82
83 /* XXX Eliminate me by refusing directory reads from userland. */
84 if (vp->v_type == VDIR)
85 return BUFRD(vp, uio, ioflag, ap->a_cred);
86 /* XXX Eliminate me by using ufs_bufio in lfs. */
87 if (vp->v_type == VREG && ip->i_number == LFS_IFILE_INUM)
88 return BUFRD(vp, uio, ioflag, ap->a_cred);
89 if ((u_int64_t)uio->uio_offset > fs->um_maxfilesize)
90 return (EFBIG);
91 if (uio->uio_resid == 0)
92 return (0);
93
94
95 if (uio->uio_offset >= ip->i_size)
96 goto out;
97
98 KASSERT(vp->v_type == VREG);
99 advice = IO_ADV_DECODE(ap->a_ioflag);
100 while (uio->uio_resid > 0) {
101 if (ioflag & IO_DIRECT) {
102 genfs_directio(vp, uio, ioflag);
103 }
104 bytelen = MIN(ip->i_size - uio->uio_offset, uio->uio_resid);
105 if (bytelen == 0)
106 break;
107 error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice,
108 UBC_READ | UBC_PARTIALOK | UBC_VNODE_FLAGS(vp));
109 if (error)
110 break;
111 }
112
113 out:
114 error = ulfs_post_read_update(vp, ap->a_ioflag, error);
115 return (error);
116 }
117
118 /*
119 * UFS op for reading via the buffer cache
120 */
121 int
BUFRD(struct vnode * vp,struct uio * uio,int ioflag,kauth_cred_t cred)122 BUFRD(struct vnode *vp, struct uio *uio, int ioflag, kauth_cred_t cred)
123 {
124 struct inode *ip;
125 FS *fs;
126 struct buf *bp;
127 daddr_t lbn, nextlbn;
128 off_t bytesinfile;
129 long size, xfersize, blkoffset;
130 int error;
131
132 KASSERT(VOP_ISLOCKED(vp));
133 KASSERT(vp->v_type == VDIR || vp->v_type == VLNK ||
134 vp->v_type == VREG);
135 KASSERT(uio->uio_rw == UIO_READ);
136
137 ip = VTOI(vp);
138 fs = ip->I_FS;
139 error = 0;
140
141 KASSERT(vp->v_type != VLNK || ip->i_size >= fs->um_maxsymlinklen);
142 KASSERT(vp->v_type != VLNK || fs->um_maxsymlinklen != 0 ||
143 DIP(ip, blocks) == 0);
144 KASSERT(vp->v_type != VREG || vp == fs->lfs_ivnode);
145 KASSERT(vp->v_type != VREG || ip->i_number == LFS_IFILE_INUM);
146
147 if (uio->uio_offset > fs->um_maxfilesize)
148 return EFBIG;
149 if (uio->uio_resid == 0)
150 return 0;
151
152
153 if (uio->uio_offset >= ip->i_size)
154 goto out;
155
156 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
157 bytesinfile = ip->i_size - uio->uio_offset;
158 if (bytesinfile <= 0)
159 break;
160 lbn = lfs_lblkno(fs, uio->uio_offset);
161 nextlbn = lbn + 1;
162 size = lfs_blksize(fs, ip, lbn);
163 blkoffset = lfs_blkoff(fs, uio->uio_offset);
164 xfersize = MIN(MIN(fs_sb_getbsize(fs) - blkoffset, uio->uio_resid),
165 bytesinfile);
166
167 if (lfs_lblktosize(fs, nextlbn) >= ip->i_size)
168 error = bread(vp, lbn, size, 0, &bp);
169 else {
170 int nextsize = lfs_blksize(fs, ip, nextlbn);
171 error = breadn(vp, lbn,
172 size, &nextlbn, &nextsize, 1, 0, &bp);
173 }
174 if (error)
175 break;
176
177 /*
178 * We should only get non-zero b_resid when an I/O error
179 * has occurred, which should cause us to break above.
180 * However, if the short read did not cause an error,
181 * then we want to ensure that we do not uiomove bad
182 * or uninitialized data.
183 */
184 size -= bp->b_resid;
185 if (size < xfersize) {
186 if (size == 0)
187 break;
188 xfersize = size;
189 }
190 error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
191 if (error)
192 break;
193 brelse(bp, 0);
194 }
195 if (bp != NULL)
196 brelse(bp, 0);
197
198 out:
199 error = ulfs_post_read_update(vp, ioflag, error);
200 return (error);
201 }
202
203 static int
ulfs_post_read_update(struct vnode * vp,int ioflag,int oerror)204 ulfs_post_read_update(struct vnode *vp, int ioflag, int oerror)
205 {
206 struct inode *ip = VTOI(vp);
207 int error = oerror;
208
209 if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) {
210 ip->i_state |= IN_ACCESS;
211 if ((ioflag & IO_SYNC) == IO_SYNC) {
212 error = lfs_update(vp, NULL, NULL, UPDATE_WAIT);
213 }
214 }
215
216 /* Read error overrides any inode update error. */
217 if (oerror)
218 error = oerror;
219 return error;
220 }
221
222 /*
223 * Vnode op for writing.
224 */
225 int
WRITE(void * v)226 WRITE(void *v)
227 {
228 struct vop_write_args /* {
229 struct vnode *a_vp;
230 struct uio *a_uio;
231 int a_ioflag;
232 kauth_cred_t a_cred;
233 } */ *ap = v;
234 struct vnode *vp;
235 struct uio *uio;
236 struct inode *ip;
237 FS *fs;
238 kauth_cred_t cred;
239 off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize;
240 int blkoffset, error, flags, ioflag, resid;
241 int aflag;
242 vsize_t bytelen;
243 bool async;
244
245 cred = ap->a_cred;
246 ioflag = ap->a_ioflag;
247 uio = ap->a_uio;
248 vp = ap->a_vp;
249 ip = VTOI(vp);
250
251 KASSERT(vp->v_size == ip->i_size);
252 KASSERT(uio->uio_rw == UIO_WRITE);
253 KASSERT(vp->v_type == VREG);
254
255 if (ioflag & IO_APPEND)
256 uio->uio_offset = ip->i_size;
257 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
258 return (EPERM);
259
260 fs = ip->I_FS;
261 if (uio->uio_offset < 0 ||
262 (u_int64_t)uio->uio_offset + uio->uio_resid > fs->um_maxfilesize)
263 return (EFBIG);
264 /* Disallow writes to the Ifile, even if noschg flag is removed */
265 /* XXX can this go away when the Ifile is no longer in the namespace? */
266 if (vp == fs->lfs_ivnode)
267 return (EPERM);
268 if (uio->uio_resid == 0)
269 return (0);
270
271 flags = ioflag & IO_SYNC ? B_SYNC : 0;
272 async = vp->v_mount->mnt_flag & MNT_ASYNC;
273 origoff = uio->uio_offset;
274 resid = uio->uio_resid;
275 osize = ip->i_size;
276 error = 0;
277
278 KASSERT(vp->v_type == VREG);
279
280 async = true;
281 lfs_availwait(fs, lfs_btofsb(fs, uio->uio_resid));
282 lfs_check(vp, LFS_UNUSED_LBN, 0);
283
284 preallocoff = round_page(lfs_blkroundup(fs, MAX(osize, uio->uio_offset)));
285 aflag = ioflag & IO_SYNC ? B_SYNC : 0;
286 nsize = MAX(osize, uio->uio_offset + uio->uio_resid);
287 endallocoff = nsize - lfs_blkoff(fs, nsize);
288
289 /*
290 * if we're increasing the file size, deal with expanding
291 * the fragment if there is one.
292 */
293
294 if (nsize > osize && lfs_lblkno(fs, osize) < ULFS_NDADDR &&
295 lfs_lblkno(fs, osize) != lfs_lblkno(fs, nsize) &&
296 lfs_blkroundup(fs, osize) != osize) {
297 off_t eob;
298
299 eob = lfs_blkroundup(fs, osize);
300 uvm_vnp_setwritesize(vp, eob);
301 error = ulfs_balloc_range(vp, osize, eob - osize, cred, aflag);
302 if (error)
303 goto out;
304 if (flags & B_SYNC) {
305 rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
306 VOP_PUTPAGES(vp, trunc_page(osize & lfs_sb_getbmask(fs)),
307 round_page(eob),
308 PGO_CLEANIT | PGO_SYNCIO);
309 }
310 }
311
312 while (uio->uio_resid > 0) {
313 int ubc_flags = UBC_WRITE;
314 bool overwrite; /* if we're overwrite a whole block */
315 off_t newoff;
316
317 if (ioflag & IO_DIRECT) {
318 genfs_directio(vp, uio, ioflag);
319 }
320
321 oldoff = uio->uio_offset;
322 blkoffset = lfs_blkoff(fs, uio->uio_offset);
323 bytelen = MIN(fs_sb_getbsize(fs) - blkoffset, uio->uio_resid);
324 if (bytelen == 0) {
325 break;
326 }
327
328 /*
329 * if we're filling in a hole, allocate the blocks now and
330 * initialize the pages first. if we're extending the file,
331 * we can safely allocate blocks without initializing pages
332 * since the new blocks will be inaccessible until the write
333 * is complete.
334 */
335 overwrite = uio->uio_offset >= preallocoff &&
336 uio->uio_offset < endallocoff;
337 if (!overwrite && (vp->v_vflag & VV_MAPPED) == 0 &&
338 lfs_blkoff(fs, uio->uio_offset) == 0 &&
339 (uio->uio_offset & PAGE_MASK) == 0) {
340 vsize_t len;
341
342 len = trunc_page(bytelen);
343 len -= lfs_blkoff(fs, len);
344 if (len > 0) {
345 overwrite = true;
346 bytelen = len;
347 }
348 }
349
350 newoff = oldoff + bytelen;
351 if (vp->v_size < newoff) {
352 uvm_vnp_setwritesize(vp, newoff);
353 }
354
355 if (!overwrite) {
356 error = ulfs_balloc_range(vp, uio->uio_offset, bytelen,
357 cred, aflag);
358 if (error)
359 break;
360 } else {
361 genfs_node_wrlock(vp);
362 error = GOP_ALLOC(vp, uio->uio_offset, bytelen,
363 aflag, cred);
364 genfs_node_unlock(vp);
365 if (error)
366 break;
367 ubc_flags |= UBC_FAULTBUSY;
368 }
369
370 /*
371 * copy the data.
372 */
373
374 error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
375 IO_ADV_DECODE(ioflag), ubc_flags | UBC_VNODE_FLAGS(vp));
376
377 /*
378 * update UVM's notion of the size now that we've
379 * copied the data into the vnode's pages.
380 *
381 * we should update the size even when uiomove failed.
382 */
383
384 if (vp->v_size < newoff) {
385 uvm_vnp_setsize(vp, newoff);
386 }
387
388 if (error)
389 break;
390
391 /*
392 * flush what we just wrote if necessary.
393 * XXXUBC simplistic async flushing.
394 */
395
396 __USE(async);
397 }
398 if (error == 0 && ioflag & IO_SYNC) {
399 rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
400 error = VOP_PUTPAGES(vp, trunc_page(origoff & lfs_sb_getbmask(fs)),
401 round_page(lfs_blkroundup(fs, uio->uio_offset)),
402 PGO_CLEANIT | PGO_SYNCIO);
403 }
404
405 out:
406 error = ulfs_post_write_update(vp, uio, ioflag, cred, osize, resid,
407 error);
408
409 return (error);
410 }
411
412 /*
413 * UFS op for writing via the buffer cache
414 */
415 int
BUFWR(struct vnode * vp,struct uio * uio,int ioflag,kauth_cred_t cred)416 BUFWR(struct vnode *vp, struct uio *uio, int ioflag, kauth_cred_t cred)
417 {
418 struct inode *ip;
419 FS *fs;
420 int flags;
421 struct buf *bp;
422 off_t osize;
423 int resid, xfersize, size, blkoffset;
424 daddr_t lbn;
425 int error;
426 bool need_unreserve = false;
427
428 KASSERT(ISSET(ioflag, IO_NODELOCKED));
429 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
430 KASSERT(vp->v_type == VDIR || vp->v_type == VLNK);
431 KASSERT(vp->v_type != VDIR || ISSET(ioflag, IO_SYNC));
432 KASSERT(uio->uio_rw == UIO_WRITE);
433
434 ip = VTOI(vp);
435 fs = ip->I_FS;
436
437 KASSERT(vp->v_size == ip->i_size);
438
439 if (uio->uio_offset < 0 ||
440 uio->uio_resid > fs->um_maxfilesize ||
441 uio->uio_offset > (fs->um_maxfilesize - uio->uio_resid))
442 return EFBIG;
443 KASSERT(vp != fs->lfs_ivnode);
444 if (uio->uio_resid == 0)
445 return 0;
446
447 flags = ioflag & IO_SYNC ? B_SYNC : 0;
448 resid = uio->uio_resid;
449 osize = ip->i_size;
450 error = 0;
451
452 KASSERT(vp->v_type != VREG);
453
454 lfs_availwait(fs, lfs_btofsb(fs, uio->uio_resid));
455 lfs_check(vp, LFS_UNUSED_LBN, 0);
456
457 /* XXX Should never have pages cached here. */
458 KASSERT(vp->v_uobj.uo_npages == 0);
459 while (uio->uio_resid > 0) {
460 lbn = lfs_lblkno(fs, uio->uio_offset);
461 blkoffset = lfs_blkoff(fs, uio->uio_offset);
462 xfersize = MIN(fs_sb_getbsize(fs) - blkoffset, uio->uio_resid);
463 if (fs_sb_getbsize(fs) > xfersize)
464 flags |= B_CLRBUF;
465 else
466 flags &= ~B_CLRBUF;
467
468 error = lfs_reserve(fs, vp, NULL,
469 lfs_btofsb(fs, (ULFS_NIADDR + 1) << lfs_sb_getbshift(fs)));
470 if (error)
471 break;
472 need_unreserve = true;
473 error = lfs_balloc(vp, uio->uio_offset, xfersize, cred, flags,
474 &bp);
475
476 if (error)
477 break;
478 if (uio->uio_offset + xfersize > ip->i_size) {
479 ip->i_size = uio->uio_offset + xfersize;
480 DIP_ASSIGN(ip, size, ip->i_size);
481 uvm_vnp_setsize(vp, ip->i_size);
482 }
483 size = lfs_blksize(fs, ip, lbn) - bp->b_resid;
484 if (xfersize > size)
485 xfersize = size;
486
487 error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
488
489 /*
490 * if we didn't clear the block and the uiomove failed,
491 * the buf will now contain part of some other file,
492 * so we need to invalidate it.
493 */
494 if (error && (flags & B_CLRBUF) == 0) {
495 brelse(bp, BC_INVAL);
496 break;
497 }
498 (void)VOP_BWRITE(bp->b_vp, bp);
499 lfs_reserve(fs, vp, NULL,
500 -lfs_btofsb(fs, (ULFS_NIADDR + 1) << lfs_sb_getbshift(fs)));
501 need_unreserve = false;
502 if (error || xfersize == 0)
503 break;
504 }
505 if (need_unreserve) {
506 lfs_reserve(fs, vp, NULL,
507 -lfs_btofsb(fs, (ULFS_NIADDR + 1) << lfs_sb_getbshift(fs)));
508 }
509
510 error = ulfs_post_write_update(vp, uio, ioflag, cred, osize, resid,
511 error);
512
513 return (error);
514 }
515
516 static int
ulfs_post_write_update(struct vnode * vp,struct uio * uio,int ioflag,kauth_cred_t cred,off_t osize,int resid,int oerror)517 ulfs_post_write_update(struct vnode *vp, struct uio *uio, int ioflag,
518 kauth_cred_t cred, off_t osize, int resid, int oerror)
519 {
520 struct inode *ip = VTOI(vp);
521 int error = oerror;
522
523 /* Trigger ctime and mtime updates, and atime if MNT_RELATIME. */
524 ip->i_state |= IN_CHANGE | IN_UPDATE;
525 if (vp->v_mount->mnt_flag & MNT_RELATIME)
526 ip->i_state |= IN_ACCESS;
527
528 /*
529 * If we successfully wrote any data and we are not the superuser,
530 * we clear the setuid and setgid bits as a precaution against
531 * tampering.
532 */
533 if (resid > uio->uio_resid && cred) {
534 if (ip->i_mode & ISUID) {
535 if (kauth_authorize_vnode(cred,
536 KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0) {
537 ip->i_mode &= ~ISUID;
538 DIP_ASSIGN(ip, mode, ip->i_mode);
539 }
540 }
541
542 if (ip->i_mode & ISGID) {
543 if (kauth_authorize_vnode(cred,
544 KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0) {
545 ip->i_mode &= ~ISGID;
546 DIP_ASSIGN(ip, mode, ip->i_mode);
547 }
548 }
549 }
550
551 /*
552 * Update the size on disk: truncate back to original size on
553 * error, or reflect the new size on success.
554 */
555 if (error) {
556 (void) lfs_truncate(vp, osize, ioflag & IO_SYNC, cred);
557 uio->uio_offset -= resid - uio->uio_resid;
558 uio->uio_resid = resid;
559 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC) {
560 error = lfs_update(vp, NULL, NULL, UPDATE_WAIT);
561 } else {
562 /* nothing */
563 }
564
565 /* Make sure the vnode uvm size matches the inode file size. */
566 KASSERT(vp->v_size == ip->i_size);
567
568 /* Write error overrides any inode update error. */
569 if (oerror)
570 error = oerror;
571 return error;
572 }
573