1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 /*
31 * Portions of this source code were derived from Berkeley 4.3 BSD
32 * under license from the Regents of the University of California.
33 */
34
35 #pragma ident "%Z%%M% %I% %E% SMI"
36
37 #include <sys/param.h>
38 #include <sys/isa_defs.h>
39 #include <sys/types.h>
40 #include <sys/inttypes.h>
41 #include <sys/sysmacros.h>
42 #include <sys/cred.h>
43 #include <sys/user.h>
44 #include <sys/systm.h>
45 #include <sys/errno.h>
46 #include <sys/vnode.h>
47 #include <sys/file.h>
48 #include <sys/proc.h>
49 #include <sys/cpuvar.h>
50 #include <sys/uio.h>
51 #include <sys/debug.h>
52 #include <sys/rctl.h>
53 #include <sys/nbmlock.h>
54
/*
 * Default value, in bytes, of the size threshold consulted by read() and
 * readv(): a request no larger than this is issued with UIO_COPY_CACHED,
 * a larger one with UIO_COPY_DEFAULT.
 */
#define	COPYOUT_MAX_CACHE	(128 * 1024)	/* 128K */

/* Held in a writable global, rather than a constant, so it's patchable. */
size_t copyout_max_cached = COPYOUT_MAX_CACHE;
58
59 /*
60 * read, write, pread, pwrite, readv, and writev syscalls.
61 *
62 * 64-bit open: all open's are large file opens.
63 * Large Files: the behaviour of read depends on whether the fd
64 * corresponds to large open or not.
65 * 32-bit open: FOFFMAX flag not set.
66 * read until MAXOFF32_T - 1 and read at MAXOFF32_T returns
67 * EOVERFLOW if count is non-zero and if size of file
68 * is > MAXOFF32_T. If size of file is <= MAXOFF32_T read
69 * at >= MAXOFF32_T returns EOF.
70 */
71
72 /*
73 * Native system call
74 */
75 ssize_t
read(int fdes,void * cbuf,size_t count)76 read(int fdes, void *cbuf, size_t count)
77 {
78 struct uio auio;
79 struct iovec aiov;
80 file_t *fp;
81 register vnode_t *vp;
82 struct cpu *cp;
83 int fflag, ioflag, rwflag;
84 ssize_t cnt, bcount;
85 int error = 0;
86 u_offset_t fileoff;
87 int in_crit = 0;
88
89 if ((cnt = (ssize_t)count) < 0)
90 return (set_errno(EINVAL));
91 if ((fp = getf(fdes)) == NULL)
92 return (set_errno(EBADF));
93 if (((fflag = fp->f_flag) & FREAD) == 0) {
94 error = EBADF;
95 goto out;
96 }
97 vp = fp->f_vnode;
98
99 if (vp->v_type == VREG && cnt == 0) {
100 goto out;
101 }
102
103 rwflag = 0;
104 aiov.iov_base = cbuf;
105 aiov.iov_len = cnt;
106
107 /*
108 * We have to enter the critical region before calling VOP_RWLOCK
109 * to avoid a deadlock with write() calls.
110 */
111 if (nbl_need_check(vp)) {
112 int svmand;
113
114 nbl_start_crit(vp, RW_READER);
115 in_crit = 1;
116 error = nbl_svmand(vp, fp->f_cred, &svmand);
117 if (error != 0)
118 goto out;
119 if (nbl_conflict(vp, NBL_READ, fp->f_offset, cnt, svmand,
120 NULL)) {
121 error = EACCES;
122 goto out;
123 }
124 }
125
126 (void) VOP_RWLOCK(vp, rwflag, NULL);
127
128 /*
129 * We do the following checks inside VOP_RWLOCK so as to
130 * prevent file size from changing while these checks are
131 * being done. Also, we load fp's offset to the local
132 * variable fileoff because we can have a parallel lseek
133 * going on (f_offset is not protected by any lock) which
134 * could change f_offset. We need to see the value only
135 * once here and take a decision. Seeing it more than once
136 * can lead to incorrect functionality.
137 */
138
139 fileoff = (u_offset_t)fp->f_offset;
140 if (fileoff >= OFFSET_MAX(fp) && (vp->v_type == VREG)) {
141 struct vattr va;
142 va.va_mask = AT_SIZE;
143 if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL))) {
144 VOP_RWUNLOCK(vp, rwflag, NULL);
145 goto out;
146 }
147 if (fileoff >= va.va_size) {
148 cnt = 0;
149 VOP_RWUNLOCK(vp, rwflag, NULL);
150 goto out;
151 } else {
152 error = EOVERFLOW;
153 VOP_RWUNLOCK(vp, rwflag, NULL);
154 goto out;
155 }
156 }
157 if ((vp->v_type == VREG) &&
158 (fileoff + cnt > OFFSET_MAX(fp))) {
159 cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
160 }
161 auio.uio_loffset = fileoff;
162 auio.uio_iov = &aiov;
163 auio.uio_iovcnt = 1;
164 auio.uio_resid = bcount = cnt;
165 auio.uio_segflg = UIO_USERSPACE;
166 auio.uio_llimit = MAXOFFSET_T;
167 auio.uio_fmode = fflag;
168 /*
169 * Only use bypass caches when the count is large enough
170 */
171 if (bcount <= copyout_max_cached)
172 auio.uio_extflg = UIO_COPY_CACHED;
173 else
174 auio.uio_extflg = UIO_COPY_DEFAULT;
175
176 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
177
178 /* If read sync is not asked for, filter sync flags */
179 if ((ioflag & FRSYNC) == 0)
180 ioflag &= ~(FSYNC|FDSYNC);
181 error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
182 cnt -= auio.uio_resid;
183 CPU_STATS_ENTER_K();
184 cp = CPU;
185 CPU_STATS_ADDQ(cp, sys, sysread, 1);
186 CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)cnt);
187 CPU_STATS_EXIT_K();
188 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
189
190 if (vp->v_type == VFIFO) /* Backward compatibility */
191 fp->f_offset = cnt;
192 else if (((fp->f_flag & FAPPEND) == 0) ||
193 (vp->v_type != VREG) || (bcount != 0)) /* POSIX */
194 fp->f_offset = auio.uio_loffset;
195 VOP_RWUNLOCK(vp, rwflag, NULL);
196
197 if (error == EINTR && cnt != 0)
198 error = 0;
199 out:
200 if (in_crit)
201 nbl_end_crit(vp);
202 releasef(fdes);
203 if (error)
204 return (set_errno(error));
205 return (cnt);
206 }
207
208 /*
209 * Native system call
210 */
211 ssize_t
write(int fdes,void * cbuf,size_t count)212 write(int fdes, void *cbuf, size_t count)
213 {
214 struct uio auio;
215 struct iovec aiov;
216 file_t *fp;
217 register vnode_t *vp;
218 struct cpu *cp;
219 int fflag, ioflag, rwflag;
220 ssize_t cnt, bcount;
221 int error = 0;
222 u_offset_t fileoff;
223 int in_crit = 0;
224
225 if ((cnt = (ssize_t)count) < 0)
226 return (set_errno(EINVAL));
227 if ((fp = getf(fdes)) == NULL)
228 return (set_errno(EBADF));
229 if (((fflag = fp->f_flag) & FWRITE) == 0) {
230 error = EBADF;
231 goto out;
232 }
233 vp = fp->f_vnode;
234
235 if (vp->v_type == VREG && cnt == 0) {
236 goto out;
237 }
238
239 rwflag = 1;
240 aiov.iov_base = cbuf;
241 aiov.iov_len = cnt;
242
243 /*
244 * We have to enter the critical region before calling VOP_RWLOCK
245 * to avoid a deadlock with ufs.
246 */
247 if (nbl_need_check(vp)) {
248 int svmand;
249
250 nbl_start_crit(vp, RW_READER);
251 in_crit = 1;
252 error = nbl_svmand(vp, fp->f_cred, &svmand);
253 if (error != 0)
254 goto out;
255 if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, cnt, svmand,
256 NULL)) {
257 error = EACCES;
258 goto out;
259 }
260 }
261
262 (void) VOP_RWLOCK(vp, rwflag, NULL);
263
264 fileoff = fp->f_offset;
265 if (vp->v_type == VREG) {
266
267 /*
268 * We raise psignal if write for >0 bytes causes
269 * it to exceed the ulimit.
270 */
271 if (fileoff >= curproc->p_fsz_ctl) {
272 VOP_RWUNLOCK(vp, rwflag, NULL);
273
274 mutex_enter(&curproc->p_lock);
275 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
276 curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
277 mutex_exit(&curproc->p_lock);
278
279 error = EFBIG;
280 goto out;
281 }
282 /*
283 * We return EFBIG if write is done at an offset
284 * greater than the offset maximum for this file structure.
285 */
286
287 if (fileoff >= OFFSET_MAX(fp)) {
288 VOP_RWUNLOCK(vp, rwflag, NULL);
289 error = EFBIG;
290 goto out;
291 }
292 /*
293 * Limit the bytes to be written upto offset maximum for
294 * this open file structure.
295 */
296 if (fileoff + cnt > OFFSET_MAX(fp))
297 cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
298 }
299 auio.uio_loffset = fileoff;
300 auio.uio_iov = &aiov;
301 auio.uio_iovcnt = 1;
302 auio.uio_resid = bcount = cnt;
303 auio.uio_segflg = UIO_USERSPACE;
304 auio.uio_llimit = curproc->p_fsz_ctl;
305 auio.uio_fmode = fflag;
306 auio.uio_extflg = UIO_COPY_DEFAULT;
307
308 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
309
310 error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
311 cnt -= auio.uio_resid;
312 CPU_STATS_ENTER_K();
313 cp = CPU;
314 CPU_STATS_ADDQ(cp, sys, syswrite, 1);
315 CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)cnt);
316 CPU_STATS_EXIT_K();
317 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
318
319 if (vp->v_type == VFIFO) /* Backward compatibility */
320 fp->f_offset = cnt;
321 else if (((fp->f_flag & FAPPEND) == 0) ||
322 (vp->v_type != VREG) || (bcount != 0)) /* POSIX */
323 fp->f_offset = auio.uio_loffset;
324 VOP_RWUNLOCK(vp, rwflag, NULL);
325
326 if (error == EINTR && cnt != 0)
327 error = 0;
328 out:
329 if (in_crit)
330 nbl_end_crit(vp);
331 releasef(fdes);
332 if (error)
333 return (set_errno(error));
334 return (cnt);
335 }
336
337 ssize_t
pread(int fdes,void * cbuf,size_t count,off_t offset)338 pread(int fdes, void *cbuf, size_t count, off_t offset)
339 {
340 struct uio auio;
341 struct iovec aiov;
342 file_t *fp;
343 register vnode_t *vp;
344 struct cpu *cp;
345 int fflag, ioflag, rwflag;
346 ssize_t bcount;
347 int error = 0;
348 u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
349 #ifdef _SYSCALL32_IMPL
350 u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
351 MAXOFF32_T : MAXOFFSET_T;
352 #else
353 const u_offset_t maxoff = MAXOFF32_T;
354 #endif
355 int in_crit = 0;
356
357 if ((bcount = (ssize_t)count) < 0)
358 return (set_errno(EINVAL));
359
360 if ((fp = getf(fdes)) == NULL)
361 return (set_errno(EBADF));
362 if (((fflag = fp->f_flag) & (FREAD)) == 0) {
363 error = EBADF;
364 goto out;
365 }
366
367 rwflag = 0;
368 vp = fp->f_vnode;
369
370 if (vp->v_type == VREG) {
371
372 if (bcount == 0)
373 goto out;
374
375 /*
376 * Return EINVAL if an invalid offset comes to pread.
377 * Negative offset from user will cause this error.
378 */
379
380 if (fileoff > maxoff) {
381 error = EINVAL;
382 goto out;
383 }
384 /*
385 * Limit offset such that we don't read or write
386 * a file beyond the maximum offset representable in
387 * an off_t structure.
388 */
389 if (fileoff + bcount > maxoff)
390 bcount = (ssize_t)((offset_t)maxoff - fileoff);
391 } else if (vp->v_type == VFIFO) {
392 error = ESPIPE;
393 goto out;
394 }
395
396 /*
397 * We have to enter the critical region before calling VOP_RWLOCK
398 * to avoid a deadlock with ufs.
399 */
400 if (nbl_need_check(vp)) {
401 int svmand;
402
403 nbl_start_crit(vp, RW_READER);
404 in_crit = 1;
405 error = nbl_svmand(vp, fp->f_cred, &svmand);
406 if (error != 0)
407 goto out;
408 if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
409 NULL)) {
410 error = EACCES;
411 goto out;
412 }
413 }
414
415 aiov.iov_base = cbuf;
416 aiov.iov_len = bcount;
417 (void) VOP_RWLOCK(vp, rwflag, NULL);
418 if (vp->v_type == VREG && fileoff == (u_offset_t)maxoff) {
419 struct vattr va;
420 va.va_mask = AT_SIZE;
421 if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL))) {
422 VOP_RWUNLOCK(vp, rwflag, NULL);
423 goto out;
424 }
425 VOP_RWUNLOCK(vp, rwflag, NULL);
426
427 /*
428 * We have to return EOF if fileoff is >= file size.
429 */
430 if (fileoff >= va.va_size) {
431 bcount = 0;
432 goto out;
433 }
434
435 /*
436 * File is greater than or equal to maxoff and therefore
437 * we return EOVERFLOW.
438 */
439 error = EOVERFLOW;
440 goto out;
441 }
442 auio.uio_loffset = fileoff;
443 auio.uio_iov = &aiov;
444 auio.uio_iovcnt = 1;
445 auio.uio_resid = bcount;
446 auio.uio_segflg = UIO_USERSPACE;
447 auio.uio_llimit = MAXOFFSET_T;
448 auio.uio_fmode = fflag;
449 auio.uio_extflg = UIO_COPY_CACHED;
450
451 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
452
453 /* If read sync is not asked for, filter sync flags */
454 if ((ioflag & FRSYNC) == 0)
455 ioflag &= ~(FSYNC|FDSYNC);
456 error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
457 bcount -= auio.uio_resid;
458 CPU_STATS_ENTER_K();
459 cp = CPU;
460 CPU_STATS_ADDQ(cp, sys, sysread, 1);
461 CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
462 CPU_STATS_EXIT_K();
463 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
464 VOP_RWUNLOCK(vp, rwflag, NULL);
465
466 if (error == EINTR && bcount != 0)
467 error = 0;
468 out:
469 if (in_crit)
470 nbl_end_crit(vp);
471 releasef(fdes);
472 if (error)
473 return (set_errno(error));
474 return (bcount);
475 }
476
477 ssize_t
pwrite(int fdes,void * cbuf,size_t count,off_t offset)478 pwrite(int fdes, void *cbuf, size_t count, off_t offset)
479 {
480 struct uio auio;
481 struct iovec aiov;
482 file_t *fp;
483 register vnode_t *vp;
484 struct cpu *cp;
485 int fflag, ioflag, rwflag;
486 ssize_t bcount;
487 int error = 0;
488 u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
489 #ifdef _SYSCALL32_IMPL
490 u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
491 MAXOFF32_T : MAXOFFSET_T;
492 #else
493 const u_offset_t maxoff = MAXOFF32_T;
494 #endif
495 int in_crit = 0;
496
497 if ((bcount = (ssize_t)count) < 0)
498 return (set_errno(EINVAL));
499 if ((fp = getf(fdes)) == NULL)
500 return (set_errno(EBADF));
501 if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
502 error = EBADF;
503 goto out;
504 }
505
506 rwflag = 1;
507 vp = fp->f_vnode;
508
509 if (vp->v_type == VREG) {
510
511 if (bcount == 0)
512 goto out;
513
514 /*
515 * return EINVAL for offsets that cannot be
516 * represented in an off_t.
517 */
518 if (fileoff > maxoff) {
519 error = EINVAL;
520 goto out;
521 }
522 /*
523 * Take appropriate action if we are trying to write above the
524 * resource limit.
525 */
526 if (fileoff >= curproc->p_fsz_ctl) {
527 mutex_enter(&curproc->p_lock);
528 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
529 curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
530 mutex_exit(&curproc->p_lock);
531
532 error = EFBIG;
533 goto out;
534 }
535 /*
536 * Don't allow pwrite to cause file sizes to exceed
537 * maxoff.
538 */
539 if (fileoff == maxoff) {
540 error = EFBIG;
541 goto out;
542 }
543 if (fileoff + count > maxoff)
544 bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
545 } else if (vp->v_type == VFIFO) {
546 error = ESPIPE;
547 goto out;
548 }
549
550 /*
551 * We have to enter the critical region before calling VOP_RWLOCK
552 * to avoid a deadlock with ufs.
553 */
554 if (nbl_need_check(vp)) {
555 int svmand;
556
557 nbl_start_crit(vp, RW_READER);
558 in_crit = 1;
559 error = nbl_svmand(vp, fp->f_cred, &svmand);
560 if (error != 0)
561 goto out;
562 if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
563 NULL)) {
564 error = EACCES;
565 goto out;
566 }
567 }
568
569 aiov.iov_base = cbuf;
570 aiov.iov_len = bcount;
571 (void) VOP_RWLOCK(vp, rwflag, NULL);
572 auio.uio_loffset = fileoff;
573 auio.uio_iov = &aiov;
574 auio.uio_iovcnt = 1;
575 auio.uio_resid = bcount;
576 auio.uio_segflg = UIO_USERSPACE;
577 auio.uio_llimit = curproc->p_fsz_ctl;
578 auio.uio_fmode = fflag;
579 auio.uio_extflg = UIO_COPY_CACHED;
580
581 /*
582 * The SUSv4 POSIX specification states:
583 * The pwrite() function shall be equivalent to write(), except
584 * that it writes into a given position and does not change
585 * the file offset (regardless of whether O_APPEND is set).
586 * To make this be true, we omit the FAPPEND flag from ioflag.
587 */
588 ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
589
590 error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
591 bcount -= auio.uio_resid;
592 CPU_STATS_ENTER_K();
593 cp = CPU;
594 CPU_STATS_ADDQ(cp, sys, syswrite, 1);
595 CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
596 CPU_STATS_EXIT_K();
597 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
598 VOP_RWUNLOCK(vp, rwflag, NULL);
599
600 if (error == EINTR && bcount != 0)
601 error = 0;
602 out:
603 if (in_crit)
604 nbl_end_crit(vp);
605 releasef(fdes);
606 if (error)
607 return (set_errno(error));
608 return (bcount);
609 }
610
/*
 * Largest iovcnt accepted by readv() and writev(); it also sizes their
 * on-stack iovec arrays.
 *
 * XXX -- The SVID refers to IOV_MAX but never defines it.  SVVS, on the
 * XXX -- other hand, expects readv() and writev() to fail when
 * XXX -- iovcnt > 16 (the number is hard-coded in the SVVS source),
 * XXX -- so that value is taken to be the "interface".
 */
#define	DEF_IOV_MAX	16
618
619 ssize_t
readv(int fdes,struct iovec * iovp,int iovcnt)620 readv(int fdes, struct iovec *iovp, int iovcnt)
621 {
622 struct uio auio;
623 struct iovec aiov[DEF_IOV_MAX];
624 file_t *fp;
625 register vnode_t *vp;
626 struct cpu *cp;
627 int fflag, ioflag, rwflag;
628 ssize_t count, bcount;
629 int error = 0;
630 int i;
631 u_offset_t fileoff;
632 int in_crit = 0;
633
634 if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
635 return (set_errno(EINVAL));
636
637 #ifdef _SYSCALL32_IMPL
638 /*
639 * 32-bit callers need to have their iovec expanded,
640 * while ensuring that they can't move more than 2Gbytes
641 * of data in a single call.
642 */
643 if (get_udatamodel() == DATAMODEL_ILP32) {
644 struct iovec32 aiov32[DEF_IOV_MAX];
645 ssize32_t count32;
646
647 if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
648 return (set_errno(EFAULT));
649
650 count32 = 0;
651 for (i = 0; i < iovcnt; i++) {
652 ssize32_t iovlen32 = aiov32[i].iov_len;
653 count32 += iovlen32;
654 if (iovlen32 < 0 || count32 < 0)
655 return (set_errno(EINVAL));
656 aiov[i].iov_len = iovlen32;
657 aiov[i].iov_base =
658 (caddr_t)(uintptr_t)aiov32[i].iov_base;
659 }
660 } else
661 #endif
662 if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
663 return (set_errno(EFAULT));
664
665 count = 0;
666 for (i = 0; i < iovcnt; i++) {
667 ssize_t iovlen = aiov[i].iov_len;
668 count += iovlen;
669 if (iovlen < 0 || count < 0)
670 return (set_errno(EINVAL));
671 }
672 if ((fp = getf(fdes)) == NULL)
673 return (set_errno(EBADF));
674 if (((fflag = fp->f_flag) & FREAD) == 0) {
675 error = EBADF;
676 goto out;
677 }
678 vp = fp->f_vnode;
679 if (vp->v_type == VREG && count == 0) {
680 goto out;
681 }
682
683 rwflag = 0;
684
685 /*
686 * We have to enter the critical region before calling VOP_RWLOCK
687 * to avoid a deadlock with ufs.
688 */
689 if (nbl_need_check(vp)) {
690 int svmand;
691
692 nbl_start_crit(vp, RW_READER);
693 in_crit = 1;
694 error = nbl_svmand(vp, fp->f_cred, &svmand);
695 if (error != 0)
696 goto out;
697 if (nbl_conflict(vp, NBL_READ, fp->f_offset, count, svmand,
698 NULL)) {
699 error = EACCES;
700 goto out;
701 }
702 }
703
704 (void) VOP_RWLOCK(vp, rwflag, NULL);
705 fileoff = fp->f_offset;
706
707 /*
708 * Behaviour is same as read. Please see comments in read.
709 */
710
711 if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
712 struct vattr va;
713 va.va_mask = AT_SIZE;
714 if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL))) {
715 VOP_RWUNLOCK(vp, rwflag, NULL);
716 goto out;
717 }
718 if (fileoff >= va.va_size) {
719 VOP_RWUNLOCK(vp, rwflag, NULL);
720 count = 0;
721 goto out;
722 } else {
723 VOP_RWUNLOCK(vp, rwflag, NULL);
724 error = EOVERFLOW;
725 goto out;
726 }
727 }
728 if ((vp->v_type == VREG) && (fileoff + count > OFFSET_MAX(fp))) {
729 count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
730 }
731 auio.uio_loffset = fileoff;
732 auio.uio_iov = aiov;
733 auio.uio_iovcnt = iovcnt;
734 auio.uio_resid = bcount = count;
735 auio.uio_segflg = UIO_USERSPACE;
736 auio.uio_llimit = MAXOFFSET_T;
737 auio.uio_fmode = fflag;
738 if (bcount <= copyout_max_cached)
739 auio.uio_extflg = UIO_COPY_CACHED;
740 else
741 auio.uio_extflg = UIO_COPY_DEFAULT;
742
743
744 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
745
746 /* If read sync is not asked for, filter sync flags */
747 if ((ioflag & FRSYNC) == 0)
748 ioflag &= ~(FSYNC|FDSYNC);
749 error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
750 count -= auio.uio_resid;
751 CPU_STATS_ENTER_K();
752 cp = CPU;
753 CPU_STATS_ADDQ(cp, sys, sysread, 1);
754 CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
755 CPU_STATS_EXIT_K();
756 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
757
758 if (vp->v_type == VFIFO) /* Backward compatibility */
759 fp->f_offset = count;
760 else if (((fp->f_flag & FAPPEND) == 0) ||
761 (vp->v_type != VREG) || (bcount != 0)) /* POSIX */
762 fp->f_offset = auio.uio_loffset;
763
764 VOP_RWUNLOCK(vp, rwflag, NULL);
765
766 if (error == EINTR && count != 0)
767 error = 0;
768 out:
769 if (in_crit)
770 nbl_end_crit(vp);
771 releasef(fdes);
772 if (error)
773 return (set_errno(error));
774 return (count);
775 }
776
777 ssize_t
writev(int fdes,struct iovec * iovp,int iovcnt)778 writev(int fdes, struct iovec *iovp, int iovcnt)
779 {
780 struct uio auio;
781 struct iovec aiov[DEF_IOV_MAX];
782 file_t *fp;
783 register vnode_t *vp;
784 struct cpu *cp;
785 int fflag, ioflag, rwflag;
786 ssize_t count, bcount;
787 int error = 0;
788 int i;
789 u_offset_t fileoff;
790 int in_crit = 0;
791
792 if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
793 return (set_errno(EINVAL));
794
795 #ifdef _SYSCALL32_IMPL
796 /*
797 * 32-bit callers need to have their iovec expanded,
798 * while ensuring that they can't move more than 2Gbytes
799 * of data in a single call.
800 */
801 if (get_udatamodel() == DATAMODEL_ILP32) {
802 struct iovec32 aiov32[DEF_IOV_MAX];
803 ssize32_t count32;
804
805 if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
806 return (set_errno(EFAULT));
807
808 count32 = 0;
809 for (i = 0; i < iovcnt; i++) {
810 ssize32_t iovlen = aiov32[i].iov_len;
811 count32 += iovlen;
812 if (iovlen < 0 || count32 < 0)
813 return (set_errno(EINVAL));
814 aiov[i].iov_len = iovlen;
815 aiov[i].iov_base =
816 (caddr_t)(uintptr_t)aiov32[i].iov_base;
817 }
818 } else
819 #endif
820 if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
821 return (set_errno(EFAULT));
822
823 count = 0;
824 for (i = 0; i < iovcnt; i++) {
825 ssize_t iovlen = aiov[i].iov_len;
826 count += iovlen;
827 if (iovlen < 0 || count < 0)
828 return (set_errno(EINVAL));
829 }
830 if ((fp = getf(fdes)) == NULL)
831 return (set_errno(EBADF));
832 if (((fflag = fp->f_flag) & FWRITE) == 0) {
833 error = EBADF;
834 goto out;
835 }
836 vp = fp->f_vnode;
837 if (vp->v_type == VREG && count == 0) {
838 goto out;
839 }
840
841 rwflag = 1;
842
843 /*
844 * We have to enter the critical region before calling VOP_RWLOCK
845 * to avoid a deadlock with ufs.
846 */
847 if (nbl_need_check(vp)) {
848 int svmand;
849
850 nbl_start_crit(vp, RW_READER);
851 in_crit = 1;
852 error = nbl_svmand(vp, fp->f_cred, &svmand);
853 if (error != 0)
854 goto out;
855 if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, count, svmand,
856 NULL)) {
857 error = EACCES;
858 goto out;
859 }
860 }
861
862 (void) VOP_RWLOCK(vp, rwflag, NULL);
863
864 fileoff = fp->f_offset;
865
866 /*
867 * Behaviour is same as write. Please see comments for write.
868 */
869
870 if (vp->v_type == VREG) {
871 if (fileoff >= curproc->p_fsz_ctl) {
872 VOP_RWUNLOCK(vp, rwflag, NULL);
873 mutex_enter(&curproc->p_lock);
874 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
875 curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
876 mutex_exit(&curproc->p_lock);
877 error = EFBIG;
878 goto out;
879 }
880 if (fileoff >= OFFSET_MAX(fp)) {
881 VOP_RWUNLOCK(vp, rwflag, NULL);
882 error = EFBIG;
883 goto out;
884 }
885 if (fileoff + count > OFFSET_MAX(fp))
886 count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
887 }
888 auio.uio_loffset = fileoff;
889 auio.uio_iov = aiov;
890 auio.uio_iovcnt = iovcnt;
891 auio.uio_resid = bcount = count;
892 auio.uio_segflg = UIO_USERSPACE;
893 auio.uio_llimit = curproc->p_fsz_ctl;
894 auio.uio_fmode = fflag;
895 auio.uio_extflg = UIO_COPY_DEFAULT;
896
897 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
898
899 error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
900 count -= auio.uio_resid;
901 CPU_STATS_ENTER_K();
902 cp = CPU;
903 CPU_STATS_ADDQ(cp, sys, syswrite, 1);
904 CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
905 CPU_STATS_EXIT_K();
906 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
907
908 if (vp->v_type == VFIFO) /* Backward compatibility */
909 fp->f_offset = count;
910 else if (((fp->f_flag & FAPPEND) == 0) ||
911 (vp->v_type != VREG) || (bcount != 0)) /* POSIX */
912 fp->f_offset = auio.uio_loffset;
913 VOP_RWUNLOCK(vp, rwflag, NULL);
914
915 if (error == EINTR && count != 0)
916 error = 0;
917 out:
918 if (in_crit)
919 nbl_end_crit(vp);
920 releasef(fdes);
921 if (error)
922 return (set_errno(error));
923 return (count);
924 }
925
926 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
927
928 /*
929 * This syscall supplies 64-bit file offsets to 32-bit applications only.
930 */
931 ssize32_t
pread64(int fdes,void * cbuf,size32_t count,uint32_t offset_1,uint32_t offset_2)932 pread64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
933 uint32_t offset_2)
934 {
935 struct uio auio;
936 struct iovec aiov;
937 file_t *fp;
938 register vnode_t *vp;
939 struct cpu *cp;
940 int fflag, ioflag, rwflag;
941 ssize_t bcount;
942 int error = 0;
943 u_offset_t fileoff;
944 int in_crit = 0;
945
946 #if defined(_LITTLE_ENDIAN)
947 fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
948 #else
949 fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
950 #endif
951
952 if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
953 return (set_errno(EINVAL));
954
955 if ((fp = getf(fdes)) == NULL)
956 return (set_errno(EBADF));
957 if (((fflag = fp->f_flag) & (FREAD)) == 0) {
958 error = EBADF;
959 goto out;
960 }
961
962 rwflag = 0;
963 vp = fp->f_vnode;
964
965 if (vp->v_type == VREG) {
966
967 if (bcount == 0)
968 goto out;
969
970 /*
971 * Same as pread. See comments in pread.
972 */
973
974 if (fileoff > MAXOFFSET_T) {
975 error = EINVAL;
976 goto out;
977 }
978 if (fileoff + bcount > MAXOFFSET_T)
979 bcount = (ssize_t)(MAXOFFSET_T - fileoff);
980 } else if (vp->v_type == VFIFO) {
981 error = ESPIPE;
982 goto out;
983 }
984
985 /*
986 * We have to enter the critical region before calling VOP_RWLOCK
987 * to avoid a deadlock with ufs.
988 */
989 if (nbl_need_check(vp)) {
990 int svmand;
991
992 nbl_start_crit(vp, RW_READER);
993 in_crit = 1;
994 error = nbl_svmand(vp, fp->f_cred, &svmand);
995 if (error != 0)
996 goto out;
997 if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
998 NULL)) {
999 error = EACCES;
1000 goto out;
1001 }
1002 }
1003
1004 aiov.iov_base = cbuf;
1005 aiov.iov_len = bcount;
1006 (void) VOP_RWLOCK(vp, rwflag, NULL);
1007 auio.uio_loffset = fileoff;
1008
1009 /*
1010 * Note: File size can never be greater than MAXOFFSET_T.
1011 * If ever we start supporting 128 bit files the code
1012 * similar to the one in pread at this place should be here.
1013 * Here we avoid the unnecessary VOP_GETATTR() when we
1014 * know that fileoff == MAXOFFSET_T implies that it is always
1015 * greater than or equal to file size.
1016 */
1017 auio.uio_iov = &aiov;
1018 auio.uio_iovcnt = 1;
1019 auio.uio_resid = bcount;
1020 auio.uio_segflg = UIO_USERSPACE;
1021 auio.uio_llimit = MAXOFFSET_T;
1022 auio.uio_fmode = fflag;
1023 auio.uio_extflg = UIO_COPY_CACHED;
1024
1025 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1026
1027 /* If read sync is not asked for, filter sync flags */
1028 if ((ioflag & FRSYNC) == 0)
1029 ioflag &= ~(FSYNC|FDSYNC);
1030 error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1031 bcount -= auio.uio_resid;
1032 CPU_STATS_ENTER_K();
1033 cp = CPU;
1034 CPU_STATS_ADDQ(cp, sys, sysread, 1);
1035 CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
1036 CPU_STATS_EXIT_K();
1037 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1038 VOP_RWUNLOCK(vp, rwflag, NULL);
1039
1040 if (error == EINTR && bcount != 0)
1041 error = 0;
1042 out:
1043 if (in_crit)
1044 nbl_end_crit(vp);
1045 releasef(fdes);
1046 if (error)
1047 return (set_errno(error));
1048 return (bcount);
1049 }
1050
1051 /*
1052 * This syscall supplies 64-bit file offsets to 32-bit applications only.
1053 */
1054 ssize32_t
pwrite64(int fdes,void * cbuf,size32_t count,uint32_t offset_1,uint32_t offset_2)1055 pwrite64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1056 uint32_t offset_2)
1057 {
1058 struct uio auio;
1059 struct iovec aiov;
1060 file_t *fp;
1061 register vnode_t *vp;
1062 struct cpu *cp;
1063 int fflag, ioflag, rwflag;
1064 ssize_t bcount;
1065 int error = 0;
1066 u_offset_t fileoff;
1067 int in_crit = 0;
1068
1069 #if defined(_LITTLE_ENDIAN)
1070 fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1071 #else
1072 fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1073 #endif
1074
1075 if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1076 return (set_errno(EINVAL));
1077 if ((fp = getf(fdes)) == NULL)
1078 return (set_errno(EBADF));
1079 if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
1080 error = EBADF;
1081 goto out;
1082 }
1083
1084 rwflag = 1;
1085 vp = fp->f_vnode;
1086
1087 if (vp->v_type == VREG) {
1088
1089 if (bcount == 0)
1090 goto out;
1091
1092 /*
1093 * See comments in pwrite.
1094 */
1095 if (fileoff > MAXOFFSET_T) {
1096 error = EINVAL;
1097 goto out;
1098 }
1099 if (fileoff >= curproc->p_fsz_ctl) {
1100 mutex_enter(&curproc->p_lock);
1101 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
1102 curproc->p_rctls, curproc, RCA_SAFE);
1103 mutex_exit(&curproc->p_lock);
1104 error = EFBIG;
1105 goto out;
1106 }
1107 if (fileoff == MAXOFFSET_T) {
1108 error = EFBIG;
1109 goto out;
1110 }
1111 if (fileoff + bcount > MAXOFFSET_T)
1112 bcount = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff);
1113 } else if (vp->v_type == VFIFO) {
1114 error = ESPIPE;
1115 goto out;
1116 }
1117
1118 /*
1119 * We have to enter the critical region before calling VOP_RWLOCK
1120 * to avoid a deadlock with ufs.
1121 */
1122 if (nbl_need_check(vp)) {
1123 int svmand;
1124
1125 nbl_start_crit(vp, RW_READER);
1126 in_crit = 1;
1127 error = nbl_svmand(vp, fp->f_cred, &svmand);
1128 if (error != 0)
1129 goto out;
1130 if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
1131 NULL)) {
1132 error = EACCES;
1133 goto out;
1134 }
1135 }
1136
1137 aiov.iov_base = cbuf;
1138 aiov.iov_len = bcount;
1139 (void) VOP_RWLOCK(vp, rwflag, NULL);
1140 auio.uio_loffset = fileoff;
1141 auio.uio_iov = &aiov;
1142 auio.uio_iovcnt = 1;
1143 auio.uio_resid = bcount;
1144 auio.uio_segflg = UIO_USERSPACE;
1145 auio.uio_llimit = curproc->p_fsz_ctl;
1146 auio.uio_fmode = fflag;
1147 auio.uio_extflg = UIO_COPY_CACHED;
1148
1149 /*
1150 * The SUSv4 POSIX specification states:
1151 * The pwrite() function shall be equivalent to write(), except
1152 * that it writes into a given position and does not change
1153 * the file offset (regardless of whether O_APPEND is set).
1154 * To make this be true, we omit the FAPPEND flag from ioflag.
1155 */
1156 ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
1157
1158 error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
1159 bcount -= auio.uio_resid;
1160 CPU_STATS_ENTER_K();
1161 cp = CPU;
1162 CPU_STATS_ADDQ(cp, sys, syswrite, 1);
1163 CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
1164 CPU_STATS_EXIT_K();
1165 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1166 VOP_RWUNLOCK(vp, rwflag, NULL);
1167
1168 if (error == EINTR && bcount != 0)
1169 error = 0;
1170 out:
1171 if (in_crit)
1172 nbl_end_crit(vp);
1173 releasef(fdes);
1174 if (error)
1175 return (set_errno(error));
1176 return (bcount);
1177 }
1178
1179 #endif /* _SYSCALL32_IMPL || _ILP32 */
1180
1181 #ifdef _SYSCALL32_IMPL
1182 /*
1183 * Tail-call elimination of xxx32() down to xxx()
1184 *
1185 * A number of xxx32 system calls take a len (or count) argument and
1186 * return a number in the range [0,len] or -1 on error.
1187 * Given an ssize32_t input len, the downcall xxx() will return
1188 * a 64-bit value that is -1 or in the range [0,len] which actually
1189 * is a proper return value for the xxx32 call. So even if the xxx32
1190 * calls can be considered as returning a ssize32_t, they are currently
1191 * declared as returning a ssize_t as this enables tail-call elimination.
1192 *
1193 * The cast of len (or count) to ssize32_t is needed to ensure we pass
1194 * down negative input values as such and let the downcall handle error
1195 * reporting. Functions covered by this comments are:
1196 *
1197 * rw.c: read32, write32, pread32, pwrite32, readv32, writev32.
1198 * socksyscall.c: recv32, recvfrom32, send32, sendto32.
1199 * readlink.c: readlink32.
1200 */
1201
1202 ssize_t
read32(int32_t fdes,caddr32_t cbuf,size32_t count)1203 read32(int32_t fdes, caddr32_t cbuf, size32_t count)
1204 {
1205 return (read(fdes,
1206 (void *)(uintptr_t)cbuf, (ssize32_t)count));
1207 }
1208
1209 ssize_t
write32(int32_t fdes,caddr32_t cbuf,size32_t count)1210 write32(int32_t fdes, caddr32_t cbuf, size32_t count)
1211 {
1212 return (write(fdes,
1213 (void *)(uintptr_t)cbuf, (ssize32_t)count));
1214 }
1215
1216 ssize_t
pread32(int32_t fdes,caddr32_t cbuf,size32_t count,off32_t offset)1217 pread32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1218 {
1219 return (pread(fdes,
1220 (void *)(uintptr_t)cbuf, (ssize32_t)count,
1221 (off_t)(uint32_t)offset));
1222 }
1223
1224 ssize_t
pwrite32(int32_t fdes,caddr32_t cbuf,size32_t count,off32_t offset)1225 pwrite32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1226 {
1227 return (pwrite(fdes,
1228 (void *)(uintptr_t)cbuf, (ssize32_t)count,
1229 (off_t)(uint32_t)offset));
1230 }
1231
1232 ssize_t
readv32(int32_t fdes,caddr32_t iovp,int32_t iovcnt)1233 readv32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1234 {
1235 return (readv(fdes, (void *)(uintptr_t)iovp, iovcnt));
1236 }
1237
1238 ssize_t
writev32(int32_t fdes,caddr32_t iovp,int32_t iovcnt)1239 writev32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1240 {
1241 return (writev(fdes, (void *)(uintptr_t)iovp, iovcnt));
1242 }
1243
1244 #endif /* _SYSCALL32_IMPL */
1245