xref: /netbsd-src/sys/kern/sys_aio.c (revision 34335fd211163d14142f73fea8d0b7e87b6d07bc)
1*34335fd2Sriastradh /*	$NetBSD: sys_aio.c,v 1.50 2024/12/07 02:38:51 riastradh Exp $	*/
210c3d35cSrmind 
310c3d35cSrmind /*
4b8ea6ca4Srmind  * Copyright (c) 2007 Mindaugas Rasiukevicius <rmind at NetBSD org>
5c75dc327Srmind  * All rights reserved.
610c3d35cSrmind  *
710c3d35cSrmind  * Redistribution and use in source and binary forms, with or without
810c3d35cSrmind  * modification, are permitted provided that the following conditions
910c3d35cSrmind  * are met:
1010c3d35cSrmind  * 1. Redistributions of source code must retain the above copyright
1110c3d35cSrmind  *    notice, this list of conditions and the following disclaimer.
1210c3d35cSrmind  * 2. Redistributions in binary form must reproduce the above copyright
1310c3d35cSrmind  *    notice, this list of conditions and the following disclaimer in the
1410c3d35cSrmind  *    documentation and/or other materials provided with the distribution.
1510c3d35cSrmind  *
1606171502Srmind  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
1706171502Srmind  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1806171502Srmind  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1906171502Srmind  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
2006171502Srmind  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2106171502Srmind  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2206171502Srmind  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2306171502Srmind  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2406171502Srmind  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2506171502Srmind  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
2606171502Srmind  * SUCH DAMAGE.
2710c3d35cSrmind  */
2810c3d35cSrmind 
2910c3d35cSrmind /*
3006171502Srmind  * Implementation of POSIX asynchronous I/O.
3106171502Srmind  * Defined in the Base Definitions volume of IEEE Std 1003.1-2001.
3210c3d35cSrmind  */
3310c3d35cSrmind 
3410c3d35cSrmind #include <sys/cdefs.h>
35*34335fd2Sriastradh __KERNEL_RCSID(0, "$NetBSD: sys_aio.c,v 1.50 2024/12/07 02:38:51 riastradh Exp $");
360a227b19Srmind 
37e6a33851Sad #ifdef _KERNEL_OPT
380a227b19Srmind #include "opt_ddb.h"
39e6a33851Sad #endif
4010c3d35cSrmind 
4110c3d35cSrmind #include <sys/param.h>
424d07ef19Sriastradh #include <sys/types.h>
434d07ef19Sriastradh 
444d07ef19Sriastradh #include <sys/atomic.h>
454d07ef19Sriastradh #include <sys/buf.h>
4610c3d35cSrmind #include <sys/condvar.h>
4710c3d35cSrmind #include <sys/file.h>
4810c3d35cSrmind #include <sys/filedesc.h>
4910c3d35cSrmind #include <sys/kernel.h>
5010c3d35cSrmind #include <sys/kmem.h>
5110c3d35cSrmind #include <sys/lwp.h>
524d07ef19Sriastradh #include <sys/module.h>
5310c3d35cSrmind #include <sys/mutex.h>
5410c3d35cSrmind #include <sys/pool.h>
5510c3d35cSrmind #include <sys/proc.h>
5610c3d35cSrmind #include <sys/queue.h>
57*34335fd2Sriastradh #include <sys/sdt.h>
5810c3d35cSrmind #include <sys/signal.h>
5910c3d35cSrmind #include <sys/signalvar.h>
60e6a33851Sad #include <sys/syscall.h>
6110c3d35cSrmind #include <sys/syscallargs.h>
62e6a33851Sad #include <sys/syscallvar.h>
6310c3d35cSrmind #include <sys/sysctl.h>
6410c3d35cSrmind #include <sys/systm.h>
6510c3d35cSrmind #include <sys/types.h>
6610c3d35cSrmind #include <sys/vnode.h>
6710c3d35cSrmind 
6810c3d35cSrmind #include <uvm/uvm_extern.h>
6910c3d35cSrmind 
70e6a33851Sad MODULE(MODULE_CLASS_MISC, aio, NULL);
71e6a33851Sad 
7210c3d35cSrmind /*
7310c3d35cSrmind  * System-wide limits and counter of AIO operations.
7410c3d35cSrmind  */
75461a86f9Schristos u_int			aio_listio_max = AIO_LISTIO_MAX;
760a227b19Srmind static u_int		aio_max = AIO_MAX;
770a227b19Srmind static u_int		aio_jobs_count;
7810c3d35cSrmind 
790a227b19Srmind static struct pool	aio_job_pool;
800a227b19Srmind static struct pool	aio_lio_pool;
81e6a33851Sad static void *		aio_ehook;
8210c3d35cSrmind 
83b8562be5Syamt static void		aio_worker(void *);
8410c3d35cSrmind static void		aio_process(struct aio_job *);
8510c3d35cSrmind static void		aio_sendsig(struct proc *, struct sigevent *);
8610c3d35cSrmind static int		aio_enqueue_job(int, void *, struct lio_req *);
87e6a33851Sad static void		aio_exit(proc_t *, void *);
88e6a33851Sad 
8972795172Sjruoho static int		sysctl_aio_listio_max(SYSCTLFN_PROTO);
9072795172Sjruoho static int		sysctl_aio_max(SYSCTLFN_PROTO);
9172795172Sjruoho 
92e6a33851Sad static const struct syscall_package aio_syscalls[] = {
93e6a33851Sad 	{ SYS_aio_cancel, 0, (sy_call_t *)sys_aio_cancel },
94e6a33851Sad 	{ SYS_aio_error, 0, (sy_call_t *)sys_aio_error },
95e6a33851Sad 	{ SYS_aio_fsync, 0, (sy_call_t *)sys_aio_fsync },
96e6a33851Sad 	{ SYS_aio_read, 0, (sy_call_t *)sys_aio_read },
97e6a33851Sad 	{ SYS_aio_return, 0, (sy_call_t *)sys_aio_return },
98461a86f9Schristos 	{ SYS___aio_suspend50, 0, (sy_call_t *)sys___aio_suspend50 },
99e6a33851Sad 	{ SYS_aio_write, 0, (sy_call_t *)sys_aio_write },
100e6a33851Sad 	{ SYS_lio_listio, 0, (sy_call_t *)sys_lio_listio },
101e6a33851Sad 	{ 0, 0, NULL },
102e6a33851Sad };
10310c3d35cSrmind 
10410c3d35cSrmind /*
105e6a33851Sad  * Tear down all AIO state.
1060a227b19Srmind  */
107e6a33851Sad static int
108e6a33851Sad aio_fini(bool interface)
1090a227b19Srmind {
110e6a33851Sad 	int error;
111e6a33851Sad 	proc_t *p;
112e6a33851Sad 
113e6a33851Sad 	if (interface) {
114e6a33851Sad 		/* Stop syscall activity. */
115e6a33851Sad 		error = syscall_disestablish(NULL, aio_syscalls);
116e6a33851Sad 		if (error != 0)
117e6a33851Sad 			return error;
118e6a33851Sad 		/* Abort if any processes are using AIO. */
1190eaaa024Sad 		mutex_enter(&proc_lock);
120e6a33851Sad 		PROCLIST_FOREACH(p, &allproc) {
121e6a33851Sad 			if (p->p_aio != NULL)
122e6a33851Sad 				break;
123e6a33851Sad 		}
1240eaaa024Sad 		mutex_exit(&proc_lock);
125e6a33851Sad 		if (p != NULL) {
126e6a33851Sad 			error = syscall_establish(NULL, aio_syscalls);
127e6a33851Sad 			KASSERT(error == 0);
128*34335fd2Sriastradh 			return SET_ERROR(EBUSY);
129e6a33851Sad 		}
130e6a33851Sad 	}
13172795172Sjruoho 
132e6a33851Sad 	KASSERT(aio_jobs_count == 0);
133e6a33851Sad 	exithook_disestablish(aio_ehook);
134e6a33851Sad 	pool_destroy(&aio_job_pool);
135e6a33851Sad 	pool_destroy(&aio_lio_pool);
136e6a33851Sad 	return 0;
137e6a33851Sad }
138e6a33851Sad 
139e6a33851Sad /*
140e6a33851Sad  * Initialize global AIO state.
141e6a33851Sad  */
142e6a33851Sad static int
143e6a33851Sad aio_init(void)
144e6a33851Sad {
145e6a33851Sad 	int error;
1460a227b19Srmind 
1470a227b19Srmind 	pool_init(&aio_job_pool, sizeof(struct aio_job), 0, 0, 0,
1480a227b19Srmind 	    "aio_jobs_pool", &pool_allocator_nointr, IPL_NONE);
1490a227b19Srmind 	pool_init(&aio_lio_pool, sizeof(struct lio_req), 0, 0, 0,
1500a227b19Srmind 	    "aio_lio_pool", &pool_allocator_nointr, IPL_NONE);
151e6a33851Sad 	aio_ehook = exithook_establish(aio_exit, NULL);
15272795172Sjruoho 
153e6a33851Sad 	error = syscall_establish(NULL, aio_syscalls);
154e6a33851Sad 	if (error != 0)
15572795172Sjruoho 		(void)aio_fini(false);
156e6a33851Sad 	return error;
157e6a33851Sad }
158e6a33851Sad 
159e6a33851Sad /*
160e6a33851Sad  * Module interface.
161e6a33851Sad  */
162e6a33851Sad static int
163e6a33851Sad aio_modcmd(modcmd_t cmd, void *arg)
164e6a33851Sad {
165e6a33851Sad 
166e6a33851Sad 	switch (cmd) {
167e6a33851Sad 	case MODULE_CMD_INIT:
168e6a33851Sad 		return aio_init();
169e6a33851Sad 	case MODULE_CMD_FINI:
170e6a33851Sad 		return aio_fini(true);
171e6a33851Sad 	default:
172*34335fd2Sriastradh 		return SET_ERROR(ENOTTY);
173e6a33851Sad 	}
1740a227b19Srmind }
1750a227b19Srmind 
1760a227b19Srmind /*
17710c3d35cSrmind  * Initialize Asynchronous I/O data structures for the process.
17810c3d35cSrmind  */
179e6a33851Sad static int
180e6a33851Sad aio_procinit(struct proc *p)
18110c3d35cSrmind {
18210c3d35cSrmind 	struct aioproc *aio;
18310c3d35cSrmind 	struct lwp *l;
184d831186dSad 	int error;
18510c3d35cSrmind 	vaddr_t uaddr;
18610c3d35cSrmind 
18710c3d35cSrmind 	/* Allocate and initialize AIO structure */
1886ef6e006Sad 	aio = kmem_zalloc(sizeof(struct aioproc), KM_SLEEP);
18910c3d35cSrmind 
1900a227b19Srmind 	/* Initialize queue and their synchronization structures */
19110c3d35cSrmind 	mutex_init(&aio->aio_mtx, MUTEX_DEFAULT, IPL_NONE);
19210c3d35cSrmind 	cv_init(&aio->aio_worker_cv, "aiowork");
19310c3d35cSrmind 	cv_init(&aio->done_cv, "aiodone");
19410c3d35cSrmind 	TAILQ_INIT(&aio->jobs_queue);
19510c3d35cSrmind 
19610c3d35cSrmind 	/*
19710c3d35cSrmind 	 * Create an AIO worker thread.
19810c3d35cSrmind 	 * XXX: Currently, AIO thread is not protected against user's actions.
19910c3d35cSrmind 	 */
20040cf6f36Srmind 	uaddr = uvm_uarea_alloc();
20110c3d35cSrmind 	if (uaddr == 0) {
20259085afdSrmind 		aio_exit(p, aio);
203*34335fd2Sriastradh 		return SET_ERROR(EAGAIN);
20410c3d35cSrmind 	}
20540cf6f36Srmind 	error = lwp_create(curlwp, p, uaddr, 0, NULL, 0, aio_worker,
206d7746f2eSchristos 	    NULL, &l, curlwp->l_class, &curlwp->l_sigmask, &curlwp->l_sigstk);
207d831186dSad 	if (error != 0) {
20840cf6f36Srmind 		uvm_uarea_free(uaddr);
20959085afdSrmind 		aio_exit(p, aio);
210d831186dSad 		return error;
21110c3d35cSrmind 	}
21210c3d35cSrmind 
21359085afdSrmind 	/* Recheck if we are really first */
214284c2b9aSad 	mutex_enter(p->p_lock);
21559085afdSrmind 	if (p->p_aio) {
216284c2b9aSad 		mutex_exit(p->p_lock);
21759085afdSrmind 		aio_exit(p, aio);
21859085afdSrmind 		lwp_exit(l);
21959085afdSrmind 		return 0;
22059085afdSrmind 	}
22159085afdSrmind 	p->p_aio = aio;
22259085afdSrmind 
22310c3d35cSrmind 	/* Complete the initialization of thread, and run it */
22410c3d35cSrmind 	aio->aio_worker = l;
22510c3d35cSrmind 	lwp_lock(l);
22611ba4e18Sad 	lwp_changepri(l, MAXPRI_USER);
22711ba4e18Sad 	setrunnable(l);
22811ba4e18Sad 	/* LWP now unlocked */
229284c2b9aSad 	mutex_exit(p->p_lock);
23010c3d35cSrmind 
23110c3d35cSrmind 	return 0;
23210c3d35cSrmind }
23310c3d35cSrmind 
23410c3d35cSrmind /*
23510c3d35cSrmind  * Exit of Asynchronous I/O subsystem of process.
23610c3d35cSrmind  */
237e6a33851Sad static void
238e6a33851Sad aio_exit(struct proc *p, void *cookie)
23910c3d35cSrmind {
24010c3d35cSrmind 	struct aio_job *a_job;
241e6a33851Sad 	struct aioproc *aio;
24210c3d35cSrmind 
243e6a33851Sad 	if (cookie != NULL)
244e6a33851Sad 		aio = cookie;
245e6a33851Sad 	else if ((aio = p->p_aio) == NULL)
24610c3d35cSrmind 		return;
24710c3d35cSrmind 
24810c3d35cSrmind 	/* Free AIO queue */
24910c3d35cSrmind 	while (!TAILQ_EMPTY(&aio->jobs_queue)) {
25010c3d35cSrmind 		a_job = TAILQ_FIRST(&aio->jobs_queue);
25110c3d35cSrmind 		TAILQ_REMOVE(&aio->jobs_queue, a_job, list);
2520a227b19Srmind 		pool_put(&aio_job_pool, a_job);
253a45b048eSad 		atomic_dec_uint(&aio_jobs_count);
25410c3d35cSrmind 	}
25510c3d35cSrmind 
25610c3d35cSrmind 	/* Destroy and free the entire AIO data structure */
25710c3d35cSrmind 	cv_destroy(&aio->aio_worker_cv);
25810c3d35cSrmind 	cv_destroy(&aio->done_cv);
25910c3d35cSrmind 	mutex_destroy(&aio->aio_mtx);
26010c3d35cSrmind 	kmem_free(aio, sizeof(struct aioproc));
26110c3d35cSrmind }
26210c3d35cSrmind 
26310c3d35cSrmind /*
26410c3d35cSrmind  * AIO worker thread and processor.
26510c3d35cSrmind  */
266b8562be5Syamt static void
26710c3d35cSrmind aio_worker(void *arg)
26810c3d35cSrmind {
26910c3d35cSrmind 	struct proc *p = curlwp->l_proc;
27010c3d35cSrmind 	struct aioproc *aio = p->p_aio;
27110c3d35cSrmind 	struct aio_job *a_job;
27210c3d35cSrmind 	struct lio_req *lio;
27310c3d35cSrmind 	sigset_t oss, nss;
2747c79fd6cSmartin 	int error __diagused, refcnt;
27510c3d35cSrmind 
27610c3d35cSrmind 	/*
27710c3d35cSrmind 	 * Make an empty signal mask, so it
27810c3d35cSrmind 	 * handles only SIGKILL and SIGSTOP.
27910c3d35cSrmind 	 */
28010c3d35cSrmind 	sigfillset(&nss);
281284c2b9aSad 	mutex_enter(p->p_lock);
28210c3d35cSrmind 	error = sigprocmask1(curlwp, SIG_SETMASK, &nss, &oss);
283284c2b9aSad 	mutex_exit(p->p_lock);
2840a227b19Srmind 	KASSERT(error == 0);
28510c3d35cSrmind 
28610c3d35cSrmind 	for (;;) {
28710c3d35cSrmind 		/*
28810c3d35cSrmind 		 * Loop for each job in the queue.  If there
2890a227b19Srmind 		 * are no jobs then sleep.
29010c3d35cSrmind 		 */
29110c3d35cSrmind 		mutex_enter(&aio->aio_mtx);
29210c3d35cSrmind 		while ((a_job = TAILQ_FIRST(&aio->jobs_queue)) == NULL) {
29310c3d35cSrmind 			if (cv_wait_sig(&aio->aio_worker_cv, &aio->aio_mtx)) {
29410c3d35cSrmind 				/*
2950a227b19Srmind 				 * Thread was interrupted - check for
2960a227b19Srmind 				 * pending exit or suspend.
29710c3d35cSrmind 				 */
2980a227b19Srmind 				mutex_exit(&aio->aio_mtx);
2990a227b19Srmind 				lwp_userret(curlwp);
3000a227b19Srmind 				mutex_enter(&aio->aio_mtx);
30110c3d35cSrmind 			}
30210c3d35cSrmind 		}
30310c3d35cSrmind 
30410c3d35cSrmind 		/* Take the job from the queue */
30510c3d35cSrmind 		aio->curjob = a_job;
30610c3d35cSrmind 		TAILQ_REMOVE(&aio->jobs_queue, a_job, list);
30710c3d35cSrmind 
308a45b048eSad 		atomic_dec_uint(&aio_jobs_count);
30910c3d35cSrmind 		aio->jobs_count--;
31010c3d35cSrmind 
31110c3d35cSrmind 		mutex_exit(&aio->aio_mtx);
31210c3d35cSrmind 
31310c3d35cSrmind 		/* Process an AIO operation */
31410c3d35cSrmind 		aio_process(a_job);
31510c3d35cSrmind 
31610c3d35cSrmind 		/* Copy data structure back to the user-space */
31710c3d35cSrmind 		(void)copyout(&a_job->aiocbp, a_job->aiocb_uptr,
31810c3d35cSrmind 		    sizeof(struct aiocb));
31910c3d35cSrmind 
32010c3d35cSrmind 		mutex_enter(&aio->aio_mtx);
32107928aacSyamt 		KASSERT(aio->curjob == a_job);
32210c3d35cSrmind 		aio->curjob = NULL;
3230a227b19Srmind 
32410c3d35cSrmind 		/* Decrease a reference counter, if there is a LIO structure */
32510c3d35cSrmind 		lio = a_job->lio;
3260a227b19Srmind 		refcnt = (lio != NULL ? --lio->refcnt : -1);
3270a227b19Srmind 
32810c3d35cSrmind 		/* Notify all suspenders */
32910c3d35cSrmind 		cv_broadcast(&aio->done_cv);
33010c3d35cSrmind 		mutex_exit(&aio->aio_mtx);
33110c3d35cSrmind 
33210c3d35cSrmind 		/* Send a signal, if any */
33310c3d35cSrmind 		aio_sendsig(p, &a_job->aiocbp.aio_sigevent);
33410c3d35cSrmind 
33510c3d35cSrmind 		/* Destroy the LIO structure */
3360a227b19Srmind 		if (refcnt == 0) {
33710c3d35cSrmind 			aio_sendsig(p, &lio->sig);
3380a227b19Srmind 			pool_put(&aio_lio_pool, lio);
33910c3d35cSrmind 		}
34010c3d35cSrmind 
3410ae57f90Smbalmer 		/* Destroy the job */
3420a227b19Srmind 		pool_put(&aio_job_pool, a_job);
34310c3d35cSrmind 	}
34410c3d35cSrmind 
3450a227b19Srmind 	/* NOTREACHED */
34610c3d35cSrmind }
34710c3d35cSrmind 
34810c3d35cSrmind static void
34910c3d35cSrmind aio_process(struct aio_job *a_job)
35010c3d35cSrmind {
35110c3d35cSrmind 	struct proc *p = curlwp->l_proc;
35210c3d35cSrmind 	struct aiocb *aiocbp = &a_job->aiocbp;
35310c3d35cSrmind 	struct file *fp;
35410c3d35cSrmind 	int fd = aiocbp->aio_fildes;
35510c3d35cSrmind 	int error = 0;
35610c3d35cSrmind 
35710c3d35cSrmind 	KASSERT(a_job->aio_op != 0);
35810c3d35cSrmind 
3590a227b19Srmind 	if ((a_job->aio_op & (AIO_READ | AIO_WRITE)) != 0) {
36010c3d35cSrmind 		struct iovec aiov;
36110c3d35cSrmind 		struct uio auio;
36210c3d35cSrmind 
36310c3d35cSrmind 		if (aiocbp->aio_nbytes > SSIZE_MAX) {
364*34335fd2Sriastradh 			error = SET_ERROR(EINVAL);
36510c3d35cSrmind 			goto done;
36610c3d35cSrmind 		}
36710c3d35cSrmind 
368a9ca7a37Sad 		fp = fd_getfile(fd);
36910c3d35cSrmind 		if (fp == NULL) {
370*34335fd2Sriastradh 			error = SET_ERROR(EBADF);
37110c3d35cSrmind 			goto done;
37210c3d35cSrmind 		}
37310c3d35cSrmind 
37410c3d35cSrmind 		aiov.iov_base = (void *)(uintptr_t)aiocbp->aio_buf;
37510c3d35cSrmind 		aiov.iov_len = aiocbp->aio_nbytes;
37610c3d35cSrmind 		auio.uio_iov = &aiov;
37710c3d35cSrmind 		auio.uio_iovcnt = 1;
37810c3d35cSrmind 		auio.uio_resid = aiocbp->aio_nbytes;
37910c3d35cSrmind 		auio.uio_vmspace = p->p_vmspace;
38010c3d35cSrmind 
38110c3d35cSrmind 		if (a_job->aio_op & AIO_READ) {
38210c3d35cSrmind 			/*
38310c3d35cSrmind 			 * Perform a Read operation
38410c3d35cSrmind 			 */
38510c3d35cSrmind 			KASSERT((a_job->aio_op & AIO_WRITE) == 0);
38610c3d35cSrmind 
38710c3d35cSrmind 			if ((fp->f_flag & FREAD) == 0) {
388a9ca7a37Sad 				fd_putfile(fd);
389*34335fd2Sriastradh 				error = SET_ERROR(EBADF);
39010c3d35cSrmind 				goto done;
39110c3d35cSrmind 			}
39210c3d35cSrmind 			auio.uio_rw = UIO_READ;
39310c3d35cSrmind 			error = (*fp->f_ops->fo_read)(fp, &aiocbp->aio_offset,
39410c3d35cSrmind 			    &auio, fp->f_cred, FOF_UPDATE_OFFSET);
39510c3d35cSrmind 		} else {
39610c3d35cSrmind 			/*
39710c3d35cSrmind 			 * Perform a Write operation
39810c3d35cSrmind 			 */
39910c3d35cSrmind 			KASSERT(a_job->aio_op & AIO_WRITE);
40010c3d35cSrmind 
40110c3d35cSrmind 			if ((fp->f_flag & FWRITE) == 0) {
402a9ca7a37Sad 				fd_putfile(fd);
403*34335fd2Sriastradh 				error = SET_ERROR(EBADF);
40410c3d35cSrmind 				goto done;
40510c3d35cSrmind 			}
40610c3d35cSrmind 			auio.uio_rw = UIO_WRITE;
40710c3d35cSrmind 			error = (*fp->f_ops->fo_write)(fp, &aiocbp->aio_offset,
40810c3d35cSrmind 			    &auio, fp->f_cred, FOF_UPDATE_OFFSET);
40910c3d35cSrmind 		}
410a9ca7a37Sad 		fd_putfile(fd);
41110c3d35cSrmind 
41210c3d35cSrmind 		/* Store the result value */
41310c3d35cSrmind 		a_job->aiocbp.aio_nbytes -= auio.uio_resid;
41410c3d35cSrmind 		a_job->aiocbp._retval = (error == 0) ?
41510c3d35cSrmind 		    a_job->aiocbp.aio_nbytes : -1;
41610c3d35cSrmind 
4170a227b19Srmind 	} else if ((a_job->aio_op & (AIO_SYNC | AIO_DSYNC)) != 0) {
41810c3d35cSrmind 		/*
41910c3d35cSrmind 		 * Perform a file Sync operation
42010c3d35cSrmind 		 */
42110c3d35cSrmind 		struct vnode *vp;
42210c3d35cSrmind 
423a9ca7a37Sad 		if ((error = fd_getvnode(fd, &fp)) != 0)
42410c3d35cSrmind 			goto done;
42510c3d35cSrmind 
42610c3d35cSrmind 		if ((fp->f_flag & FWRITE) == 0) {
427a9ca7a37Sad 			fd_putfile(fd);
428*34335fd2Sriastradh 			error = SET_ERROR(EBADF);
42910c3d35cSrmind 			goto done;
43010c3d35cSrmind 		}
43110c3d35cSrmind 
43245b1ec74Smatt 		vp = fp->f_vnode;
43310c3d35cSrmind 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
43410c3d35cSrmind 		if (a_job->aio_op & AIO_DSYNC) {
43510c3d35cSrmind 			error = VOP_FSYNC(vp, fp->f_cred,
43661e8303eSpooka 			    FSYNC_WAIT | FSYNC_DATAONLY, 0, 0);
43710c3d35cSrmind 		} else if (a_job->aio_op & AIO_SYNC) {
43810c3d35cSrmind 			error = VOP_FSYNC(vp, fp->f_cred,
43961e8303eSpooka 			    FSYNC_WAIT, 0, 0);
44010c3d35cSrmind 		}
4411423e65bShannken 		VOP_UNLOCK(vp);
442a9ca7a37Sad 		fd_putfile(fd);
44310c3d35cSrmind 
44410c3d35cSrmind 		/* Store the result value */
44510c3d35cSrmind 		a_job->aiocbp._retval = (error == 0) ? 0 : -1;
44610c3d35cSrmind 
44710c3d35cSrmind 	} else
44810c3d35cSrmind 		panic("aio_process: invalid operation code\n");
44910c3d35cSrmind 
45010c3d35cSrmind done:
45110c3d35cSrmind 	/* Job is done, set the error, if any */
45210c3d35cSrmind 	a_job->aiocbp._errno = error;
45310c3d35cSrmind 	a_job->aiocbp._state = JOB_DONE;
45410c3d35cSrmind }
45510c3d35cSrmind 
45610c3d35cSrmind /*
45710c3d35cSrmind  * Send AIO signal.
45810c3d35cSrmind  */
45910c3d35cSrmind static void
46010c3d35cSrmind aio_sendsig(struct proc *p, struct sigevent *sig)
46110c3d35cSrmind {
46210c3d35cSrmind 	ksiginfo_t ksi;
46310c3d35cSrmind 
46410c3d35cSrmind 	if (sig->sigev_signo == 0 || sig->sigev_notify == SIGEV_NONE)
46510c3d35cSrmind 		return;
46610c3d35cSrmind 
46710c3d35cSrmind 	KSI_INIT(&ksi);
46810c3d35cSrmind 	ksi.ksi_signo = sig->sigev_signo;
46910c3d35cSrmind 	ksi.ksi_code = SI_ASYNCIO;
470c61eed39Schristos 	ksi.ksi_value = sig->sigev_value;
4710eaaa024Sad 	mutex_enter(&proc_lock);
47210c3d35cSrmind 	kpsignal(p, &ksi, NULL);
4730eaaa024Sad 	mutex_exit(&proc_lock);
47410c3d35cSrmind }
47510c3d35cSrmind 
47610c3d35cSrmind /*
47710c3d35cSrmind  * Enqueue the job.
47810c3d35cSrmind  */
47910c3d35cSrmind static int
48010c3d35cSrmind aio_enqueue_job(int op, void *aiocb_uptr, struct lio_req *lio)
48110c3d35cSrmind {
48210c3d35cSrmind 	struct proc *p = curlwp->l_proc;
48310c3d35cSrmind 	struct aioproc *aio;
48410c3d35cSrmind 	struct aio_job *a_job;
48510c3d35cSrmind 	struct aiocb aiocbp;
48610c3d35cSrmind 	struct sigevent *sig;
48710c3d35cSrmind 	int error;
48810c3d35cSrmind 
4890e5c3c74Srmind 	/* Non-accurate check for the limit */
4900e5c3c74Srmind 	if (aio_jobs_count + 1 > aio_max)
491*34335fd2Sriastradh 		return SET_ERROR(EAGAIN);
49210c3d35cSrmind 
49310c3d35cSrmind 	/* Get the data structure from user-space */
49410c3d35cSrmind 	error = copyin(aiocb_uptr, &aiocbp, sizeof(struct aiocb));
49510c3d35cSrmind 	if (error)
49610c3d35cSrmind 		return error;
49710c3d35cSrmind 
49810c3d35cSrmind 	/* Check if signal is set, and validate it */
49910c3d35cSrmind 	sig = &aiocbp.aio_sigevent;
50010c3d35cSrmind 	if (sig->sigev_signo < 0 || sig->sigev_signo >= NSIG ||
50110c3d35cSrmind 	    sig->sigev_notify < SIGEV_NONE || sig->sigev_notify > SIGEV_SA)
502*34335fd2Sriastradh 		return SET_ERROR(EINVAL);
50310c3d35cSrmind 
50410c3d35cSrmind 	/* Buffer and byte count */
50510c3d35cSrmind 	if (((AIO_SYNC | AIO_DSYNC) & op) == 0)
50610c3d35cSrmind 		if (aiocbp.aio_buf == NULL || aiocbp.aio_nbytes > SSIZE_MAX)
507*34335fd2Sriastradh 			return SET_ERROR(EINVAL);
50810c3d35cSrmind 
50910c3d35cSrmind 	/* Check the opcode, if LIO_NOP - simply ignore */
51010c3d35cSrmind 	if (op == AIO_LIO) {
51110c3d35cSrmind 		KASSERT(lio != NULL);
51210c3d35cSrmind 		if (aiocbp.aio_lio_opcode == LIO_WRITE)
51310c3d35cSrmind 			op = AIO_WRITE;
51410c3d35cSrmind 		else if (aiocbp.aio_lio_opcode == LIO_READ)
51510c3d35cSrmind 			op = AIO_READ;
51610c3d35cSrmind 		else
517*34335fd2Sriastradh 			return (aiocbp.aio_lio_opcode == LIO_NOP) ? 0 :
518*34335fd2Sriastradh 			    SET_ERROR(EINVAL);
51910c3d35cSrmind 	} else {
52010c3d35cSrmind 		KASSERT(lio == NULL);
52110c3d35cSrmind 	}
52210c3d35cSrmind 
52310c3d35cSrmind 	/*
52410c3d35cSrmind 	 * Look for already existing job.  If found - the job is in-progress.
52510c3d35cSrmind 	 * According to POSIX this is invalid, so return the error.
52610c3d35cSrmind 	 */
52710c3d35cSrmind 	aio = p->p_aio;
52810c3d35cSrmind 	if (aio) {
52910c3d35cSrmind 		mutex_enter(&aio->aio_mtx);
53010c3d35cSrmind 		TAILQ_FOREACH(a_job, &aio->jobs_queue, list) {
53110c3d35cSrmind 			if (a_job->aiocb_uptr != aiocb_uptr)
53210c3d35cSrmind 				continue;
53310c3d35cSrmind 			mutex_exit(&aio->aio_mtx);
534*34335fd2Sriastradh 			return SET_ERROR(EINVAL);
53510c3d35cSrmind 		}
53610c3d35cSrmind 		mutex_exit(&aio->aio_mtx);
53710c3d35cSrmind 	}
53810c3d35cSrmind 
53910c3d35cSrmind 	/*
54010c3d35cSrmind 	 * Check if AIO structure is initialized, if not - initialize it.
54110c3d35cSrmind 	 * In LIO case, we did that already.  We will recheck this with
542e6a33851Sad 	 * the lock in aio_procinit().
54310c3d35cSrmind 	 */
54410c3d35cSrmind 	if (lio == NULL && p->p_aio == NULL)
545e6a33851Sad 		if (aio_procinit(p))
546*34335fd2Sriastradh 			return SET_ERROR(EAGAIN);
54710c3d35cSrmind 	aio = p->p_aio;
54810c3d35cSrmind 
54910c3d35cSrmind 	/*
55010c3d35cSrmind 	 * Set the state with errno, and copy data
55110c3d35cSrmind 	 * structure back to the user-space.
55210c3d35cSrmind 	 */
55310c3d35cSrmind 	aiocbp._state = JOB_WIP;
554*34335fd2Sriastradh 	aiocbp._errno = SET_ERROR(EINPROGRESS);
55510c3d35cSrmind 	aiocbp._retval = -1;
55610c3d35cSrmind 	error = copyout(&aiocbp, aiocb_uptr, sizeof(struct aiocb));
55710c3d35cSrmind 	if (error)
55810c3d35cSrmind 		return error;
55910c3d35cSrmind 
56010c3d35cSrmind 	/* Allocate and initialize a new AIO job */
5619577643dSchristos 	a_job = pool_get(&aio_job_pool, PR_WAITOK | PR_ZERO);
56210c3d35cSrmind 
56310c3d35cSrmind 	/*
56410c3d35cSrmind 	 * Set the data.
56510c3d35cSrmind 	 * Store the user-space pointer for searching.  Since we
56610c3d35cSrmind 	 * are storing only per proc pointers - it is safe.
56710c3d35cSrmind 	 */
56810c3d35cSrmind 	memcpy(&a_job->aiocbp, &aiocbp, sizeof(struct aiocb));
56910c3d35cSrmind 	a_job->aiocb_uptr = aiocb_uptr;
57010c3d35cSrmind 	a_job->aio_op |= op;
57110c3d35cSrmind 	a_job->lio = lio;
57210c3d35cSrmind 
57310c3d35cSrmind 	/*
57410c3d35cSrmind 	 * Add the job to the queue, update the counters, and
57510c3d35cSrmind 	 * notify the AIO worker thread to handle the job.
57610c3d35cSrmind 	 */
57710c3d35cSrmind 	mutex_enter(&aio->aio_mtx);
57810c3d35cSrmind 
57910c3d35cSrmind 	/* Fail, if the limit was reached */
580cbb39165Srmind 	if (atomic_inc_uint_nv(&aio_jobs_count) > aio_max ||
581cbb39165Srmind 	    aio->jobs_count >= aio_listio_max) {
5820e5c3c74Srmind 		atomic_dec_uint(&aio_jobs_count);
58310c3d35cSrmind 		mutex_exit(&aio->aio_mtx);
5840a227b19Srmind 		pool_put(&aio_job_pool, a_job);
585*34335fd2Sriastradh 		return SET_ERROR(EAGAIN);
58610c3d35cSrmind 	}
58710c3d35cSrmind 
58810c3d35cSrmind 	TAILQ_INSERT_TAIL(&aio->jobs_queue, a_job, list);
58910c3d35cSrmind 	aio->jobs_count++;
59010c3d35cSrmind 	if (lio)
59110c3d35cSrmind 		lio->refcnt++;
59210c3d35cSrmind 	cv_signal(&aio->aio_worker_cv);
59310c3d35cSrmind 
59410c3d35cSrmind 	mutex_exit(&aio->aio_mtx);
59510c3d35cSrmind 
59610c3d35cSrmind 	/*
59710c3d35cSrmind 	 * One would handle the errors only with aio_error() function.
59810c3d35cSrmind 	 * This way is appropriate according to POSIX.
59910c3d35cSrmind 	 */
60010c3d35cSrmind 	return 0;
60110c3d35cSrmind }
60210c3d35cSrmind 
60310c3d35cSrmind /*
60410c3d35cSrmind  * Syscall functions.
60510c3d35cSrmind  */
60610c3d35cSrmind 
60710c3d35cSrmind int
60829e552b0Syamt sys_aio_cancel(struct lwp *l, const struct sys_aio_cancel_args *uap,
60929e552b0Syamt     register_t *retval)
61010c3d35cSrmind {
6117e2790cfSdsl 	/* {
61210c3d35cSrmind 		syscallarg(int) fildes;
61310c3d35cSrmind 		syscallarg(struct aiocb *) aiocbp;
6147e2790cfSdsl 	} */
61510c3d35cSrmind 	struct proc *p = l->l_proc;
61610c3d35cSrmind 	struct aioproc *aio;
61710c3d35cSrmind 	struct aio_job *a_job;
61810c3d35cSrmind 	struct aiocb *aiocbp_ptr;
61910c3d35cSrmind 	struct lio_req *lio;
62010c3d35cSrmind 	struct filedesc	*fdp = p->p_fd;
62110c3d35cSrmind 	unsigned int cn, errcnt, fildes;
622d991fcb3Sad 	fdtab_t *dt;
62310c3d35cSrmind 
62410c3d35cSrmind 	TAILQ_HEAD(, aio_job) tmp_jobs_list;
62510c3d35cSrmind 
62610c3d35cSrmind 	/* Check for invalid file descriptor */
62710c3d35cSrmind 	fildes = (unsigned int)SCARG(uap, fildes);
6288e6cd4ceSriastradh 	dt = atomic_load_consume(&fdp->fd_dt);
629d991fcb3Sad 	if (fildes >= dt->dt_nfiles)
630*34335fd2Sriastradh 		return SET_ERROR(EBADF);
631d991fcb3Sad 	if (dt->dt_ff[fildes] == NULL || dt->dt_ff[fildes]->ff_file == NULL)
632*34335fd2Sriastradh 		return SET_ERROR(EBADF);
63310c3d35cSrmind 
63410c3d35cSrmind 	/* Check if AIO structure is initialized */
63510c3d35cSrmind 	if (p->p_aio == NULL) {
63610c3d35cSrmind 		*retval = AIO_NOTCANCELED;
63710c3d35cSrmind 		return 0;
63810c3d35cSrmind 	}
63910c3d35cSrmind 
64010c3d35cSrmind 	aio = p->p_aio;
64110c3d35cSrmind 	aiocbp_ptr = (struct aiocb *)SCARG(uap, aiocbp);
64210c3d35cSrmind 
64310c3d35cSrmind 	mutex_enter(&aio->aio_mtx);
64410c3d35cSrmind 
64510c3d35cSrmind 	/* Cancel the jobs, and remove them from the queue */
64610c3d35cSrmind 	cn = 0;
64710c3d35cSrmind 	TAILQ_INIT(&tmp_jobs_list);
64810c3d35cSrmind 	TAILQ_FOREACH(a_job, &aio->jobs_queue, list) {
64910c3d35cSrmind 		if (aiocbp_ptr) {
65010c3d35cSrmind 			if (aiocbp_ptr != a_job->aiocb_uptr)
65110c3d35cSrmind 				continue;
65210c3d35cSrmind 			if (fildes != a_job->aiocbp.aio_fildes) {
65310c3d35cSrmind 				mutex_exit(&aio->aio_mtx);
654*34335fd2Sriastradh 				return SET_ERROR(EBADF);
65510c3d35cSrmind 			}
65610c3d35cSrmind 		} else if (a_job->aiocbp.aio_fildes != fildes)
65710c3d35cSrmind 			continue;
65810c3d35cSrmind 
65910c3d35cSrmind 		TAILQ_REMOVE(&aio->jobs_queue, a_job, list);
66010c3d35cSrmind 		TAILQ_INSERT_TAIL(&tmp_jobs_list, a_job, list);
66110c3d35cSrmind 
66210c3d35cSrmind 		/* Decrease the counters */
663a45b048eSad 		atomic_dec_uint(&aio_jobs_count);
66410c3d35cSrmind 		aio->jobs_count--;
66510c3d35cSrmind 		lio = a_job->lio;
6660a227b19Srmind 		if (lio != NULL && --lio->refcnt != 0)
66710c3d35cSrmind 			a_job->lio = NULL;
66810c3d35cSrmind 
66910c3d35cSrmind 		cn++;
67010c3d35cSrmind 		if (aiocbp_ptr)
67110c3d35cSrmind 			break;
67210c3d35cSrmind 	}
67310c3d35cSrmind 
67410c3d35cSrmind 	/* There are canceled jobs */
67510c3d35cSrmind 	if (cn)
67610c3d35cSrmind 		*retval = AIO_CANCELED;
67710c3d35cSrmind 
67810c3d35cSrmind 	/* We cannot cancel current job */
67910c3d35cSrmind 	a_job = aio->curjob;
68010c3d35cSrmind 	if (a_job && ((a_job->aiocbp.aio_fildes == fildes) ||
68110c3d35cSrmind 	    (a_job->aiocb_uptr == aiocbp_ptr)))
68210c3d35cSrmind 		*retval = AIO_NOTCANCELED;
68310c3d35cSrmind 
68410c3d35cSrmind 	mutex_exit(&aio->aio_mtx);
68510c3d35cSrmind 
68610c3d35cSrmind 	/* Free the jobs after the lock */
68710c3d35cSrmind 	errcnt = 0;
68810c3d35cSrmind 	while (!TAILQ_EMPTY(&tmp_jobs_list)) {
68910c3d35cSrmind 		a_job = TAILQ_FIRST(&tmp_jobs_list);
69010c3d35cSrmind 		TAILQ_REMOVE(&tmp_jobs_list, a_job, list);
69110c3d35cSrmind 		/* Set the errno and copy structures back to the user-space */
692*34335fd2Sriastradh 		a_job->aiocbp._errno = SET_ERROR(ECANCELED);
69310c3d35cSrmind 		a_job->aiocbp._state = JOB_DONE;
69410c3d35cSrmind 		if (copyout(&a_job->aiocbp, a_job->aiocb_uptr,
69510c3d35cSrmind 		    sizeof(struct aiocb)))
69610c3d35cSrmind 			errcnt++;
69710c3d35cSrmind 		/* Send a signal if any */
69810c3d35cSrmind 		aio_sendsig(p, &a_job->aiocbp.aio_sigevent);
6995023159eSrmind 		if (a_job->lio) {
7005023159eSrmind 			lio = a_job->lio;
7015023159eSrmind 			aio_sendsig(p, &lio->sig);
7025023159eSrmind 			pool_put(&aio_lio_pool, lio);
7035023159eSrmind 		}
7040a227b19Srmind 		pool_put(&aio_job_pool, a_job);
70510c3d35cSrmind 	}
70610c3d35cSrmind 
70710c3d35cSrmind 	if (errcnt)
708*34335fd2Sriastradh 		return SET_ERROR(EFAULT);
70910c3d35cSrmind 
71010c3d35cSrmind 	/* Set a correct return value */
71110c3d35cSrmind 	if (*retval == 0)
71210c3d35cSrmind 		*retval = AIO_ALLDONE;
71310c3d35cSrmind 
71410c3d35cSrmind 	return 0;
71510c3d35cSrmind }
71610c3d35cSrmind 
71710c3d35cSrmind int
71829e552b0Syamt sys_aio_error(struct lwp *l, const struct sys_aio_error_args *uap,
71929e552b0Syamt     register_t *retval)
72010c3d35cSrmind {
7217e2790cfSdsl 	/* {
72210c3d35cSrmind 		syscallarg(const struct aiocb *) aiocbp;
7237e2790cfSdsl 	} */
72410c3d35cSrmind 	struct proc *p = l->l_proc;
72510c3d35cSrmind 	struct aioproc *aio = p->p_aio;
72610c3d35cSrmind 	struct aiocb aiocbp;
72710c3d35cSrmind 	int error;
72810c3d35cSrmind 
72910c3d35cSrmind 	if (aio == NULL)
730*34335fd2Sriastradh 		return SET_ERROR(EINVAL);
73110c3d35cSrmind 
73210c3d35cSrmind 	error = copyin(SCARG(uap, aiocbp), &aiocbp, sizeof(struct aiocb));
73310c3d35cSrmind 	if (error)
73410c3d35cSrmind 		return error;
73510c3d35cSrmind 
73610c3d35cSrmind 	if (aiocbp._state == JOB_NONE)
737*34335fd2Sriastradh 		return SET_ERROR(EINVAL);
73810c3d35cSrmind 
73910c3d35cSrmind 	*retval = aiocbp._errno;
74010c3d35cSrmind 
74110c3d35cSrmind 	return 0;
74210c3d35cSrmind }
74310c3d35cSrmind 
74410c3d35cSrmind int
74529e552b0Syamt sys_aio_fsync(struct lwp *l, const struct sys_aio_fsync_args *uap,
74629e552b0Syamt     register_t *retval)
74710c3d35cSrmind {
7487e2790cfSdsl 	/* {
74910c3d35cSrmind 		syscallarg(int) op;
75010c3d35cSrmind 		syscallarg(struct aiocb *) aiocbp;
7517e2790cfSdsl 	} */
75210c3d35cSrmind 	int op = SCARG(uap, op);
75310c3d35cSrmind 
75410c3d35cSrmind 	if ((op != O_DSYNC) && (op != O_SYNC))
755*34335fd2Sriastradh 		return SET_ERROR(EINVAL);
75610c3d35cSrmind 
75710c3d35cSrmind 	op = O_DSYNC ? AIO_DSYNC : AIO_SYNC;
75810c3d35cSrmind 
75910c3d35cSrmind 	return aio_enqueue_job(op, SCARG(uap, aiocbp), NULL);
76010c3d35cSrmind }
76110c3d35cSrmind 
76210c3d35cSrmind int
76329e552b0Syamt sys_aio_read(struct lwp *l, const struct sys_aio_read_args *uap,
76429e552b0Syamt     register_t *retval)
76510c3d35cSrmind {
7667e2790cfSdsl 	/* {
76710c3d35cSrmind 		syscallarg(struct aiocb *) aiocbp;
7687e2790cfSdsl 	} */
76910c3d35cSrmind 
77010c3d35cSrmind 	return aio_enqueue_job(AIO_READ, SCARG(uap, aiocbp), NULL);
77110c3d35cSrmind }
77210c3d35cSrmind 
77310c3d35cSrmind int
77429e552b0Syamt sys_aio_return(struct lwp *l, const struct sys_aio_return_args *uap,
77529e552b0Syamt     register_t *retval)
77610c3d35cSrmind {
7777e2790cfSdsl 	/* {
77810c3d35cSrmind 		syscallarg(struct aiocb *) aiocbp;
7797e2790cfSdsl 	} */
78010c3d35cSrmind 	struct proc *p = l->l_proc;
78110c3d35cSrmind 	struct aioproc *aio = p->p_aio;
78210c3d35cSrmind 	struct aiocb aiocbp;
78310c3d35cSrmind 	int error;
78410c3d35cSrmind 
78510c3d35cSrmind 	if (aio == NULL)
786*34335fd2Sriastradh 		return SET_ERROR(EINVAL);
78710c3d35cSrmind 
78810c3d35cSrmind 	error = copyin(SCARG(uap, aiocbp), &aiocbp, sizeof(struct aiocb));
78910c3d35cSrmind 	if (error)
79010c3d35cSrmind 		return error;
79110c3d35cSrmind 
79210c3d35cSrmind 	if (aiocbp._errno == EINPROGRESS || aiocbp._state != JOB_DONE)
793*34335fd2Sriastradh 		return SET_ERROR(EINVAL);
79410c3d35cSrmind 
79510c3d35cSrmind 	*retval = aiocbp._retval;
79610c3d35cSrmind 
79710c3d35cSrmind 	/* Reset the internal variables */
79810c3d35cSrmind 	aiocbp._errno = 0;
79910c3d35cSrmind 	aiocbp._retval = -1;
80010c3d35cSrmind 	aiocbp._state = JOB_NONE;
80110c3d35cSrmind 	error = copyout(&aiocbp, SCARG(uap, aiocbp), sizeof(struct aiocb));
80210c3d35cSrmind 
80310c3d35cSrmind 	return error;
80410c3d35cSrmind }
80510c3d35cSrmind 
80610c3d35cSrmind int
807461a86f9Schristos sys___aio_suspend50(struct lwp *l, const struct sys___aio_suspend50_args *uap,
808461a86f9Schristos     register_t *retval)
80910c3d35cSrmind {
8107e2790cfSdsl 	/* {
81110c3d35cSrmind 		syscallarg(const struct aiocb *const[]) list;
81210c3d35cSrmind 		syscallarg(int) nent;
81310c3d35cSrmind 		syscallarg(const struct timespec *) timeout;
8147e2790cfSdsl 	} */
815461a86f9Schristos 	struct aiocb **list;
81610c3d35cSrmind 	struct timespec ts;
817461a86f9Schristos 	int error, nent;
81810c3d35cSrmind 
81910c3d35cSrmind 	nent = SCARG(uap, nent);
82010c3d35cSrmind 	if (nent <= 0 || nent > aio_listio_max)
821*34335fd2Sriastradh 		return SET_ERROR(EAGAIN);
82210c3d35cSrmind 
82310c3d35cSrmind 	if (SCARG(uap, timeout)) {
82410c3d35cSrmind 		/* Convert timespec to ticks */
82510c3d35cSrmind 		error = copyin(SCARG(uap, timeout), &ts,
82610c3d35cSrmind 		    sizeof(struct timespec));
82710c3d35cSrmind 		if (error)
82810c3d35cSrmind 			return error;
829461a86f9Schristos 	}
830b8ea6ca4Srmind 
831e8947292Syamt 	list = kmem_alloc(nent * sizeof(*list), KM_SLEEP);
832e8947292Syamt 	error = copyin(SCARG(uap, list), list, nent * sizeof(*list));
833461a86f9Schristos 	if (error)
834461a86f9Schristos 		goto out;
835461a86f9Schristos 	error = aio_suspend1(l, list, nent, SCARG(uap, timeout) ? &ts : NULL);
836461a86f9Schristos out:
837e8947292Syamt 	kmem_free(list, nent * sizeof(*list));
838461a86f9Schristos 	return error;
839461a86f9Schristos }
840461a86f9Schristos 
841461a86f9Schristos int
842461a86f9Schristos aio_suspend1(struct lwp *l, struct aiocb **aiocbp_list, int nent,
843461a86f9Schristos     struct timespec *ts)
844461a86f9Schristos {
845461a86f9Schristos 	struct proc *p = l->l_proc;
846461a86f9Schristos 	struct aioproc *aio;
847461a86f9Schristos 	struct aio_job *a_job;
848461a86f9Schristos 	int i, error, timo;
849461a86f9Schristos 
850461a86f9Schristos 	if (p->p_aio == NULL)
851*34335fd2Sriastradh 		return SET_ERROR(EAGAIN);
852461a86f9Schristos 	aio = p->p_aio;
853461a86f9Schristos 
854461a86f9Schristos 	if (ts) {
855461a86f9Schristos 		timo = mstohz((ts->tv_sec * 1000) + (ts->tv_nsec / 1000000));
856461a86f9Schristos 		if (timo == 0 && ts->tv_sec == 0 && ts->tv_nsec > 0)
85710c3d35cSrmind 			timo = 1;
85810c3d35cSrmind 		if (timo <= 0)
859*34335fd2Sriastradh 			return SET_ERROR(EAGAIN);
86010c3d35cSrmind 	} else
86110c3d35cSrmind 		timo = 0;
86210c3d35cSrmind 
86310c3d35cSrmind 	mutex_enter(&aio->aio_mtx);
86410c3d35cSrmind 	for (;;) {
86510c3d35cSrmind 		for (i = 0; i < nent; i++) {
86610c3d35cSrmind 
86710c3d35cSrmind 			/* Skip NULL entries */
86810c3d35cSrmind 			if (aiocbp_list[i] == NULL)
86910c3d35cSrmind 				continue;
87010c3d35cSrmind 
87110c3d35cSrmind 			/* Skip current job */
87210c3d35cSrmind 			if (aio->curjob) {
87310c3d35cSrmind 				a_job = aio->curjob;
87410c3d35cSrmind 				if (a_job->aiocb_uptr == aiocbp_list[i])
87510c3d35cSrmind 					continue;
87610c3d35cSrmind 			}
87710c3d35cSrmind 
87810c3d35cSrmind 			/* Look for a job in the queue */
87910c3d35cSrmind 			TAILQ_FOREACH(a_job, &aio->jobs_queue, list)
88010c3d35cSrmind 				if (a_job->aiocb_uptr == aiocbp_list[i])
88110c3d35cSrmind 					break;
88210c3d35cSrmind 
88310c3d35cSrmind 			if (a_job == NULL) {
88410c3d35cSrmind 				struct aiocb aiocbp;
88510c3d35cSrmind 
88610c3d35cSrmind 				mutex_exit(&aio->aio_mtx);
88710c3d35cSrmind 
888b8ea6ca4Srmind 				/* Check if the job is done. */
88910c3d35cSrmind 				error = copyin(aiocbp_list[i], &aiocbp,
89010c3d35cSrmind 				    sizeof(struct aiocb));
89110c3d35cSrmind 				if (error == 0 && aiocbp._state != JOB_DONE) {
89210c3d35cSrmind 					mutex_enter(&aio->aio_mtx);
89310c3d35cSrmind 					continue;
89410c3d35cSrmind 				}
89510c3d35cSrmind 				return error;
89610c3d35cSrmind 			}
89710c3d35cSrmind 		}
89810c3d35cSrmind 
89910c3d35cSrmind 		/* Wait for a signal or when timeout occurs */
90010c3d35cSrmind 		error = cv_timedwait_sig(&aio->done_cv, &aio->aio_mtx, timo);
90110c3d35cSrmind 		if (error) {
90210c3d35cSrmind 			if (error == EWOULDBLOCK)
903*34335fd2Sriastradh 				error = SET_ERROR(EAGAIN);
90410c3d35cSrmind 			break;
90510c3d35cSrmind 		}
90610c3d35cSrmind 	}
90710c3d35cSrmind 	mutex_exit(&aio->aio_mtx);
90810c3d35cSrmind 	return error;
90910c3d35cSrmind }
91010c3d35cSrmind 
91110c3d35cSrmind int
91229e552b0Syamt sys_aio_write(struct lwp *l, const struct sys_aio_write_args *uap,
91329e552b0Syamt     register_t *retval)
91410c3d35cSrmind {
9157e2790cfSdsl 	/* {
91610c3d35cSrmind 		syscallarg(struct aiocb *) aiocbp;
9177e2790cfSdsl 	} */
91810c3d35cSrmind 
91910c3d35cSrmind 	return aio_enqueue_job(AIO_WRITE, SCARG(uap, aiocbp), NULL);
92010c3d35cSrmind }
92110c3d35cSrmind 
92210c3d35cSrmind int
92329e552b0Syamt sys_lio_listio(struct lwp *l, const struct sys_lio_listio_args *uap,
92429e552b0Syamt     register_t *retval)
92510c3d35cSrmind {
9267e2790cfSdsl 	/* {
92710c3d35cSrmind 		syscallarg(int) mode;
92810c3d35cSrmind 		syscallarg(struct aiocb *const[]) list;
92910c3d35cSrmind 		syscallarg(int) nent;
93010c3d35cSrmind 		syscallarg(struct sigevent *) sig;
9317e2790cfSdsl 	} */
93210c3d35cSrmind 	struct proc *p = l->l_proc;
93310c3d35cSrmind 	struct aioproc *aio;
93410c3d35cSrmind 	struct aiocb **aiocbp_list;
93510c3d35cSrmind 	struct lio_req *lio;
93610c3d35cSrmind 	int i, error, errcnt, mode, nent;
93710c3d35cSrmind 
93810c3d35cSrmind 	mode = SCARG(uap, mode);
93910c3d35cSrmind 	nent = SCARG(uap, nent);
94010c3d35cSrmind 
9410e5c3c74Srmind 	/* Non-accurate checks for the limit and invalid values */
94210c3d35cSrmind 	if (nent < 1 || nent > aio_listio_max)
943*34335fd2Sriastradh 		return SET_ERROR(EINVAL);
9440e5c3c74Srmind 	if (aio_jobs_count + nent > aio_max)
945*34335fd2Sriastradh 		return SET_ERROR(EAGAIN);
94610c3d35cSrmind 
94710c3d35cSrmind 	/* Check if AIO structure is initialized, if not - initialize it */
94810c3d35cSrmind 	if (p->p_aio == NULL)
949e6a33851Sad 		if (aio_procinit(p))
950*34335fd2Sriastradh 			return SET_ERROR(EAGAIN);
95110c3d35cSrmind 	aio = p->p_aio;
95210c3d35cSrmind 
95310c3d35cSrmind 	/* Create a LIO structure */
9540a227b19Srmind 	lio = pool_get(&aio_lio_pool, PR_WAITOK);
9550a227b19Srmind 	lio->refcnt = 1;
9560a227b19Srmind 	error = 0;
9570a227b19Srmind 
9580a227b19Srmind 	switch (mode) {
9590a227b19Srmind 	case LIO_WAIT:
96010c3d35cSrmind 		memset(&lio->sig, 0, sizeof(struct sigevent));
9610a227b19Srmind 		break;
9620a227b19Srmind 	case LIO_NOWAIT:
9630a227b19Srmind 		/* Check for signal, validate it */
9640a227b19Srmind 		if (SCARG(uap, sig)) {
9650a227b19Srmind 			struct sigevent *sig = &lio->sig;
9660a227b19Srmind 
9670a227b19Srmind 			error = copyin(SCARG(uap, sig), &lio->sig,
9680a227b19Srmind 			    sizeof(struct sigevent));
9690a227b19Srmind 			if (error == 0 &&
9700a227b19Srmind 			    (sig->sigev_signo < 0 ||
9710a227b19Srmind 			    sig->sigev_signo >= NSIG ||
9720a227b19Srmind 			    sig->sigev_notify < SIGEV_NONE ||
9730a227b19Srmind 			    sig->sigev_notify > SIGEV_SA))
974*34335fd2Sriastradh 				error = SET_ERROR(EINVAL);
9750a227b19Srmind 		} else
9760a227b19Srmind 			memset(&lio->sig, 0, sizeof(struct sigevent));
9770a227b19Srmind 		break;
9780a227b19Srmind 	default:
979*34335fd2Sriastradh 		error = SET_ERROR(EINVAL);
9800a227b19Srmind 		break;
9810a227b19Srmind 	}
9820a227b19Srmind 
9830a227b19Srmind 	if (error != 0) {
9840a227b19Srmind 		pool_put(&aio_lio_pool, lio);
9850a227b19Srmind 		return error;
9860a227b19Srmind 	}
98710c3d35cSrmind 
98810c3d35cSrmind 	/* Get the list from user-space */
989e8947292Syamt 	aiocbp_list = kmem_alloc(nent * sizeof(*aiocbp_list), KM_SLEEP);
99010c3d35cSrmind 	error = copyin(SCARG(uap, list), aiocbp_list,
991e8947292Syamt 	    nent * sizeof(*aiocbp_list));
9920a227b19Srmind 	if (error) {
9930a227b19Srmind 		mutex_enter(&aio->aio_mtx);
99410c3d35cSrmind 		goto err;
9950a227b19Srmind 	}
99610c3d35cSrmind 
99710c3d35cSrmind 	/* Enqueue all jobs */
99810c3d35cSrmind 	errcnt = 0;
99910c3d35cSrmind 	for (i = 0; i < nent; i++) {
100010c3d35cSrmind 		error = aio_enqueue_job(AIO_LIO, aiocbp_list[i], lio);
100110c3d35cSrmind 		/*
100210c3d35cSrmind 		 * According to POSIX, in such error case it may
100310c3d35cSrmind 		 * fail with other I/O operations initiated.
100410c3d35cSrmind 		 */
100510c3d35cSrmind 		if (error)
100610c3d35cSrmind 			errcnt++;
100710c3d35cSrmind 	}
100810c3d35cSrmind 
10090a227b19Srmind 	mutex_enter(&aio->aio_mtx);
10100a227b19Srmind 
101110c3d35cSrmind 	/* Return an error, if any */
101210c3d35cSrmind 	if (errcnt) {
1013*34335fd2Sriastradh 		error = SET_ERROR(EIO);
101410c3d35cSrmind 		goto err;
101510c3d35cSrmind 	}
101610c3d35cSrmind 
101710c3d35cSrmind 	if (mode == LIO_WAIT) {
101810c3d35cSrmind 		/*
101910c3d35cSrmind 		 * Wait for AIO completion.  In such case,
102010c3d35cSrmind 		 * the LIO structure will be freed here.
102110c3d35cSrmind 		 */
10220a227b19Srmind 		while (lio->refcnt > 1 && error == 0)
102310c3d35cSrmind 			error = cv_wait_sig(&aio->done_cv, &aio->aio_mtx);
102410c3d35cSrmind 		if (error)
1025*34335fd2Sriastradh 			error = SET_ERROR(EINTR);
102610c3d35cSrmind 	}
102710c3d35cSrmind 
102810c3d35cSrmind err:
10290a227b19Srmind 	if (--lio->refcnt != 0)
10300a227b19Srmind 		lio = NULL;
10310a227b19Srmind 	mutex_exit(&aio->aio_mtx);
10320a227b19Srmind 	if (lio != NULL) {
10330a227b19Srmind 		aio_sendsig(p, &lio->sig);
10340a227b19Srmind 		pool_put(&aio_lio_pool, lio);
103510c3d35cSrmind 	}
1036e8947292Syamt 	kmem_free(aiocbp_list, nent * sizeof(*aiocbp_list));
103710c3d35cSrmind 	return error;
103810c3d35cSrmind }
103910c3d35cSrmind 
104010c3d35cSrmind /*
104110c3d35cSrmind  * SysCtl
104210c3d35cSrmind  */
104310c3d35cSrmind 
104410c3d35cSrmind static int
104510c3d35cSrmind sysctl_aio_listio_max(SYSCTLFN_ARGS)
104610c3d35cSrmind {
104710c3d35cSrmind 	struct sysctlnode node;
104810c3d35cSrmind 	int error, newsize;
104910c3d35cSrmind 
105010c3d35cSrmind 	node = *rnode;
105110c3d35cSrmind 	node.sysctl_data = &newsize;
105210c3d35cSrmind 
105310c3d35cSrmind 	newsize = aio_listio_max;
105410c3d35cSrmind 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
105510c3d35cSrmind 	if (error || newp == NULL)
105610c3d35cSrmind 		return error;
105710c3d35cSrmind 
105810c3d35cSrmind 	if (newsize < 1 || newsize > aio_max)
1059*34335fd2Sriastradh 		return SET_ERROR(EINVAL);
106010c3d35cSrmind 	aio_listio_max = newsize;
106110c3d35cSrmind 
106210c3d35cSrmind 	return 0;
106310c3d35cSrmind }
106410c3d35cSrmind 
106510c3d35cSrmind static int
106610c3d35cSrmind sysctl_aio_max(SYSCTLFN_ARGS)
106710c3d35cSrmind {
106810c3d35cSrmind 	struct sysctlnode node;
106910c3d35cSrmind 	int error, newsize;
107010c3d35cSrmind 
107110c3d35cSrmind 	node = *rnode;
107210c3d35cSrmind 	node.sysctl_data = &newsize;
107310c3d35cSrmind 
107410c3d35cSrmind 	newsize = aio_max;
107510c3d35cSrmind 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
107610c3d35cSrmind 	if (error || newp == NULL)
107710c3d35cSrmind 		return error;
107810c3d35cSrmind 
107910c3d35cSrmind 	if (newsize < 1 || newsize < aio_listio_max)
1080*34335fd2Sriastradh 		return SET_ERROR(EINVAL);
108110c3d35cSrmind 	aio_max = newsize;
108210c3d35cSrmind 
108310c3d35cSrmind 	return 0;
108410c3d35cSrmind }
108510c3d35cSrmind 
10869120d451Spgoyette SYSCTL_SETUP(sysctl_aio_init, "aio sysctl")
108710c3d35cSrmind {
108872795172Sjruoho 	int rv;
108910c3d35cSrmind 
10909120d451Spgoyette 	rv = sysctl_createv(clog, 0, NULL, NULL,
109110c3d35cSrmind 	    CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
109210c3d35cSrmind 	    CTLTYPE_INT, "posix_aio",
109310c3d35cSrmind 	    SYSCTL_DESCR("Version of IEEE Std 1003.1 and its "
109410c3d35cSrmind 		"Asynchronous I/O option to which the "
109510c3d35cSrmind 		"system attempts to conform"),
109610c3d35cSrmind 	    NULL, _POSIX_ASYNCHRONOUS_IO, NULL, 0,
109710c3d35cSrmind 	    CTL_KERN, CTL_CREATE, CTL_EOL);
109872795172Sjruoho 
109972795172Sjruoho 	if (rv != 0)
11009120d451Spgoyette 		return;
110172795172Sjruoho 
11029120d451Spgoyette 	rv = sysctl_createv(clog, 0, NULL, NULL,
110310c3d35cSrmind 	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
110410c3d35cSrmind 	    CTLTYPE_INT, "aio_listio_max",
110510c3d35cSrmind 	    SYSCTL_DESCR("Maximum number of asynchronous I/O "
110610c3d35cSrmind 		"operations in a single list I/O call"),
110710c3d35cSrmind 	    sysctl_aio_listio_max, 0, &aio_listio_max, 0,
110810c3d35cSrmind 	    CTL_KERN, CTL_CREATE, CTL_EOL);
110972795172Sjruoho 
111072795172Sjruoho 	if (rv != 0)
11119120d451Spgoyette 		return;
111272795172Sjruoho 
11139120d451Spgoyette 	rv = sysctl_createv(clog, 0, NULL, NULL,
111410c3d35cSrmind 	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
111510c3d35cSrmind 	    CTLTYPE_INT, "aio_max",
111610c3d35cSrmind 	    SYSCTL_DESCR("Maximum number of asynchronous I/O "
111710c3d35cSrmind 		"operations"),
111810c3d35cSrmind 	    sysctl_aio_max, 0, &aio_max, 0,
111910c3d35cSrmind 	    CTL_KERN, CTL_CREATE, CTL_EOL);
112072795172Sjruoho 
11219120d451Spgoyette 	return;
112210c3d35cSrmind }
112310c3d35cSrmind 
112410c3d35cSrmind /*
112510c3d35cSrmind  * Debugging
112610c3d35cSrmind  */
112710c3d35cSrmind #if defined(DDB)
112810c3d35cSrmind void
112910c3d35cSrmind aio_print_jobs(void (*pr)(const char *, ...))
113010c3d35cSrmind {
11314b44bf46Smatt 	struct proc *p = curlwp->l_proc;
113210c3d35cSrmind 	struct aioproc *aio;
113310c3d35cSrmind 	struct aio_job *a_job;
113410c3d35cSrmind 	struct aiocb *aiocbp;
113510c3d35cSrmind 
113610c3d35cSrmind 	if (p == NULL) {
113710c3d35cSrmind 		(*pr)("AIO: We are not in the processes right now.\n");
113810c3d35cSrmind 		return;
113910c3d35cSrmind 	}
114010c3d35cSrmind 
114110c3d35cSrmind 	aio = p->p_aio;
114210c3d35cSrmind 	if (aio == NULL) {
114310c3d35cSrmind 		(*pr)("AIO data is not initialized (PID = %d).\n", p->p_pid);
114410c3d35cSrmind 		return;
114510c3d35cSrmind 	}
114610c3d35cSrmind 
114710c3d35cSrmind 	(*pr)("AIO: PID = %d\n", p->p_pid);
114810c3d35cSrmind 	(*pr)("AIO: Global count of the jobs = %u\n", aio_jobs_count);
114910c3d35cSrmind 	(*pr)("AIO: Count of the jobs = %u\n", aio->jobs_count);
115010c3d35cSrmind 
115110c3d35cSrmind 	if (aio->curjob) {
115210c3d35cSrmind 		a_job = aio->curjob;
115310c3d35cSrmind 		(*pr)("\nAIO current job:\n");
115410c3d35cSrmind 		(*pr)(" opcode = %d, errno = %d, state = %d, aiocb_ptr = %p\n",
115510c3d35cSrmind 		    a_job->aio_op, a_job->aiocbp._errno,
115610c3d35cSrmind 		    a_job->aiocbp._state, a_job->aiocb_uptr);
115710c3d35cSrmind 		aiocbp = &a_job->aiocbp;
115810c3d35cSrmind 		(*pr)("   fd = %d, offset = %u, buf = %p, nbytes = %u\n",
115910c3d35cSrmind 		    aiocbp->aio_fildes, aiocbp->aio_offset,
116010c3d35cSrmind 		    aiocbp->aio_buf, aiocbp->aio_nbytes);
116110c3d35cSrmind 	}
116210c3d35cSrmind 
116310c3d35cSrmind 	(*pr)("\nAIO queue:\n");
116410c3d35cSrmind 	TAILQ_FOREACH(a_job, &aio->jobs_queue, list) {
116510c3d35cSrmind 		(*pr)(" opcode = %d, errno = %d, state = %d, aiocb_ptr = %p\n",
116610c3d35cSrmind 		    a_job->aio_op, a_job->aiocbp._errno,
116710c3d35cSrmind 		    a_job->aiocbp._state, a_job->aiocb_uptr);
116810c3d35cSrmind 		aiocbp = &a_job->aiocbp;
116910c3d35cSrmind 		(*pr)("   fd = %d, offset = %u, buf = %p, nbytes = %u\n",
117010c3d35cSrmind 		    aiocbp->aio_fildes, aiocbp->aio_offset,
117110c3d35cSrmind 		    aiocbp->aio_buf, aiocbp->aio_nbytes);
117210c3d35cSrmind 	}
117310c3d35cSrmind }
117410c3d35cSrmind #endif /* defined(DDB) */
1175