/*	$NetBSD: sys_aio.c,v 1.19 2008/05/26 17:45:51 rmind Exp $	*/

/*
 * Copyright (c) 2007, Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Implementation of POSIX asynchronous I/O.
 * Defined in the Base Definitions volume of IEEE Std 1003.1-2001.
 */
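
/*
 * Illustrative userland usage of this interface (a sketch, not part of
 * this file): enqueue a read with aio_read(2), poll aio_error(2) for
 * completion, then collect the result with aio_return(2).  The
 * descriptor "fd" and the buffer "buf" are assumed to exist in the
 * caller.
 *
 *	struct aiocb cb;
 *
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *	if (aio_read(&cb) == -1)
 *		err(EXIT_FAILURE, "aio_read");
 *	while (aio_error(&cb) == EINPROGRESS)
 *		sched_yield();	(or block in aio_suspend(2))
 *	ssize_t nread = aio_return(&cb);
 */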

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_aio.c,v 1.19 2008/05/26 17:45:51 rmind Exp $");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/condvar.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/atomic.h>

#include <uvm/uvm_extern.h>

/*
 * System-wide limits and counter of AIO operations.
 */
static u_int aio_listio_max = AIO_LISTIO_MAX;
static u_int aio_max = AIO_MAX;
static u_int aio_jobs_count;

static struct pool aio_job_pool;
static struct pool aio_lio_pool;

/* Prototypes */
void aio_worker(void *);
static void aio_process(struct aio_job *);
static void aio_sendsig(struct proc *, struct sigevent *);
static int aio_enqueue_job(int, void *, struct lio_req *);

/*
 * Initialize the AIO system.
 */
void
aio_sysinit(void)
{

	pool_init(&aio_job_pool, sizeof(struct aio_job), 0, 0, 0,
	    "aio_jobs_pool", &pool_allocator_nointr, IPL_NONE);
	pool_init(&aio_lio_pool, sizeof(struct lio_req), 0, 0, 0,
	    "aio_lio_pool", &pool_allocator_nointr, IPL_NONE);
}

/*
 * Initialize the Asynchronous I/O data structures for the process.
 */
int
aio_init(struct proc *p)
{
	struct aioproc *aio;
	struct lwp *l;
	int error;
	bool inmem;
	vaddr_t uaddr;

	/* Allocate and initialize the AIO structure */
	aio = kmem_zalloc(sizeof(struct aioproc), KM_SLEEP);
	if (aio == NULL)
		return EAGAIN;

	/* Initialize the queue and its synchronization structures */
	mutex_init(&aio->aio_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&aio->aio_worker_cv, "aiowork");
	cv_init(&aio->done_cv, "aiodone");
	TAILQ_INIT(&aio->jobs_queue);

	/*
	 * Create an AIO worker thread.
	 * XXX: Currently, the AIO thread is not protected against
	 * the user's actions.
	 */
	inmem = uvm_uarea_alloc(&uaddr);
	if (uaddr == 0) {
		aio_exit(p, aio);
		return EAGAIN;
	}
	error = lwp_create(curlwp, p, uaddr, inmem, 0, NULL, 0, aio_worker,
	    NULL, &l, curlwp->l_class);
	if (error != 0) {
		uvm_uarea_free(uaddr, curcpu());
		aio_exit(p, aio);
		return error;
	}

	/* Recheck, with the lock held, that we really are the first */
	mutex_enter(p->p_lock);
	if (p->p_aio) {
		mutex_exit(p->p_lock);
		aio_exit(p, aio);
		lwp_exit(l);
		return 0;
	}
	p->p_aio = aio;

	/* Complete the initialization of the thread, and run it */
	aio->aio_worker = l;
	p->p_nrlwps++;
	lwp_lock(l);
	l->l_stat = LSRUN;
	l->l_priority = MAXPRI_USER;
	sched_enqueue(l, false);
	lwp_unlock(l);
	mutex_exit(p->p_lock);

	return 0;
}

/*
 * Tear down the Asynchronous I/O state of the process.
 */
void
aio_exit(struct proc *p, struct aioproc *aio)
{
	struct aio_job *a_job;

	if (aio == NULL)
		return;

	/* Drain the AIO job queue */
	while (!TAILQ_EMPTY(&aio->jobs_queue)) {
		a_job = TAILQ_FIRST(&aio->jobs_queue);
		TAILQ_REMOVE(&aio->jobs_queue, a_job, list);
		pool_put(&aio_job_pool, a_job);
		atomic_dec_uint(&aio_jobs_count);
	}

	/* Destroy and free the entire AIO data structure */
	cv_destroy(&aio->aio_worker_cv);
	cv_destroy(&aio->done_cv);
	mutex_destroy(&aio->aio_mtx);
	kmem_free(aio, sizeof(struct aioproc));
}

/*
 * AIO worker thread and processor.
 */
void
aio_worker(void *arg)
{
	struct proc *p = curlwp->l_proc;
	struct aioproc *aio = p->p_aio;
	struct aio_job *a_job;
	struct lio_req *lio;
	sigset_t oss, nss;
	int error, refcnt;

	/*
	 * Block all signals, so that the thread is interrupted
	 * only by SIGKILL and SIGSTOP, which cannot be masked.
	 */
	sigfillset(&nss);
	mutex_enter(p->p_lock);
	error = sigprocmask1(curlwp, SIG_SETMASK, &nss, &oss);
	mutex_exit(p->p_lock);
	KASSERT(error == 0);

	for (;;) {
		/*
		 * Loop for each job in the queue.  If there
		 * are no jobs then sleep.
		 */
		mutex_enter(&aio->aio_mtx);
		while ((a_job = TAILQ_FIRST(&aio->jobs_queue)) == NULL) {
			if (cv_wait_sig(&aio->aio_worker_cv, &aio->aio_mtx)) {
				/*
				 * Thread was interrupted - check for
				 * pending exit or suspend.
				 */
				mutex_exit(&aio->aio_mtx);
				lwp_userret(curlwp);
				mutex_enter(&aio->aio_mtx);
			}
		}

		/* Take the job from the queue */
		aio->curjob = a_job;
		TAILQ_REMOVE(&aio->jobs_queue, a_job, list);

		atomic_dec_uint(&aio_jobs_count);
		aio->jobs_count--;

		mutex_exit(&aio->aio_mtx);

		/* Process an AIO operation */
		aio_process(a_job);

		/* Copy the data structure back to the user-space */
		(void)copyout(&a_job->aiocbp, a_job->aiocb_uptr,
		    sizeof(struct aiocb));

		mutex_enter(&aio->aio_mtx);
		aio->curjob = NULL;

		/* Decrease the reference counter, if there is a LIO structure */
		lio = a_job->lio;
		refcnt = (lio != NULL ? --lio->refcnt : -1);

		/* Notify all suspenders */
		cv_broadcast(&aio->done_cv);
		mutex_exit(&aio->aio_mtx);

		/* Send a signal, if any */
		aio_sendsig(p, &a_job->aiocbp.aio_sigevent);

		/* Destroy the LIO structure */
		if (refcnt == 0) {
			aio_sendsig(p, &lio->sig);
			pool_put(&aio_lio_pool, lio);
		}

		/* Destroy the job */
		pool_put(&aio_job_pool, a_job);
	}

	/* NOTREACHED */
}

static void
aio_process(struct aio_job *a_job)
{
	struct proc *p = curlwp->l_proc;
	struct aiocb *aiocbp = &a_job->aiocbp;
	struct file *fp;
	int fd = aiocbp->aio_fildes;
	int error = 0;

	KASSERT(a_job->aio_op != 0);

	if ((a_job->aio_op & (AIO_READ | AIO_WRITE)) != 0) {
		struct iovec aiov;
		struct uio auio;

		if (aiocbp->aio_nbytes > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}

		fp = fd_getfile(fd);
		if (fp == NULL) {
			error = EBADF;
			goto done;
		}

		/* Strip the qualifiers from aio_buf for the I/O vector */
		aiov.iov_base = (void *)(uintptr_t)aiocbp->aio_buf;
		aiov.iov_len = aiocbp->aio_nbytes;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_resid = aiocbp->aio_nbytes;
		auio.uio_vmspace = p->p_vmspace;

		if (a_job->aio_op & AIO_READ) {
			/*
			 * Perform a Read operation
			 */
			KASSERT((a_job->aio_op & AIO_WRITE) == 0);

			if ((fp->f_flag & FREAD) == 0) {
				fd_putfile(fd);
				error = EBADF;
				goto done;
			}
			auio.uio_rw = UIO_READ;
			error = (*fp->f_ops->fo_read)(fp, &aiocbp->aio_offset,
			    &auio, fp->f_cred, FOF_UPDATE_OFFSET);
		} else {
			/*
			 * Perform a Write operation
			 */
			KASSERT(a_job->aio_op & AIO_WRITE);

			if ((fp->f_flag & FWRITE) == 0) {
				fd_putfile(fd);
				error = EBADF;
				goto done;
			}
			auio.uio_rw = UIO_WRITE;
			error = (*fp->f_ops->fo_write)(fp, &aiocbp->aio_offset,
			    &auio, fp->f_cred, FOF_UPDATE_OFFSET);
		}
		fd_putfile(fd);

		/* Store the result value */
		a_job->aiocbp.aio_nbytes -= auio.uio_resid;
		a_job->aiocbp._retval = (error == 0) ?
		    a_job->aiocbp.aio_nbytes : -1;

	} else if ((a_job->aio_op & (AIO_SYNC | AIO_DSYNC)) != 0) {
		/*
		 * Perform a file Sync operation
		 */
		struct vnode *vp;

		if ((error = fd_getvnode(fd, &fp)) != 0)
			goto done;

		if ((fp->f_flag & FWRITE) == 0) {
			fd_putfile(fd);
			error = EBADF;
			goto done;
		}

		vp = (struct vnode *)fp->f_data;
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		if (a_job->aio_op & AIO_DSYNC) {
			error = VOP_FSYNC(vp, fp->f_cred,
			    FSYNC_WAIT | FSYNC_DATAONLY, 0, 0);
		} else if (a_job->aio_op & AIO_SYNC) {
			error = VOP_FSYNC(vp, fp->f_cred,
			    FSYNC_WAIT, 0, 0);
			if (error == 0 && bioopsp != NULL &&
			    vp->v_mount &&
			    (vp->v_mount->mnt_flag & MNT_SOFTDEP))
			    bioopsp->io_fsync(vp, 0);
		}
		VOP_UNLOCK(vp, 0);
		fd_putfile(fd);

		/* Store the result value */
		a_job->aiocbp._retval = (error == 0) ? 0 : -1;

	} else
		panic("aio_process: invalid operation code");

done:
	/* The job is done: set the error, if any */
	a_job->aiocbp._errno = error;
	a_job->aiocbp._state = JOB_DONE;
}

/*
 * Send AIO signal.
 */
static void
aio_sendsig(struct proc *p, struct sigevent *sig)
{
	ksiginfo_t ksi;

	if (sig->sigev_signo == 0 || sig->sigev_notify == SIGEV_NONE)
		return;

	KSI_INIT(&ksi);
	ksi.ksi_signo = sig->sigev_signo;
	ksi.ksi_code = SI_ASYNCIO;
	ksi.ksi_value = sig->sigev_value;
	mutex_enter(proc_lock);
	kpsignal(p, &ksi, NULL);
	mutex_exit(proc_lock);
}

/*
 * Enqueue the job.
 */
static int
aio_enqueue_job(int op, void *aiocb_uptr, struct lio_req *lio)
{
	struct proc *p = curlwp->l_proc;
	struct aioproc *aio;
	struct aio_job *a_job;
	struct aiocb aiocbp;
	struct sigevent *sig;
	int error;

	/* Imprecise, unlocked check of the limit (rechecked under the lock) */
	if (aio_jobs_count + 1 > aio_max)
		return EAGAIN;

	/* Get the data structure from user-space */
	error = copyin(aiocb_uptr, &aiocbp, sizeof(struct aiocb));
	if (error)
		return error;

	/* Check if signal is set, and validate it */
	sig = &aiocbp.aio_sigevent;
	if (sig->sigev_signo < 0 || sig->sigev_signo >= NSIG ||
	    sig->sigev_notify < SIGEV_NONE || sig->sigev_notify > SIGEV_SA)
		return EINVAL;

	/* Buffer and byte count */
	if (((AIO_SYNC | AIO_DSYNC) & op) == 0)
		if (aiocbp.aio_buf == NULL || aiocbp.aio_nbytes > SSIZE_MAX)
			return EINVAL;

	/* Check the opcode; LIO_NOP is simply ignored */
	if (op == AIO_LIO) {
		KASSERT(lio != NULL);
		if (aiocbp.aio_lio_opcode == LIO_WRITE)
			op = AIO_WRITE;
		else if (aiocbp.aio_lio_opcode == LIO_READ)
			op = AIO_READ;
		else
			return (aiocbp.aio_lio_opcode == LIO_NOP) ? 0 : EINVAL;
	} else {
		KASSERT(lio == NULL);
	}

	/*
	 * Look for an already existing job.  If found, the job is still
	 * in progress.  According to POSIX this is invalid, so return
	 * the error.
	 */
	aio = p->p_aio;
	if (aio) {
		mutex_enter(&aio->aio_mtx);
		if (aio->curjob) {
			a_job = aio->curjob;
			if (a_job->aiocb_uptr == aiocb_uptr) {
				mutex_exit(&aio->aio_mtx);
				return EINVAL;
			}
		}
		TAILQ_FOREACH(a_job, &aio->jobs_queue, list) {
			if (a_job->aiocb_uptr != aiocb_uptr)
				continue;
			mutex_exit(&aio->aio_mtx);
			return EINVAL;
		}
		mutex_exit(&aio->aio_mtx);
	}

	/*
	 * Check if the AIO structure is initialized; if not, initialize
	 * it.  In the LIO case this was done already.  aio_init()
	 * rechecks this with the lock held.
	 */
	if (lio == NULL && p->p_aio == NULL)
		if (aio_init(p))
			return EAGAIN;
	aio = p->p_aio;

	/*
	 * Set the state with the errno, and copy the data
	 * structure back to the user-space.
	 */
	aiocbp._state = JOB_WIP;
	aiocbp._errno = EINPROGRESS;
	aiocbp._retval = -1;
	error = copyout(&aiocbp, aiocb_uptr, sizeof(struct aiocb));
	if (error)
		return error;

	/* Allocate and initialize a new AIO job */
	a_job = pool_get(&aio_job_pool, PR_WAITOK);
	memset(a_job, 0, sizeof(struct aio_job));

	/*
	 * Set the data.
	 * Store the user-space pointer for searching.  Since only
	 * per-process pointers are stored, this is safe.
	 */
	memcpy(&a_job->aiocbp, &aiocbp, sizeof(struct aiocb));
	a_job->aiocb_uptr = aiocb_uptr;
	a_job->aio_op |= op;
	a_job->lio = lio;

	/*
	 * Add the job to the queue, update the counters, and
	 * notify the AIO worker thread to handle the job.
	 */
	mutex_enter(&aio->aio_mtx);

	/* Fail if a limit was reached; undo the optimistic increment */
	if (atomic_inc_uint_nv(&aio_jobs_count) > aio_max ||
	    aio->jobs_count >= aio_listio_max) {
		atomic_dec_uint(&aio_jobs_count);
		mutex_exit(&aio->aio_mtx);
		pool_put(&aio_job_pool, a_job);
		return EAGAIN;
	}

	TAILQ_INSERT_TAIL(&aio->jobs_queue, a_job, list);
	aio->jobs_count++;
	if (lio)
		lio->refcnt++;
	cv_signal(&aio->aio_worker_cv);

	mutex_exit(&aio->aio_mtx);

	/*
	 * Errors are reported only via the aio_error() function;
	 * this is the behaviour POSIX prescribes.
	 */
	return 0;
}

/*
 * Syscall functions.
 */

int
sys_aio_cancel(struct lwp *l, const struct sys_aio_cancel_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) fildes;
		syscallarg(struct aiocb *) aiocbp;
	} */
	struct proc *p = l->l_proc;
	struct aioproc *aio;
	struct aio_job *a_job;
	struct aiocb *aiocbp_ptr;
	struct lio_req *lio;
	struct filedesc	*fdp = p->p_fd;
	unsigned int cn, errcnt, fildes;

	TAILQ_HEAD(, aio_job) tmp_jobs_list;

	/* Check for an invalid file descriptor */
	fildes = (unsigned int)SCARG(uap, fildes);
	if (fildes >= fdp->fd_nfiles)
		return EBADF;
	/* Read the table only after fd_nfiles (producer-side barrier pairs) */
	membar_consumer();
	if (fdp->fd_ofiles[fildes] == NULL ||
	    fdp->fd_ofiles[fildes]->ff_file == NULL)
		return EBADF;

	/* Check if the AIO structure is initialized */
	if (p->p_aio == NULL) {
		*retval = AIO_NOTCANCELED;
		return 0;
	}

	aio = p->p_aio;
	aiocbp_ptr = (struct aiocb *)SCARG(uap, aiocbp);

	mutex_enter(&aio->aio_mtx);

	/* Cancel the jobs, and remove them from the queue */
	cn = 0;
	TAILQ_INIT(&tmp_jobs_list);
	TAILQ_FOREACH(a_job, &aio->jobs_queue, list) {
		if (aiocbp_ptr) {
			if (aiocbp_ptr != a_job->aiocb_uptr)
				continue;
			if (fildes != a_job->aiocbp.aio_fildes) {
				mutex_exit(&aio->aio_mtx);
				return EBADF;
			}
		} else if (a_job->aiocbp.aio_fildes != fildes)
			continue;

		TAILQ_REMOVE(&aio->jobs_queue, a_job, list);
		TAILQ_INSERT_TAIL(&tmp_jobs_list, a_job, list);

		/* Decrease the counters */
		atomic_dec_uint(&aio_jobs_count);
		aio->jobs_count--;
		lio = a_job->lio;
		if (lio != NULL && --lio->refcnt != 0)
			a_job->lio = NULL;

		cn++;
		if (aiocbp_ptr)
			break;
	}

	/* At least one job was canceled */
	if (cn)
		*retval = AIO_CANCELED;

	/* We cannot cancel the job currently being processed */
	a_job = aio->curjob;
	if (a_job && ((a_job->aiocbp.aio_fildes == fildes) ||
	    (a_job->aiocb_uptr == aiocbp_ptr)))
		*retval = AIO_NOTCANCELED;

	mutex_exit(&aio->aio_mtx);

	/* Free the jobs after dropping the lock */
	errcnt = 0;
	while (!TAILQ_EMPTY(&tmp_jobs_list)) {
		a_job = TAILQ_FIRST(&tmp_jobs_list);
		TAILQ_REMOVE(&tmp_jobs_list, a_job, list);
		/* Set the errno and copy the structure back to user-space */
		a_job->aiocbp._errno = ECANCELED;
		a_job->aiocbp._state = JOB_DONE;
		if (copyout(&a_job->aiocbp, a_job->aiocb_uptr,
		    sizeof(struct aiocb)))
			errcnt++;
		/* Send a signal, if any */
		aio_sendsig(p, &a_job->aiocbp.aio_sigevent);
		if (a_job->lio) {
			lio = a_job->lio;
			aio_sendsig(p, &lio->sig);
			pool_put(&aio_lio_pool, lio);
		}
		pool_put(&aio_job_pool, a_job);
	}

	if (errcnt)
		return EFAULT;

	/* Set the correct return value */
	if (*retval == 0)
		*retval = AIO_ALLDONE;

	return 0;
}
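
/*
 * Illustrative userland view of the cancel interface (a sketch, not
 * part of this file; "fd" and "cb" as in the example near the top):
 *
 *	switch (aio_cancel(fd, &cb)) {
 *	case AIO_CANCELED:
 *		(removed from the queue; aio_error(&cb) now gives ECANCELED)
 *		break;
 *	case AIO_NOTCANCELED:
 *		(the job is being processed; keep polling aio_error())
 *		break;
 *	case AIO_ALLDONE:
 *		(the job had already completed)
 *		break;
 *	}
 */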

int
sys_aio_error(struct lwp *l, const struct sys_aio_error_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(const struct aiocb *) aiocbp;
	} */
	struct proc *p = l->l_proc;
	struct aioproc *aio = p->p_aio;
	struct aiocb aiocbp;
	int error;

	if (aio == NULL)
		return EINVAL;

	error = copyin(SCARG(uap, aiocbp), &aiocbp, sizeof(struct aiocb));
	if (error)
		return error;

	if (aiocbp._state == JOB_NONE)
		return EINVAL;

	*retval = aiocbp._errno;

	return 0;
}

int
sys_aio_fsync(struct lwp *l, const struct sys_aio_fsync_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) op;
		syscallarg(struct aiocb *) aiocbp;
	} */
	int op = SCARG(uap, op);

	if ((op != O_DSYNC) && (op != O_SYNC))
		return EINVAL;

	/* Map the open(2) flag to the internal operation code */
	op = (op == O_DSYNC) ? AIO_DSYNC : AIO_SYNC;

	return aio_enqueue_job(op, SCARG(uap, aiocbp), NULL);
}

int
sys_aio_read(struct lwp *l, const struct sys_aio_read_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(struct aiocb *) aiocbp;
	} */

	return aio_enqueue_job(AIO_READ, SCARG(uap, aiocbp), NULL);
}

int
sys_aio_return(struct lwp *l, const struct sys_aio_return_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(struct aiocb *) aiocbp;
	} */
	struct proc *p = l->l_proc;
	struct aioproc *aio = p->p_aio;
	struct aiocb aiocbp;
	int error;

	if (aio == NULL)
		return EINVAL;

	error = copyin(SCARG(uap, aiocbp), &aiocbp, sizeof(struct aiocb));
	if (error)
		return error;

	if (aiocbp._errno == EINPROGRESS || aiocbp._state != JOB_DONE)
		return EINVAL;

	*retval = aiocbp._retval;

	/* Reset the internal variables */
	aiocbp._errno = 0;
	aiocbp._retval = -1;
	aiocbp._state = JOB_NONE;
	error = copyout(&aiocbp, SCARG(uap, aiocbp), sizeof(struct aiocb));

	return error;
}

int
sys_aio_suspend(struct lwp *l, const struct sys_aio_suspend_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(const struct aiocb *const[]) list;
		syscallarg(int) nent;
		syscallarg(const struct timespec *) timeout;
	} */
	struct proc *p = l->l_proc;
	struct aioproc *aio;
	struct aio_job *a_job;
	struct aiocb **aiocbp_list;
	struct timespec ts;
	int i, error, nent, timo;

	if (p->p_aio == NULL)
		return EAGAIN;
	aio = p->p_aio;

	nent = SCARG(uap, nent);
	if (nent <= 0 || nent > aio_listio_max)
		return EAGAIN;

	if (SCARG(uap, timeout)) {
		/* Convert timespec to ticks */
		error = copyin(SCARG(uap, timeout), &ts,
		    sizeof(struct timespec));
		if (error)
			return error;
		timo = mstohz((ts.tv_sec * 1000) + (ts.tv_nsec / 1000000));
		if (timo == 0 && ts.tv_sec == 0 && ts.tv_nsec > 0)
			timo = 1;
		if (timo <= 0)
			return EAGAIN;
	} else
		timo = 0;

	/* Get the list of pointers from user-space */
	aiocbp_list = kmem_zalloc(nent * sizeof(struct aiocb *), KM_SLEEP);
	error = copyin(SCARG(uap, list), aiocbp_list,
	    nent * sizeof(struct aiocb *));
	if (error) {
		kmem_free(aiocbp_list, nent * sizeof(struct aiocb *));
		return error;
	}

	mutex_enter(&aio->aio_mtx);
	for (;;) {

		for (i = 0; i < nent; i++) {

			/* Skip NULL entries */
			if (aiocbp_list[i] == NULL)
				continue;

			/* Skip the current job */
			if (aio->curjob) {
				a_job = aio->curjob;
				if (a_job->aiocb_uptr == aiocbp_list[i])
					continue;
			}

			/* Look for a job in the queue */
			TAILQ_FOREACH(a_job, &aio->jobs_queue, list)
				if (a_job->aiocb_uptr == aiocbp_list[i])
					break;

			if (a_job == NULL) {
				struct aiocb aiocbp;

				mutex_exit(&aio->aio_mtx);

				error = copyin(aiocbp_list[i], &aiocbp,
				    sizeof(struct aiocb));
				if (error == 0 && aiocbp._state != JOB_DONE) {
					mutex_enter(&aio->aio_mtx);
					continue;
				}

				kmem_free(aiocbp_list,
				    nent * sizeof(struct aiocb *));
				return error;
			}
		}

		/* Wait for a signal, or until the timeout expires */
		error = cv_timedwait_sig(&aio->done_cv, &aio->aio_mtx, timo);
		if (error) {
			if (error == EWOULDBLOCK)
				error = EAGAIN;
			break;
		}
	}
	mutex_exit(&aio->aio_mtx);

	kmem_free(aiocbp_list, nent * sizeof(struct aiocb *));
	return error;
}

int
sys_aio_write(struct lwp *l, const struct sys_aio_write_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(struct aiocb *) aiocbp;
	} */

	return aio_enqueue_job(AIO_WRITE, SCARG(uap, aiocbp), NULL);
}

int
sys_lio_listio(struct lwp *l, const struct sys_lio_listio_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) mode;
		syscallarg(struct aiocb *const[]) list;
		syscallarg(int) nent;
		syscallarg(struct sigevent *) sig;
	} */
	struct proc *p = l->l_proc;
	struct aioproc *aio;
	struct aiocb **aiocbp_list;
	struct lio_req *lio;
	int i, error, errcnt, mode, nent;

	mode = SCARG(uap, mode);
	nent = SCARG(uap, nent);

	/* Imprecise, unlocked checks of the limits and of invalid values */
	if (nent < 1 || nent > aio_listio_max)
		return EINVAL;
	if (aio_jobs_count + nent > aio_max)
		return EAGAIN;

	/* Check if the AIO structure is initialized; if not, initialize it */
	if (p->p_aio == NULL)
		if (aio_init(p))
			return EAGAIN;
	aio = p->p_aio;

	/* Create a LIO structure */
	lio = pool_get(&aio_lio_pool, PR_WAITOK);
	lio->refcnt = 1;
	error = 0;

	switch (mode) {
	case LIO_WAIT:
		memset(&lio->sig, 0, sizeof(struct sigevent));
		break;
	case LIO_NOWAIT:
		/* Check for a signal, and validate it */
		if (SCARG(uap, sig)) {
			struct sigevent *sig = &lio->sig;

			error = copyin(SCARG(uap, sig), &lio->sig,
			    sizeof(struct sigevent));
			if (error == 0 &&
			    (sig->sigev_signo < 0 ||
			    sig->sigev_signo >= NSIG ||
			    sig->sigev_notify < SIGEV_NONE ||
			    sig->sigev_notify > SIGEV_SA))
				error = EINVAL;
		} else
			memset(&lio->sig, 0, sizeof(struct sigevent));
		break;
	default:
		error = EINVAL;
		break;
	}

	if (error != 0) {
		pool_put(&aio_lio_pool, lio);
		return error;
	}

	/* Get the list of pointers from user-space */
	aiocbp_list = kmem_zalloc(nent * sizeof(struct aiocb *), KM_SLEEP);
	error = copyin(SCARG(uap, list), aiocbp_list,
	    nent * sizeof(struct aiocb *));
	if (error) {
		mutex_enter(&aio->aio_mtx);
		goto err;
	}

	/* Enqueue all jobs */
	errcnt = 0;
	for (i = 0; i < nent; i++) {
		error = aio_enqueue_job(AIO_LIO, aiocbp_list[i], lio);
		/*
		 * According to POSIX, the call may fail in this error
		 * case even though some of the other I/O operations
		 * were already initiated.
		 */
		if (error)
			errcnt++;
	}

	mutex_enter(&aio->aio_mtx);

	/* Return an error, if any */
	if (errcnt) {
		error = EIO;
		goto err;
	}

	if (mode == LIO_WAIT) {
		/*
		 * Wait for AIO completion.  In this case,
		 * the LIO structure will be freed here.
		 */
		while (lio->refcnt > 1 && error == 0)
			error = cv_wait_sig(&aio->done_cv, &aio->aio_mtx);
		if (error)
			error = EINTR;
	}

err:
	if (--lio->refcnt != 0)
		lio = NULL;
	mutex_exit(&aio->aio_mtx);
	if (lio != NULL) {
		aio_sendsig(p, &lio->sig);
		pool_put(&aio_lio_pool, lio);
	}
	kmem_free(aiocbp_list, nent * sizeof(struct aiocb *));
	return error;
}
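
/*
 * Illustrative userland usage of lio_listio(2) against this
 * implementation (a sketch, not part of this file; "fd", "buf0" and
 * "buf1" are assumed to exist in the caller):
 *
 *	struct aiocb cb[2], *list[2] = { &cb[0], &cb[1] };
 *
 *	memset(cb, 0, sizeof(cb));
 *	cb[0].aio_fildes = fd;
 *	cb[0].aio_buf = buf0;
 *	cb[0].aio_nbytes = sizeof(buf0);
 *	cb[0].aio_offset = 0;
 *	cb[0].aio_lio_opcode = LIO_READ;
 *	cb[1] = cb[0];
 *	cb[1].aio_buf = buf1;
 *	cb[1].aio_offset = sizeof(buf0);
 *	if (lio_listio(LIO_WAIT, list, 2, NULL) == -1)
 *		err(EXIT_FAILURE, "lio_listio");
 */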

/*
 * SysCtl
 */

static int
sysctl_aio_listio_max(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int error, newsize;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = aio_listio_max;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	if (newsize < 1 || newsize > aio_max)
		return EINVAL;
	aio_listio_max = newsize;

	return 0;
}

static int
sysctl_aio_max(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int error, newsize;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = aio_max;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	if (newsize < 1 || newsize < aio_listio_max)
		return EINVAL;
	aio_max = newsize;

	return 0;
}

SYSCTL_SETUP(sysctl_aio_setup, "sysctl aio setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "kern", NULL,
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
		CTLTYPE_INT, "posix_aio",
		SYSCTL_DESCR("Version of IEEE Std 1003.1 and its "
			     "Asynchronous I/O option to which the "
			     "system attempts to conform"),
		NULL, _POSIX_ASYNCHRONOUS_IO, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "aio_listio_max",
		SYSCTL_DESCR("Maximum number of asynchronous I/O "
			     "operations in a single list I/O call"),
		sysctl_aio_listio_max, 0, &aio_listio_max, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "aio_max",
		SYSCTL_DESCR("Maximum number of asynchronous I/O "
			     "operations"),
		sysctl_aio_max, 0, &aio_max, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);
}
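
/*
 * Example (illustrative, not part of this file): the tunables created
 * above can be inspected and adjusted from userland with sysctl(8):
 *
 *	$ sysctl kern.aio_max kern.aio_listio_max
 *	# sysctl -w kern.aio_listio_max=64
 */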

/*
 * Debugging
 */
#if defined(DDB)
void
aio_print_jobs(void (*pr)(const char *, ...))
{
	struct proc *p = (curlwp == NULL ? NULL : curlwp->l_proc);
	struct aioproc *aio;
	struct aio_job *a_job;
	struct aiocb *aiocbp;

	if (p == NULL) {
		(*pr)("AIO: we are not in the context of a process now.\n");
		return;
	}

	aio = p->p_aio;
	if (aio == NULL) {
		(*pr)("AIO data is not initialized (PID = %d).\n", p->p_pid);
		return;
	}

	(*pr)("AIO: PID = %d\n", p->p_pid);
	(*pr)("AIO: Global count of the jobs = %u\n", aio_jobs_count);
	(*pr)("AIO: Count of the jobs = %u\n", aio->jobs_count);

	if (aio->curjob) {
		a_job = aio->curjob;
		(*pr)("\nAIO current job:\n");
		(*pr)(" opcode = %d, errno = %d, state = %d, aiocb_ptr = %p\n",
		    a_job->aio_op, a_job->aiocbp._errno,
		    a_job->aiocbp._state, a_job->aiocb_uptr);
		aiocbp = &a_job->aiocbp;
		(*pr)("   fd = %d, offset = %lld, buf = %p, nbytes = %zu\n",
		    aiocbp->aio_fildes, (long long)aiocbp->aio_offset,
		    aiocbp->aio_buf, aiocbp->aio_nbytes);
	}

	(*pr)("\nAIO queue:\n");
	TAILQ_FOREACH(a_job, &aio->jobs_queue, list) {
		(*pr)(" opcode = %d, errno = %d, state = %d, aiocb_ptr = %p\n",
		    a_job->aio_op, a_job->aiocbp._errno,
		    a_job->aiocbp._state, a_job->aiocb_uptr);
		aiocbp = &a_job->aiocbp;
		(*pr)("   fd = %d, offset = %lld, buf = %p, nbytes = %zu\n",
		    aiocbp->aio_fildes, (long long)aiocbp->aio_offset,
		    aiocbp->aio_buf, aiocbp->aio_nbytes);
	}
}
#endif /* defined(DDB) */
1081