/*	$NetBSD: sys_aio.c,v 1.50 2024/12/07 02:38:51 riastradh Exp $	*/

/*
 * Copyright (c) 2007 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Implementation of POSIX asynchronous I/O.
 * Defined in the Base Definitions volume of IEEE Std 1003.1-2001.
 */

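/*
 * For orientation, an illustrative userland usage of this interface
 * (a sketch only, not part of the kernel source; "fd" and "buf" are
 * placeholders and error handling is elided):
 *
 *	struct aiocb cb;
 *
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *
 *	aio_read(&cb);				enqueue the job
 *	while (aio_error(&cb) == EINPROGRESS)	poll for completion
 *		... wait or do other work ...
 *	ssize_t n = aio_return(&cb);		collect the result
 */
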
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_aio.c,v 1.50 2024/12/07 02:38:51 riastradh Exp $");

#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#endif

#include <sys/param.h>
#include <sys/types.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sdt.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/syscallargs.h>
#include <sys/syscallvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>

#include <uvm/uvm_extern.h>

MODULE(MODULE_CLASS_MISC, aio, NULL);

/*
 * System-wide limits and counter of AIO operations.
 */
u_int			aio_listio_max = AIO_LISTIO_MAX;
static u_int		aio_max = AIO_MAX;
static u_int		aio_jobs_count;

static struct pool	aio_job_pool;
static struct pool	aio_lio_pool;
static void *		aio_ehook;

static void		aio_worker(void *);
static void		aio_process(struct aio_job *);
static void		aio_sendsig(struct proc *, struct sigevent *);
static int		aio_enqueue_job(int, void *, struct lio_req *);
static void		aio_exit(proc_t *, void *);

static int		sysctl_aio_listio_max(SYSCTLFN_PROTO);
static int		sysctl_aio_max(SYSCTLFN_PROTO);

static const struct syscall_package aio_syscalls[] = {
	{ SYS_aio_cancel, 0, (sy_call_t *)sys_aio_cancel },
	{ SYS_aio_error, 0, (sy_call_t *)sys_aio_error },
	{ SYS_aio_fsync, 0, (sy_call_t *)sys_aio_fsync },
	{ SYS_aio_read, 0, (sy_call_t *)sys_aio_read },
	{ SYS_aio_return, 0, (sy_call_t *)sys_aio_return },
	{ SYS___aio_suspend50, 0, (sy_call_t *)sys___aio_suspend50 },
	{ SYS_aio_write, 0, (sy_call_t *)sys_aio_write },
	{ SYS_lio_listio, 0, (sy_call_t *)sys_lio_listio },
	{ 0, 0, NULL },
};

/*
 * Tear down all AIO state.
 */
static int
aio_fini(bool interface)
{
	int error;
	proc_t *p;

	if (interface) {
		/* Stop syscall activity. */
		error = syscall_disestablish(NULL, aio_syscalls);
		if (error != 0)
			return error;
		/* Abort if any processes are using AIO. */
		mutex_enter(&proc_lock);
		PROCLIST_FOREACH(p, &allproc) {
			if (p->p_aio != NULL)
				break;
		}
		mutex_exit(&proc_lock);
		if (p != NULL) {
			error = syscall_establish(NULL, aio_syscalls);
			KASSERT(error == 0);
			return SET_ERROR(EBUSY);
		}
	}

	KASSERT(aio_jobs_count == 0);
	exithook_disestablish(aio_ehook);
	pool_destroy(&aio_job_pool);
	pool_destroy(&aio_lio_pool);
	return 0;
}

/*
 * Initialize global AIO state.
 */
static int
aio_init(void)
{
	int error;

	pool_init(&aio_job_pool, sizeof(struct aio_job), 0, 0, 0,
	    "aio_jobs_pool", &pool_allocator_nointr, IPL_NONE);
	pool_init(&aio_lio_pool, sizeof(struct lio_req), 0, 0, 0,
	    "aio_lio_pool", &pool_allocator_nointr, IPL_NONE);
	aio_ehook = exithook_establish(aio_exit, NULL);

	error = syscall_establish(NULL, aio_syscalls);
	if (error != 0)
		(void)aio_fini(false);
	return error;
}

/*
 * Module interface.
 */
static int
aio_modcmd(modcmd_t cmd, void *arg)
{

	switch (cmd) {
	case MODULE_CMD_INIT:
		return aio_init();
	case MODULE_CMD_FINI:
		return aio_fini(true);
	default:
		return SET_ERROR(ENOTTY);
	}
}

/*
 * Initialize the asynchronous I/O state for the process.
 */
static int
aio_procinit(struct proc *p)
{
	struct aioproc *aio;
	struct lwp *l;
	int error;
	vaddr_t uaddr;

	/* Allocate and initialize the AIO structure */
	aio = kmem_zalloc(sizeof(struct aioproc), KM_SLEEP);

	/* Initialize the job queue and its synchronization structures */
	mutex_init(&aio->aio_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&aio->aio_worker_cv, "aiowork");
	cv_init(&aio->done_cv, "aiodone");
	TAILQ_INIT(&aio->jobs_queue);

	/*
	 * Create an AIO worker thread.
	 * XXX: Currently, the AIO worker thread is not protected
	 * against user actions.
	 */
	uaddr = uvm_uarea_alloc();
	if (uaddr == 0) {
		aio_exit(p, aio);
		return SET_ERROR(EAGAIN);
	}
	error = lwp_create(curlwp, p, uaddr, 0, NULL, 0, aio_worker,
	    NULL, &l, curlwp->l_class, &curlwp->l_sigmask, &curlwp->l_sigstk);
	if (error != 0) {
		uvm_uarea_free(uaddr);
		aio_exit(p, aio);
		return error;
	}

	/*
	 * Recheck under the lock that we really are the first
	 * initializer; if not, undo our work and let the winner stand.
	 */
	mutex_enter(p->p_lock);
	if (p->p_aio) {
		mutex_exit(p->p_lock);
		aio_exit(p, aio);
		lwp_exit(l);
		return 0;
	}
	p->p_aio = aio;

	/* Complete the initialization of the thread, and run it */
	aio->aio_worker = l;
	lwp_lock(l);
	lwp_changepri(l, MAXPRI_USER);
	setrunnable(l);
	/* LWP now unlocked */
	mutex_exit(p->p_lock);

	return 0;
}

/*
 * Tear down the asynchronous I/O state of a process; also invoked as
 * the process exit hook.
 */
static void
aio_exit(struct proc *p, void *cookie)
{
	struct aio_job *a_job;
	struct aioproc *aio;

	if (cookie != NULL)
		aio = cookie;
	else if ((aio = p->p_aio) == NULL)
		return;

	/* Free the AIO queue */
	while (!TAILQ_EMPTY(&aio->jobs_queue)) {
		a_job = TAILQ_FIRST(&aio->jobs_queue);
		TAILQ_REMOVE(&aio->jobs_queue, a_job, list);
		pool_put(&aio_job_pool, a_job);
		atomic_dec_uint(&aio_jobs_count);
	}

	/* Destroy and free the entire AIO data structure */
	cv_destroy(&aio->aio_worker_cv);
	cv_destroy(&aio->done_cv);
	mutex_destroy(&aio->aio_mtx);
	kmem_free(aio, sizeof(struct aioproc));
}

/*
 * AIO worker thread and processor.
 */
static void
aio_worker(void *arg)
{
	struct proc *p = curlwp->l_proc;
	struct aioproc *aio = p->p_aio;
	struct aio_job *a_job;
	struct lio_req *lio;
	sigset_t oss, nss;
	int error __diagused, refcnt;

	/*
	 * Block all signals: only SIGKILL and SIGSTOP, which cannot
	 * be masked, will still be delivered to this thread.
	 */
	sigfillset(&nss);
	mutex_enter(p->p_lock);
	error = sigprocmask1(curlwp, SIG_SETMASK, &nss, &oss);
	mutex_exit(p->p_lock);
	KASSERT(error == 0);

	for (;;) {
		/*
		 * Loop for each job in the queue.  If there
		 * are no jobs, then sleep.
		 */
		mutex_enter(&aio->aio_mtx);
		while ((a_job = TAILQ_FIRST(&aio->jobs_queue)) == NULL) {
			if (cv_wait_sig(&aio->aio_worker_cv, &aio->aio_mtx)) {
				/*
				 * Thread was interrupted - check for
				 * pending exit or suspend.
				 */
				mutex_exit(&aio->aio_mtx);
				lwp_userret(curlwp);
				mutex_enter(&aio->aio_mtx);
			}
		}

		/* Take the job from the queue */
		aio->curjob = a_job;
		TAILQ_REMOVE(&aio->jobs_queue, a_job, list);

		atomic_dec_uint(&aio_jobs_count);
		aio->jobs_count--;

		mutex_exit(&aio->aio_mtx);

		/* Process an AIO operation */
		aio_process(a_job);

		/* Copy the control block back to user-space */
		(void)copyout(&a_job->aiocbp, a_job->aiocb_uptr,
		    sizeof(struct aiocb));

		mutex_enter(&aio->aio_mtx);
		KASSERT(aio->curjob == a_job);
		aio->curjob = NULL;

		/* Drop the LIO reference, if the job is part of a list I/O */
		lio = a_job->lio;
		refcnt = (lio != NULL ? --lio->refcnt : -1);

		/* Notify all suspenders */
		cv_broadcast(&aio->done_cv);
		mutex_exit(&aio->aio_mtx);

		/* Send a signal, if any */
		aio_sendsig(p, &a_job->aiocbp.aio_sigevent);

		/* Destroy the LIO structure if this was its last job */
		if (refcnt == 0) {
			aio_sendsig(p, &lio->sig);
			pool_put(&aio_lio_pool, lio);
		}

		/* Destroy the job */
		pool_put(&aio_job_pool, a_job);
	}

	/* NOTREACHED */
}

static void
aio_process(struct aio_job *a_job)
{
	struct proc *p = curlwp->l_proc;
	struct aiocb *aiocbp = &a_job->aiocbp;
	struct file *fp;
	int fd = aiocbp->aio_fildes;
	int error = 0;

	KASSERT(a_job->aio_op != 0);

	if ((a_job->aio_op & (AIO_READ | AIO_WRITE)) != 0) {
		struct iovec aiov;
		struct uio auio;

		if (aiocbp->aio_nbytes > SSIZE_MAX) {
			error = SET_ERROR(EINVAL);
			goto done;
		}

		fp = fd_getfile(fd);
		if (fp == NULL) {
			error = SET_ERROR(EBADF);
			goto done;
		}

		aiov.iov_base = (void *)(uintptr_t)aiocbp->aio_buf;
		aiov.iov_len = aiocbp->aio_nbytes;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_resid = aiocbp->aio_nbytes;
		auio.uio_vmspace = p->p_vmspace;

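		/*
		 * Note: the offset pointer handed to fo_read/fo_write
		 * below is the job's private copy of aio_offset, so
		 * FOF_UPDATE_OFFSET advances the job's offset and not
		 * the descriptor's f_offset.
		 */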
		if (a_job->aio_op & AIO_READ) {
			/*
			 * Perform a read operation
			 */
			KASSERT((a_job->aio_op & AIO_WRITE) == 0);

			if ((fp->f_flag & FREAD) == 0) {
				fd_putfile(fd);
				error = SET_ERROR(EBADF);
				goto done;
			}
			auio.uio_rw = UIO_READ;
			error = (*fp->f_ops->fo_read)(fp, &aiocbp->aio_offset,
			    &auio, fp->f_cred, FOF_UPDATE_OFFSET);
		} else {
			/*
			 * Perform a write operation
			 */
			KASSERT(a_job->aio_op & AIO_WRITE);

			if ((fp->f_flag & FWRITE) == 0) {
				fd_putfile(fd);
				error = SET_ERROR(EBADF);
				goto done;
			}
			auio.uio_rw = UIO_WRITE;
			error = (*fp->f_ops->fo_write)(fp, &aiocbp->aio_offset,
			    &auio, fp->f_cred, FOF_UPDATE_OFFSET);
		}
		fd_putfile(fd);

		/* Store the result value */
		a_job->aiocbp.aio_nbytes -= auio.uio_resid;
		a_job->aiocbp._retval = (error == 0) ?
		    a_job->aiocbp.aio_nbytes : -1;

	} else if ((a_job->aio_op & (AIO_SYNC | AIO_DSYNC)) != 0) {
		/*
		 * Perform a file sync operation
		 */
		struct vnode *vp;

		if ((error = fd_getvnode(fd, &fp)) != 0)
			goto done;

		if ((fp->f_flag & FWRITE) == 0) {
			fd_putfile(fd);
			error = SET_ERROR(EBADF);
			goto done;
		}

		vp = fp->f_vnode;
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		if (a_job->aio_op & AIO_DSYNC) {
			error = VOP_FSYNC(vp, fp->f_cred,
			    FSYNC_WAIT | FSYNC_DATAONLY, 0, 0);
		} else if (a_job->aio_op & AIO_SYNC) {
			error = VOP_FSYNC(vp, fp->f_cred,
			    FSYNC_WAIT, 0, 0);
		}
		VOP_UNLOCK(vp);
		fd_putfile(fd);

		/* Store the result value */
		a_job->aiocbp._retval = (error == 0) ? 0 : -1;

	} else
		panic("aio_process: invalid operation code");

done:
	/* The job is done; set the error, if any */
	a_job->aiocbp._errno = error;
	a_job->aiocbp._state = JOB_DONE;
}

/*
 * Send an AIO signal.
 */
static void
aio_sendsig(struct proc *p, struct sigevent *sig)
{
	ksiginfo_t ksi;

	if (sig->sigev_signo == 0 || sig->sigev_notify == SIGEV_NONE)
		return;

	KSI_INIT(&ksi);
	ksi.ksi_signo = sig->sigev_signo;
	ksi.ksi_code = SI_ASYNCIO;
	ksi.ksi_value = sig->sigev_value;
	mutex_enter(&proc_lock);
	kpsignal(p, &ksi, NULL);
	mutex_exit(&proc_lock);
}

/*
 * Enqueue the job.
 */
static int
aio_enqueue_job(int op, void *aiocb_uptr, struct lio_req *lio)
{
	struct proc *p = curlwp->l_proc;
	struct aioproc *aio;
	struct aio_job *a_job;
	struct aiocb aiocbp;
	struct sigevent *sig;
	int error;

	/* Racy, unlocked check against the limit (rechecked below) */
	if (aio_jobs_count + 1 > aio_max)
		return SET_ERROR(EAGAIN);

	/* Get the data structure from user-space */
	error = copyin(aiocb_uptr, &aiocbp, sizeof(struct aiocb));
	if (error)
		return error;

	/* Check if a signal is set, and validate it */
	sig = &aiocbp.aio_sigevent;
	if (sig->sigev_signo < 0 || sig->sigev_signo >= NSIG ||
	    sig->sigev_notify < SIGEV_NONE || sig->sigev_notify > SIGEV_SA)
		return SET_ERROR(EINVAL);

	/* Validate the buffer and byte count */
	if (((AIO_SYNC | AIO_DSYNC) & op) == 0)
		if (aiocbp.aio_buf == NULL || aiocbp.aio_nbytes > SSIZE_MAX)
			return SET_ERROR(EINVAL);

	/* Check the opcode; LIO_NOP is simply ignored */
	if (op == AIO_LIO) {
		KASSERT(lio != NULL);
		if (aiocbp.aio_lio_opcode == LIO_WRITE)
			op = AIO_WRITE;
		else if (aiocbp.aio_lio_opcode == LIO_READ)
			op = AIO_READ;
		else
			return (aiocbp.aio_lio_opcode == LIO_NOP) ? 0 :
			    SET_ERROR(EINVAL);
	} else {
		KASSERT(lio == NULL);
	}

	/*
	 * Look for an already existing job.  If found, that job is
	 * still in progress; according to POSIX this is invalid, so
	 * return the error.
	 */
	aio = p->p_aio;
	if (aio) {
		mutex_enter(&aio->aio_mtx);
		TAILQ_FOREACH(a_job, &aio->jobs_queue, list) {
			if (a_job->aiocb_uptr != aiocb_uptr)
				continue;
			mutex_exit(&aio->aio_mtx);
			return SET_ERROR(EINVAL);
		}
		mutex_exit(&aio->aio_mtx);
	}

	/*
	 * Check if the AIO structure is initialized; if not, initialize
	 * it.  In the LIO case, we did that already.  We will recheck
	 * this with the lock held in aio_procinit().
	 */
	if (lio == NULL && p->p_aio == NULL)
		if (aio_procinit(p))
			return SET_ERROR(EAGAIN);
	aio = p->p_aio;

	/*
	 * Set the state with the errno, and copy the data structure
	 * back to user-space.
	 */
	aiocbp._state = JOB_WIP;
	aiocbp._errno = SET_ERROR(EINPROGRESS);
	aiocbp._retval = -1;
	error = copyout(&aiocbp, aiocb_uptr, sizeof(struct aiocb));
	if (error)
		return error;

	/* Allocate and initialize a new AIO job */
	a_job = pool_get(&aio_job_pool, PR_WAITOK | PR_ZERO);

	/*
	 * Set the data.
	 * Store the user-space pointer for lookups.  Since pointers
	 * are compared only within the owning process, this is safe.
	 */
	memcpy(&a_job->aiocbp, &aiocbp, sizeof(struct aiocb));
	a_job->aiocb_uptr = aiocb_uptr;
	a_job->aio_op |= op;
	a_job->lio = lio;

	/*
	 * Add the job to the queue, update the counters, and
	 * notify the AIO worker thread to handle the job.
	 */
	mutex_enter(&aio->aio_mtx);

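	/*
	 * The global counter is bumped optimistically with an atomic
	 * op and rolled back on failure; this enforces aio_max without
	 * taking a global lock.
	 */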
	/* Fail if a limit was reached */
	if (atomic_inc_uint_nv(&aio_jobs_count) > aio_max ||
	    aio->jobs_count >= aio_listio_max) {
		atomic_dec_uint(&aio_jobs_count);
		mutex_exit(&aio->aio_mtx);
		pool_put(&aio_job_pool, a_job);
		return SET_ERROR(EAGAIN);
	}

	TAILQ_INSERT_TAIL(&aio->jobs_queue, a_job, list);
	aio->jobs_count++;
	if (lio)
		lio->refcnt++;
	cv_signal(&aio->aio_worker_cv);

	mutex_exit(&aio->aio_mtx);

	/*
	 * Per POSIX, errors of the queued operation are reported
	 * only via aio_error().
	 */
	return 0;
}

/*
 * Syscall functions.
 */

int
sys_aio_cancel(struct lwp *l, const struct sys_aio_cancel_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) fildes;
		syscallarg(struct aiocb *) aiocbp;
	} */
	struct proc *p = l->l_proc;
	struct aioproc *aio;
	struct aio_job *a_job;
	struct aiocb *aiocbp_ptr;
	struct lio_req *lio;
	struct filedesc	*fdp = p->p_fd;
	unsigned int cn, errcnt, fildes;
	fdtab_t *dt;

	TAILQ_HEAD(, aio_job) tmp_jobs_list;

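	/*
	 * Cancellation is two-phase: matching jobs are moved to a
	 * private list while holding aio_mtx, and their completion
	 * status is copied out (and signals sent) only after the lock
	 * is dropped, since copyout() may fault and sleep.
	 */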
	/* Check for an invalid file descriptor */
	fildes = (unsigned int)SCARG(uap, fildes);
	dt = atomic_load_consume(&fdp->fd_dt);
	if (fildes >= dt->dt_nfiles)
		return SET_ERROR(EBADF);
	if (dt->dt_ff[fildes] == NULL || dt->dt_ff[fildes]->ff_file == NULL)
		return SET_ERROR(EBADF);

	/* Check if the AIO structure is initialized */
	if (p->p_aio == NULL) {
		*retval = AIO_NOTCANCELED;
		return 0;
	}

	aio = p->p_aio;
	aiocbp_ptr = (struct aiocb *)SCARG(uap, aiocbp);

	mutex_enter(&aio->aio_mtx);

	/* Cancel the jobs, and remove them from the queue */
	cn = 0;
	TAILQ_INIT(&tmp_jobs_list);
	TAILQ_FOREACH(a_job, &aio->jobs_queue, list) {
		if (aiocbp_ptr) {
			if (aiocbp_ptr != a_job->aiocb_uptr)
				continue;
			if (fildes != a_job->aiocbp.aio_fildes) {
				mutex_exit(&aio->aio_mtx);
				return SET_ERROR(EBADF);
			}
		} else if (a_job->aiocbp.aio_fildes != fildes)
			continue;

		TAILQ_REMOVE(&aio->jobs_queue, a_job, list);
		TAILQ_INSERT_TAIL(&tmp_jobs_list, a_job, list);

		/* Decrease the counters */
		atomic_dec_uint(&aio_jobs_count);
		aio->jobs_count--;
		lio = a_job->lio;
		if (lio != NULL && --lio->refcnt != 0)
			a_job->lio = NULL;

		cn++;
		if (aiocbp_ptr)
			break;
	}

	/* Report if any jobs were cancelled */
	if (cn)
		*retval = AIO_CANCELED;

	/* The job currently being processed cannot be cancelled */
	a_job = aio->curjob;
	if (a_job && ((a_job->aiocbp.aio_fildes == fildes) ||
	    (a_job->aiocb_uptr == aiocbp_ptr)))
		*retval = AIO_NOTCANCELED;

	mutex_exit(&aio->aio_mtx);

	/* Free the jobs after dropping the lock */
	errcnt = 0;
	while (!TAILQ_EMPTY(&tmp_jobs_list)) {
		a_job = TAILQ_FIRST(&tmp_jobs_list);
		TAILQ_REMOVE(&tmp_jobs_list, a_job, list);
		/* Set the errno and copy the structures back to user-space */
		a_job->aiocbp._errno = SET_ERROR(ECANCELED);
		a_job->aiocbp._state = JOB_DONE;
		if (copyout(&a_job->aiocbp, a_job->aiocb_uptr,
		    sizeof(struct aiocb)))
			errcnt++;
		/* Send a signal, if any */
		aio_sendsig(p, &a_job->aiocbp.aio_sigevent);
		if (a_job->lio) {
			lio = a_job->lio;
			aio_sendsig(p, &lio->sig);
			pool_put(&aio_lio_pool, lio);
		}
		pool_put(&aio_job_pool, a_job);
	}

	if (errcnt)
		return SET_ERROR(EFAULT);

	/* Set the correct return value */
	if (*retval == 0)
		*retval = AIO_ALLDONE;

	return 0;
}

int
sys_aio_error(struct lwp *l, const struct sys_aio_error_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(const struct aiocb *) aiocbp;
	} */
	struct proc *p = l->l_proc;
	struct aioproc *aio = p->p_aio;
	struct aiocb aiocbp;
	int error;

	if (aio == NULL)
		return SET_ERROR(EINVAL);

	error = copyin(SCARG(uap, aiocbp), &aiocbp, sizeof(struct aiocb));
	if (error)
		return error;

	if (aiocbp._state == JOB_NONE)
		return SET_ERROR(EINVAL);

	*retval = aiocbp._errno;

	return 0;
}

int
sys_aio_fsync(struct lwp *l, const struct sys_aio_fsync_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) op;
		syscallarg(struct aiocb *) aiocbp;
	} */
	int op = SCARG(uap, op);

	if ((op != O_DSYNC) && (op != O_SYNC))
		return SET_ERROR(EINVAL);

	op = (op == O_DSYNC) ? AIO_DSYNC : AIO_SYNC;

	return aio_enqueue_job(op, SCARG(uap, aiocbp), NULL);
}

int
sys_aio_read(struct lwp *l, const struct sys_aio_read_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(struct aiocb *) aiocbp;
	} */

	return aio_enqueue_job(AIO_READ, SCARG(uap, aiocbp), NULL);
}

int
sys_aio_return(struct lwp *l, const struct sys_aio_return_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(struct aiocb *) aiocbp;
	} */
	struct proc *p = l->l_proc;
	struct aioproc *aio = p->p_aio;
	struct aiocb aiocbp;
	int error;

	if (aio == NULL)
		return SET_ERROR(EINVAL);

	error = copyin(SCARG(uap, aiocbp), &aiocbp, sizeof(struct aiocb));
	if (error)
		return error;

	if (aiocbp._errno == EINPROGRESS || aiocbp._state != JOB_DONE)
		return SET_ERROR(EINVAL);

	*retval = aiocbp._retval;

	/* Reset the internal variables */
	aiocbp._errno = 0;
	aiocbp._retval = -1;
	aiocbp._state = JOB_NONE;
	error = copyout(&aiocbp, SCARG(uap, aiocbp), sizeof(struct aiocb));

	return error;
}

int
sys___aio_suspend50(struct lwp *l, const struct sys___aio_suspend50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(const struct aiocb *const[]) list;
		syscallarg(int) nent;
		syscallarg(const struct timespec *) timeout;
	} */
	struct aiocb **list;
	struct timespec ts;
	int error, nent;

	nent = SCARG(uap, nent);
	if (nent <= 0 || nent > aio_listio_max)
		return SET_ERROR(EAGAIN);

	if (SCARG(uap, timeout)) {
		/* Copy in the timeout (converted to ticks in aio_suspend1()) */
		error = copyin(SCARG(uap, timeout), &ts,
		    sizeof(struct timespec));
		if (error)
			return error;
	}

	list = kmem_alloc(nent * sizeof(*list), KM_SLEEP);
	error = copyin(SCARG(uap, list), list, nent * sizeof(*list));
	if (error)
		goto out;
	error = aio_suspend1(l, list, nent, SCARG(uap, timeout) ? &ts : NULL);
out:
	kmem_free(list, nent * sizeof(*list));
	return error;
}

int
aio_suspend1(struct lwp *l, struct aiocb **aiocbp_list, int nent,
    struct timespec *ts)
{
	struct proc *p = l->l_proc;
	struct aioproc *aio;
	struct aio_job *a_job;
	int i, error, timo;

	if (p->p_aio == NULL)
		return SET_ERROR(EAGAIN);
	aio = p->p_aio;

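	/*
	 * Convert the timeout to clock ticks.  A nonzero timeout that
	 * rounds down to zero ticks is bumped to one tick, since a timo
	 * of 0 would mean an untimed wait in cv_timedwait_sig(); a zero
	 * or negative timeout fails immediately with EAGAIN.
	 */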
	if (ts) {
		timo = mstohz((ts->tv_sec * 1000) + (ts->tv_nsec / 1000000));
		if (timo == 0 && ts->tv_sec == 0 && ts->tv_nsec > 0)
			timo = 1;
		if (timo <= 0)
			return SET_ERROR(EAGAIN);
	} else
		timo = 0;

	mutex_enter(&aio->aio_mtx);
	for (;;) {
		for (i = 0; i < nent; i++) {

			/* Skip NULL entries */
			if (aiocbp_list[i] == NULL)
				continue;

			/* Skip the job currently being processed */
			if (aio->curjob) {
				a_job = aio->curjob;
				if (a_job->aiocb_uptr == aiocbp_list[i])
					continue;
			}

			/* Look for the job in the queue */
			TAILQ_FOREACH(a_job, &aio->jobs_queue, list)
				if (a_job->aiocb_uptr == aiocbp_list[i])
					break;

			if (a_job == NULL) {
				struct aiocb aiocbp;

				mutex_exit(&aio->aio_mtx);

				/* Check if the job is done. */
				error = copyin(aiocbp_list[i], &aiocbp,
				    sizeof(struct aiocb));
				if (error == 0 && aiocbp._state != JOB_DONE) {
					mutex_enter(&aio->aio_mtx);
					continue;
				}
				return error;
			}
		}

		/* Wait until signalled, or until the timeout expires */
		error = cv_timedwait_sig(&aio->done_cv, &aio->aio_mtx, timo);
		if (error) {
			if (error == EWOULDBLOCK)
				error = SET_ERROR(EAGAIN);
			break;
		}
	}
	mutex_exit(&aio->aio_mtx);
	return error;
}

int
sys_aio_write(struct lwp *l, const struct sys_aio_write_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(struct aiocb *) aiocbp;
	} */

	return aio_enqueue_job(AIO_WRITE, SCARG(uap, aiocbp), NULL);
}

int
sys_lio_listio(struct lwp *l, const struct sys_lio_listio_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) mode;
		syscallarg(struct aiocb *const[]) list;
		syscallarg(int) nent;
		syscallarg(struct sigevent *) sig;
	} */
	struct proc *p = l->l_proc;
	struct aioproc *aio;
	struct aiocb **aiocbp_list;
	struct lio_req *lio;
	int i, error, errcnt, mode, nent;

	mode = SCARG(uap, mode);
	nent = SCARG(uap, nent);

	/* Racy, unlocked checks for the limit and invalid values */
	if (nent < 1 || nent > aio_listio_max)
		return SET_ERROR(EINVAL);
	if (aio_jobs_count + nent > aio_max)
		return SET_ERROR(EAGAIN);

	/* Check if the AIO structure is initialized; if not, initialize it */
	if (p->p_aio == NULL)
		if (aio_procinit(p))
			return SET_ERROR(EAGAIN);
	aio = p->p_aio;

	/* Create an LIO structure */
	lio = pool_get(&aio_lio_pool, PR_WAITOK);
	lio->refcnt = 1;
	error = 0;
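	/*
	 * The initial reference is owned by this function and is
	 * dropped at the "err" label; every successfully enqueued job
	 * takes an additional reference in aio_enqueue_job().
	 */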

	switch (mode) {
	case LIO_WAIT:
		memset(&lio->sig, 0, sizeof(struct sigevent));
		break;
	case LIO_NOWAIT:
		/* Check for a signal, and validate it */
		if (SCARG(uap, sig)) {
			struct sigevent *sig = &lio->sig;

			error = copyin(SCARG(uap, sig), &lio->sig,
			    sizeof(struct sigevent));
			if (error == 0 &&
			    (sig->sigev_signo < 0 ||
			    sig->sigev_signo >= NSIG ||
			    sig->sigev_notify < SIGEV_NONE ||
			    sig->sigev_notify > SIGEV_SA))
				error = SET_ERROR(EINVAL);
		} else
			memset(&lio->sig, 0, sizeof(struct sigevent));
		break;
	default:
		error = SET_ERROR(EINVAL);
		break;
	}

	if (error != 0) {
		pool_put(&aio_lio_pool, lio);
		return error;
	}

	/* Get the list from user-space */
	aiocbp_list = kmem_alloc(nent * sizeof(*aiocbp_list), KM_SLEEP);
	error = copyin(SCARG(uap, list), aiocbp_list,
	    nent * sizeof(*aiocbp_list));
	if (error) {
		mutex_enter(&aio->aio_mtx);
		goto err;
	}

	/* Enqueue all jobs */
	errcnt = 0;
	for (i = 0; i < nent; i++) {
		error = aio_enqueue_job(AIO_LIO, aiocbp_list[i], lio);
		/*
		 * Per POSIX, in this error case the call may fail even
		 * though some of the other I/O operations were
		 * successfully initiated.
		 */
		if (error)
			errcnt++;
	}

	mutex_enter(&aio->aio_mtx);

	/* Return an error, if any */
	if (errcnt) {
		error = SET_ERROR(EIO);
		goto err;
	}

	if (mode == LIO_WAIT) {
		/*
		 * Wait for AIO completion.  In that case,
		 * the LIO structure will be freed here.
		 */
		while (lio->refcnt > 1 && error == 0)
			error = cv_wait_sig(&aio->done_cv, &aio->aio_mtx);
		if (error)
			error = SET_ERROR(EINTR);
	}

err:
	if (--lio->refcnt != 0)
		lio = NULL;
	mutex_exit(&aio->aio_mtx);
	if (lio != NULL) {
		aio_sendsig(p, &lio->sig);
		pool_put(&aio_lio_pool, lio);
	}
	kmem_free(aiocbp_list, nent * sizeof(*aiocbp_list));
	return error;
}

/*
 * SysCtl
 */

static int
sysctl_aio_listio_max(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int error, newsize;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = aio_listio_max;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	if (newsize < 1 || newsize > aio_max)
		return SET_ERROR(EINVAL);
	aio_listio_max = newsize;

	return 0;
}

static int
sysctl_aio_max(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int error, newsize;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = aio_max;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	if (newsize < 1 || newsize < aio_listio_max)
		return SET_ERROR(EINVAL);
	aio_max = newsize;

	return 0;
}

SYSCTL_SETUP(sysctl_aio_init, "aio sysctl")
{
	int rv;

	rv = sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
	    CTLTYPE_INT, "posix_aio",
	    SYSCTL_DESCR("Version of IEEE Std 1003.1 and its "
		"Asynchronous I/O option to which the "
		"system attempts to conform"),
	    NULL, _POSIX_ASYNCHRONOUS_IO, NULL, 0,
	    CTL_KERN, CTL_CREATE, CTL_EOL);

	if (rv != 0)
		return;

	rv = sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_INT, "aio_listio_max",
	    SYSCTL_DESCR("Maximum number of asynchronous I/O "
		"operations in a single list I/O call"),
	    sysctl_aio_listio_max, 0, &aio_listio_max, 0,
	    CTL_KERN, CTL_CREATE, CTL_EOL);

	if (rv != 0)
		return;

	rv = sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_INT, "aio_max",
	    SYSCTL_DESCR("Maximum number of asynchronous I/O "
		"operations"),
	    sysctl_aio_max, 0, &aio_max, 0,
	    CTL_KERN, CTL_CREATE, CTL_EOL);

	return;
}

/*
 * Debugging
 */
#if defined(DDB)
void
aio_print_jobs(void (*pr)(const char *, ...))
{
	struct proc *p = curlwp->l_proc;
	struct aioproc *aio;
	struct aio_job *a_job;
	struct aiocb *aiocbp;

	if (p == NULL) {
		(*pr)("AIO: no current process.\n");
		return;
	}

	aio = p->p_aio;
	if (aio == NULL) {
		(*pr)("AIO data is not initialized (PID = %d).\n", p->p_pid);
		return;
	}

	(*pr)("AIO: PID = %d\n", p->p_pid);
	(*pr)("AIO: Global count of the jobs = %u\n", aio_jobs_count);
	(*pr)("AIO: Count of the jobs = %u\n", aio->jobs_count);

	if (aio->curjob) {
		a_job = aio->curjob;
		(*pr)("\nAIO current job:\n");
		(*pr)(" opcode = %d, errno = %d, state = %d, aiocb_ptr = %p\n",
		    a_job->aio_op, a_job->aiocbp._errno,
		    a_job->aiocbp._state, a_job->aiocb_uptr);
		aiocbp = &a_job->aiocbp;
		(*pr)("   fd = %d, offset = %jd, buf = %p, nbytes = %zu\n",
		    aiocbp->aio_fildes, (intmax_t)aiocbp->aio_offset,
		    aiocbp->aio_buf, aiocbp->aio_nbytes);
	}

	(*pr)("\nAIO queue:\n");
	TAILQ_FOREACH(a_job, &aio->jobs_queue, list) {
		(*pr)(" opcode = %d, errno = %d, state = %d, aiocb_ptr = %p\n",
		    a_job->aio_op, a_job->aiocbp._errno,
		    a_job->aiocbp._state, a_job->aiocb_uptr);
		aiocbp = &a_job->aiocbp;
		(*pr)("   fd = %d, offset = %jd, buf = %p, nbytes = %zu\n",
		    aiocbp->aio_fildes, (intmax_t)aiocbp->aio_offset,
		    aiocbp->aio_buf, aiocbp->aio_nbytes);
	}
}
#endif /* defined(DDB) */