1 /* 2 * Copyright (c) 1997 John S. Dyson. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. John S. Dyson's name may not be used to endorse or promote products 10 * derived from this software without specific prior written permission. 11 * 12 * DISCLAIMER: This code isn't warranted to do anything useful. Anything 13 * bad that happens because of using this software isn't the responsibility 14 * of the author. This software is distributed AS-IS. 15 * 16 * $FreeBSD: src/sys/kern/vfs_aio.c,v 1.70.2.28 2003/05/29 06:15:35 alc Exp $ 17 * $DragonFly: src/sys/kern/vfs_aio.c,v 1.42 2007/07/20 17:21:52 dillon Exp $ 18 */ 19 20 /* 21 * This file contains support for the POSIX 1003.1B AIO/LIO facility. 22 */ 23 24 #include <sys/param.h> 25 #include <sys/systm.h> 26 #include <sys/buf.h> 27 #include <sys/sysproto.h> 28 #include <sys/filedesc.h> 29 #include <sys/kernel.h> 30 #include <sys/fcntl.h> 31 #include <sys/file.h> 32 #include <sys/lock.h> 33 #include <sys/unistd.h> 34 #include <sys/proc.h> 35 #include <sys/resourcevar.h> 36 #include <sys/signalvar.h> 37 #include <sys/protosw.h> 38 #include <sys/socketvar.h> 39 #include <sys/sysctl.h> 40 #include <sys/vnode.h> 41 #include <sys/conf.h> 42 #include <sys/event.h> 43 #include <sys/objcache.h> 44 45 #include <vm/vm.h> 46 #include <vm/vm_extern.h> 47 #include <vm/pmap.h> 48 #include <vm/vm_map.h> 49 #include <sys/aio.h> 50 51 #include <sys/file2.h> 52 #include <sys/buf2.h> 53 #include <sys/sysref2.h> 54 #include <sys/thread2.h> 55 #include <sys/mplock2.h> 56 57 #include <machine/limits.h> 58 #include "opt_vfs_aio.h" 59 60 #ifdef VFS_AIO 61 62 /* 63 * Counter for allocating reference ids to new jobs. Wrapped to 1 on 64 * overflow. 
 */
static long jobrefid;

/*
 * Job states stored in aiocblist->jobstate.
 *
 * JOBST_NULL		entry is free (set by aio_free_entry())
 * JOBST_JOBQGLOBAL	queued on the global aio_jobs list for a daemon
 * JOBST_JOBRUNNING	an AIO daemon is currently processing the job
 * JOBST_JOBFINISHED	daemon-processed job sitting on kaio_jobdone
 * JOBST_JOBQBUF	physio job queued on aio_bufjobs/kaio_bufqueue
 * JOBST_JOBBFINISHED	finished physio job (removed from kaio_bufdone
 *			by aio_free_entry())
 */
#define JOBST_NULL		0x0
#define JOBST_JOBQGLOBAL	0x2
#define JOBST_JOBRUNNING	0x3
#define JOBST_JOBFINISHED	0x4
#define	JOBST_JOBQBUF		0x5
#define	JOBST_JOBBFINISHED	0x6

/*
 * Compile-time defaults for the tunables below.  Each one may be
 * overridden at build time by defining the macro beforehand.
 */
#ifndef MAX_AIO_PER_PROC
#define MAX_AIO_PER_PROC	32
#endif

#ifndef MAX_AIO_QUEUE_PER_PROC
#define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef MAX_AIO_PROCS
#define MAX_AIO_PROCS		32
#endif

#ifndef MAX_AIO_QUEUE
#define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef TARGET_AIO_PROCS
#define TARGET_AIO_PROCS	4
#endif

#ifndef MAX_BUF_AIO
#define MAX_BUF_AIO		16
#endif

/* Both timeouts are expressed in ticks (multiples of hz). */
#ifndef AIOD_TIMEOUT_DEFAULT
#define	AIOD_TIMEOUT_DEFAULT	(10 * hz)
#endif

#ifndef AIOD_LIFETIME_DEFAULT
#define AIOD_LIFETIME_DEFAULT	(30 * hz)
#endif

SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");

static int max_aio_procs = MAX_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
	CTLFLAG_RW, &max_aio_procs, 0,
	"Maximum number of kernel threads to use for handling async IO");

/* Incremented by aio_newproc(), decremented when a daemon exits. */
static int num_aio_procs = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
	CTLFLAG_RD, &num_aio_procs, 0,
	"Number of presently active kernel threads for async IO");

/*
 * The code will adjust the actual number of AIO processes towards this
 * number when it gets a chance.
 */
static int target_aio_procs = TARGET_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
	0, "Preferred number of ready kernel threads for async IO");

static int max_queue_count = MAX_AIO_QUEUE;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
    "Maximum number of aio requests to queue, globally");

/* System-wide count of daemon-queued requests. */
static int num_queue_count = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
    "Number of queued aio requests");

/* System-wide count of physio (buf) requests. */
static int num_buf_aio = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
    "Number of aio requests presently handled by the buf subsystem");

/* Number of async I/O threads in the process of being started */
/* XXX This should be local to _aio_aqueue() */
static int num_aio_resv_start = 0;

/* Set to AIOD_TIMEOUT_DEFAULT by aio_onceonly(). */
static int aiod_timeout;
SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
    "Timeout value for synchronous aio operations");

/* Set to AIOD_LIFETIME_DEFAULT by aio_onceonly(). */
static int aiod_lifetime;
SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
    "Maximum lifetime for idle aiod");

/*
 * The three limits below are copied into each process' kaioinfo by
 * aio_init_aioinfo() when that structure is first created.
 */
static int max_aio_per_proc = MAX_AIO_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
    0, "Maximum active aio requests per process (stored in the process)");

static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
    &max_aio_queue_per_proc, 0,
    "Maximum queued aio requests per process (stored in the process)");

static int max_buf_aio = MAX_BUF_AIO;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
    "Maximum buf aio requests per process (stored in the process)");

/*
 * AIO process info
 */
#define AIOP_FREE	0x1			/* proc on free queue */
#define AIOP_SCHED	0x2			/* proc explicitly scheduled */

/*
 * One of these exists per AIO daemon; it is linked on either
 * aio_freeproc or aio_activeproc.
 */
struct aioproclist {
	int aioprocflags;			/* AIO proc flags */
	TAILQ_ENTRY(aioproclist) list;		/* List of processes */
	struct proc *aioproc;			/* The AIO thread */
};

/*
 * data-structure for lio signal management
 */
struct aio_liojob {
	int	lioj_flags;			/* LIOJ_* flags below */
	int	lioj_buffer_count;		/* physio jobs in this lio */
	int	lioj_buffer_finished_count;	/* ... of which finished */
	int	lioj_queue_count;		/* daemon jobs in this lio */
	int	lioj_queue_finished_count;	/* ... of which finished */
	struct	sigevent lioj_signal;	/* signal on all I/O done */
	TAILQ_ENTRY(aio_liojob) lioj_list;	/* link on kaio_liojoblist */
	struct	kaioinfo *lioj_ki;
};
#define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
#define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */

/*
 * per process aio data structure
 */
struct kaioinfo {
	int	kaio_flags;		/* per process kaio flags */
	int	kaio_maxactive_count;	/* maximum number of AIOs */
	int	kaio_active_count;	/* number of currently used AIOs */
	int	kaio_qallowed_count;	/* maximum size of AIO queue */
	int	kaio_queue_count;	/* size of AIO queue */
	int	kaio_ballowed_count;	/* maximum number of buffers */
	int	kaio_queue_finished_count; /* number of daemon jobs finished */
	int	kaio_buffer_count;	/* number of physio buffers */
	int	kaio_buffer_finished_count; /* count of I/O done */
	struct 	proc *kaio_p;		/* process that uses this kaio block */
	TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */
	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* job queue for process */
	TAILQ_HEAD(,aiocblist) kaio_jobdone;	/* done queue for process */
	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* buffer job queue for process */
	TAILQ_HEAD(,aiocblist) kaio_bufdone;	/* buffer done queue for process */
	TAILQ_HEAD(,aiocblist) kaio_sockqueue;	/* queue for aios waiting on sockets */
};

#define KAIO_RUNDOWN	0x1	/* process is being run down */
#define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */

static TAILQ_HEAD(,aioproclist) aio_freeproc, aio_activeproc; 218 static TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */ 219 static TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */ 220 static TAILQ_HEAD(,aiocblist) aio_freejobs; /* Pool of free jobs */ 221 222 static void aio_init_aioinfo(struct proc *p); 223 static void aio_onceonly(void *); 224 static int aio_free_entry(struct aiocblist *aiocbe); 225 static void aio_process(struct aiocblist *aiocbe); 226 static int aio_newproc(void); 227 static int aio_aqueue(struct aiocb *job, int type); 228 static void aio_physwakeup(struct bio *bio); 229 static int aio_fphysio(struct aiocblist *aiocbe); 230 static int aio_qphysio(struct proc *p, struct aiocblist *iocb); 231 static void aio_daemon(void *uproc, struct trapframe *frame); 232 static void process_signal(void *aioj); 233 234 SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL); 235 236 /* 237 * Zones for: 238 * kaio Per process async io info 239 * aiop async io thread data 240 * aiocb async io jobs 241 * aiol list io job pointer - internal to aio_suspend XXX 242 * aiolio list io jobs 243 */ 244 static struct objcache *kaio_oc, *aiop_oc, *aiocb_oc, *aiol_oc, *aiolio_oc; 245 246 static MALLOC_DEFINE(M_AIO, "AIO", "AIO"); 247 static MALLOC_DEFINE(M_AIOP, "AIO proc", "AIO process"); 248 static MALLOC_DEFINE(M_AIOCB, "AIO cb", "AIO cb"); 249 static MALLOC_DEFINE(M_AIOL, "AIO list io", "AIO list io"); 250 static MALLOC_DEFINE(M_AIOLIO, "AIO list io job", "AIO list io job"); 251 252 /* 253 * Startup initialization 254 */ 255 static void 256 aio_onceonly(void *na) 257 { 258 TAILQ_INIT(&aio_freeproc); 259 TAILQ_INIT(&aio_activeproc); 260 TAILQ_INIT(&aio_jobs); 261 TAILQ_INIT(&aio_bufjobs); 262 TAILQ_INIT(&aio_freejobs); 263 kaio_oc = objcache_create_simple(M_AIO, sizeof(struct kaioinfo)); 264 aiop_oc = objcache_create_simple(M_AIOP, sizeof(struct aioproclist)); 265 aiocb_oc = objcache_create_simple(M_AIOCB, sizeof(struct aiocblist)); 266 aiol_oc = 
objcache_create_simple(M_AIOL, AIO_LISTIO_MAX*sizeof(intptr_t)); 267 aiolio_oc = objcache_create_simple(M_AIOLIO, sizeof(struct aio_liojob)); 268 aiod_timeout = AIOD_TIMEOUT_DEFAULT; 269 aiod_lifetime = AIOD_LIFETIME_DEFAULT; 270 jobrefid = 1; 271 } 272 273 /* 274 * Init the per-process aioinfo structure. The aioinfo limits are set 275 * per-process for user limit (resource) management. 276 */ 277 static void 278 aio_init_aioinfo(struct proc *p) 279 { 280 struct kaioinfo *ki; 281 if (p->p_aioinfo == NULL) { 282 ki = objcache_get(kaio_oc, M_WAITOK); 283 p->p_aioinfo = ki; 284 ki->kaio_flags = 0; 285 ki->kaio_maxactive_count = max_aio_per_proc; 286 ki->kaio_active_count = 0; 287 ki->kaio_qallowed_count = max_aio_queue_per_proc; 288 ki->kaio_queue_count = 0; 289 ki->kaio_ballowed_count = max_buf_aio; 290 ki->kaio_buffer_count = 0; 291 ki->kaio_buffer_finished_count = 0; 292 ki->kaio_p = p; 293 TAILQ_INIT(&ki->kaio_jobdone); 294 TAILQ_INIT(&ki->kaio_jobqueue); 295 TAILQ_INIT(&ki->kaio_bufdone); 296 TAILQ_INIT(&ki->kaio_bufqueue); 297 TAILQ_INIT(&ki->kaio_liojoblist); 298 TAILQ_INIT(&ki->kaio_sockqueue); 299 } 300 301 while (num_aio_procs < target_aio_procs) 302 aio_newproc(); 303 } 304 305 /* 306 * Free a job entry. Wait for completion if it is currently active, but don't 307 * delay forever. If we delay, we return a flag that says that we have to 308 * restart the queue scan. 
309 */ 310 static int 311 aio_free_entry(struct aiocblist *aiocbe) 312 { 313 struct kaioinfo *ki; 314 struct aio_liojob *lj; 315 struct proc *p; 316 int error; 317 318 if (aiocbe->jobstate == JOBST_NULL) 319 panic("aio_free_entry: freeing already free job"); 320 321 p = aiocbe->userproc; 322 ki = p->p_aioinfo; 323 lj = aiocbe->lio; 324 if (ki == NULL) 325 panic("aio_free_entry: missing p->p_aioinfo"); 326 327 while (aiocbe->jobstate == JOBST_JOBRUNNING) { 328 aiocbe->jobflags |= AIOCBLIST_RUNDOWN; 329 tsleep(aiocbe, 0, "jobwai", 0); 330 } 331 if (aiocbe->bp == NULL) { 332 if (ki->kaio_queue_count <= 0) 333 panic("aio_free_entry: process queue size <= 0"); 334 if (num_queue_count <= 0) 335 panic("aio_free_entry: system wide queue size <= 0"); 336 337 if (lj) { 338 lj->lioj_queue_count--; 339 if (aiocbe->jobflags & AIOCBLIST_DONE) 340 lj->lioj_queue_finished_count--; 341 } 342 ki->kaio_queue_count--; 343 if (aiocbe->jobflags & AIOCBLIST_DONE) 344 ki->kaio_queue_finished_count--; 345 num_queue_count--; 346 } else { 347 if (lj) { 348 lj->lioj_buffer_count--; 349 if (aiocbe->jobflags & AIOCBLIST_DONE) 350 lj->lioj_buffer_finished_count--; 351 } 352 if (aiocbe->jobflags & AIOCBLIST_DONE) 353 ki->kaio_buffer_finished_count--; 354 ki->kaio_buffer_count--; 355 num_buf_aio--; 356 } 357 358 /* aiocbe is going away, we need to destroy any knotes */ 359 /* XXX lwp knote wants a thread, but only cares about the process */ 360 knote_empty(&aiocbe->klist); 361 362 if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN) 363 && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) { 364 ki->kaio_flags &= ~KAIO_WAKEUP; 365 wakeup(p); 366 } 367 368 if (aiocbe->jobstate == JOBST_JOBQBUF) { 369 if ((error = aio_fphysio(aiocbe)) != 0) 370 return error; 371 if (aiocbe->jobstate != JOBST_JOBBFINISHED) 372 panic("aio_free_entry: invalid physio finish-up state"); 373 crit_enter(); 374 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); 375 crit_exit(); 376 } else if 
(aiocbe->jobstate == JOBST_JOBQGLOBAL) { 377 crit_enter(); 378 TAILQ_REMOVE(&aio_jobs, aiocbe, list); 379 TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); 380 crit_exit(); 381 } else if (aiocbe->jobstate == JOBST_JOBFINISHED) 382 TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist); 383 else if (aiocbe->jobstate == JOBST_JOBBFINISHED) { 384 crit_enter(); 385 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); 386 crit_exit(); 387 if (aiocbe->bp) { 388 vunmapbuf(aiocbe->bp); 389 relpbuf(aiocbe->bp, NULL); 390 aiocbe->bp = NULL; 391 } 392 } 393 if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) { 394 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); 395 objcache_put(aiolio_oc, lj); 396 } 397 aiocbe->jobstate = JOBST_NULL; 398 callout_stop(&aiocbe->timeout); 399 fdrop(aiocbe->fd_file); 400 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 401 return 0; 402 } 403 #endif /* VFS_AIO */ 404 405 /* 406 * Rundown the jobs for a given process. 407 */ 408 void 409 aio_proc_rundown(struct proc *p) 410 { 411 #ifndef VFS_AIO 412 return; 413 #else 414 struct kaioinfo *ki; 415 struct aio_liojob *lj, *ljn; 416 struct aiocblist *aiocbe, *aiocbn; 417 struct file *fp; 418 struct socket *so; 419 420 ki = p->p_aioinfo; 421 if (ki == NULL) 422 return; 423 424 ki->kaio_flags |= LIOJ_SIGNAL_POSTED; 425 while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count > 426 ki->kaio_buffer_finished_count)) { 427 ki->kaio_flags |= KAIO_RUNDOWN; 428 if (tsleep(p, 0, "kaiowt", aiod_timeout)) 429 break; 430 } 431 432 /* 433 * Move any aio ops that are waiting on socket I/O to the normal job 434 * queues so they are cleaned up with any others. 
435 */ 436 crit_enter(); 437 for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe = 438 aiocbn) { 439 aiocbn = TAILQ_NEXT(aiocbe, plist); 440 fp = aiocbe->fd_file; 441 if (fp != NULL) { 442 so = (struct socket *)fp->f_data; 443 TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list); 444 if (TAILQ_EMPTY(&so->so_aiojobq)) { 445 atomic_clear_int(&so->so_snd.ssb_flags, 446 SSB_AIO); 447 atomic_clear_int(&so->so_rcv.ssb_flags, 448 SSB_AIO); 449 } 450 } 451 TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist); 452 TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list); 453 TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist); 454 } 455 crit_exit(); 456 457 restart1: 458 for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) { 459 aiocbn = TAILQ_NEXT(aiocbe, plist); 460 if (aio_free_entry(aiocbe)) 461 goto restart1; 462 } 463 464 restart2: 465 for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe = 466 aiocbn) { 467 aiocbn = TAILQ_NEXT(aiocbe, plist); 468 if (aio_free_entry(aiocbe)) 469 goto restart2; 470 } 471 472 restart3: 473 crit_enter(); 474 while (TAILQ_FIRST(&ki->kaio_bufqueue)) { 475 ki->kaio_flags |= KAIO_WAKEUP; 476 tsleep(p, 0, "aioprn", 0); 477 crit_exit(); 478 goto restart3; 479 } 480 crit_exit(); 481 482 restart4: 483 crit_enter(); 484 for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) { 485 aiocbn = TAILQ_NEXT(aiocbe, plist); 486 if (aio_free_entry(aiocbe)) { 487 crit_exit(); 488 goto restart4; 489 } 490 } 491 crit_exit(); 492 493 /* 494 * If we've slept, jobs might have moved from one queue to another. 495 * Retry rundown if we didn't manage to empty the queues. 
496 */ 497 if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL || 498 TAILQ_FIRST(&ki->kaio_jobqueue) != NULL || 499 TAILQ_FIRST(&ki->kaio_bufqueue) != NULL || 500 TAILQ_FIRST(&ki->kaio_bufdone) != NULL) 501 goto restart1; 502 503 for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) { 504 ljn = TAILQ_NEXT(lj, lioj_list); 505 if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 506 0)) { 507 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); 508 objcache_put(aiolio_oc, lj); 509 } else { 510 #ifdef DIAGNOSTIC 511 kprintf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, " 512 "QF:%d\n", lj->lioj_buffer_count, 513 lj->lioj_buffer_finished_count, 514 lj->lioj_queue_count, 515 lj->lioj_queue_finished_count); 516 #endif 517 } 518 } 519 520 objcache_put(kaio_oc, ki); 521 p->p_aioinfo = NULL; 522 #endif /* VFS_AIO */ 523 } 524 525 #ifdef VFS_AIO 526 /* 527 * Select a job to run (called by an AIO daemon). 528 */ 529 static struct aiocblist * 530 aio_selectjob(struct aioproclist *aiop) 531 { 532 struct aiocblist *aiocbe; 533 struct kaioinfo *ki; 534 struct proc *userp; 535 536 crit_enter(); 537 for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe = 538 TAILQ_NEXT(aiocbe, list)) { 539 userp = aiocbe->userproc; 540 ki = userp->p_aioinfo; 541 542 if (ki->kaio_active_count < ki->kaio_maxactive_count) { 543 TAILQ_REMOVE(&aio_jobs, aiocbe, list); 544 crit_exit(); 545 return aiocbe; 546 } 547 } 548 crit_exit(); 549 550 return NULL; 551 } 552 553 /* 554 * The AIO processing activity. This is the code that does the I/O request for 555 * the non-physio version of the operations. The normal vn operations are used, 556 * and this code should work in all instances for every type of file, including 557 * pipes, sockets, fifos, and regular files. 
558 */ 559 static void 560 aio_process(struct aiocblist *aiocbe) 561 { 562 struct thread *mytd; 563 struct aiocb *cb; 564 struct file *fp; 565 struct uio auio; 566 struct iovec aiov; 567 int cnt; 568 int error; 569 int oublock_st, oublock_end; 570 int inblock_st, inblock_end; 571 572 mytd = curthread; 573 cb = &aiocbe->uaiocb; 574 fp = aiocbe->fd_file; 575 576 aiov.iov_base = (void *)(uintptr_t)cb->aio_buf; 577 aiov.iov_len = cb->aio_nbytes; 578 579 auio.uio_iov = &aiov; 580 auio.uio_iovcnt = 1; 581 auio.uio_offset = cb->aio_offset; 582 auio.uio_resid = cb->aio_nbytes; 583 cnt = cb->aio_nbytes; 584 auio.uio_segflg = UIO_USERSPACE; 585 auio.uio_td = mytd; 586 587 inblock_st = mytd->td_lwp->lwp_ru.ru_inblock; 588 oublock_st = mytd->td_lwp->lwp_ru.ru_oublock; 589 /* 590 * _aio_aqueue() acquires a reference to the file that is 591 * released in aio_free_entry(). 592 */ 593 if (cb->aio_lio_opcode == LIO_READ) { 594 auio.uio_rw = UIO_READ; 595 error = fo_read(fp, &auio, fp->f_cred, O_FOFFSET); 596 } else { 597 auio.uio_rw = UIO_WRITE; 598 error = fo_write(fp, &auio, fp->f_cred, O_FOFFSET); 599 } 600 inblock_end = mytd->td_lwp->lwp_ru.ru_inblock; 601 oublock_end = mytd->td_lwp->lwp_ru.ru_oublock; 602 603 aiocbe->inputcharge = inblock_end - inblock_st; 604 aiocbe->outputcharge = oublock_end - oublock_st; 605 606 if ((error) && (auio.uio_resid != cnt)) { 607 if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) 608 error = 0; 609 if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) 610 ksignal(aiocbe->userproc, SIGPIPE); 611 } 612 613 cnt -= auio.uio_resid; 614 cb->_aiocb_private.error = error; 615 cb->_aiocb_private.status = cnt; 616 } 617 618 /* 619 * The AIO daemon, most of the actual work is done in aio_process, 620 * but the setup (and address space mgmt) is done in this routine. 
621 */ 622 static void 623 aio_daemon(void *uproc, struct trapframe *frame) 624 { 625 struct aio_liojob *lj; 626 struct aiocb *cb; 627 struct aiocblist *aiocbe; 628 struct aioproclist *aiop; 629 struct kaioinfo *ki; 630 struct proc *mycp, *userp; 631 struct vmspace *curvm; 632 struct lwp *mylwp; 633 struct ucred *cr; 634 635 /* 636 * mplock not held on entry but we aren't mpsafe yet. 637 */ 638 get_mplock(); 639 640 mylwp = curthread->td_lwp; 641 mycp = mylwp->lwp_proc; 642 643 if (mycp->p_textvp) { 644 vrele(mycp->p_textvp); 645 mycp->p_textvp = NULL; 646 } 647 648 /* 649 * Allocate and ready the aio control info. There is one aiop structure 650 * per daemon. 651 */ 652 aiop = objcache_get(aiop_oc, M_WAITOK); 653 aiop->aioproc = mycp; 654 aiop->aioprocflags |= AIOP_FREE; 655 656 crit_enter(); 657 658 /* 659 * Place thread (lightweight process) onto the AIO free thread list. 660 */ 661 if (TAILQ_EMPTY(&aio_freeproc)) 662 wakeup(&aio_freeproc); 663 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); 664 665 crit_exit(); 666 667 /* Make up a name for the daemon. */ 668 strcpy(mycp->p_comm, "aiod"); 669 670 /* 671 * Get rid of our current filedescriptors. AIOD's don't need any 672 * filedescriptors, except as temporarily inherited from the client. 673 * Credentials are also cloned, and made equivalent to "root". 674 */ 675 fdfree(mycp, NULL); 676 cr = cratom(&mycp->p_ucred); 677 cr->cr_uid = 0; 678 uireplace(&cr->cr_uidinfo, uifind(0)); 679 cr->cr_ngroups = 1; 680 cr->cr_groups[0] = 1; 681 682 /* The daemon resides in its own pgrp. */ 683 enterpgrp(mycp, mycp->p_pid, 1); 684 685 /* Mark special process type. */ 686 mycp->p_flag |= P_SYSTEM | P_KTHREADP; 687 688 /* 689 * Wakeup parent process. (Parent sleeps to keep from blasting away 690 * and creating too many daemons.) 
691 */ 692 wakeup(mycp); 693 curvm = NULL; 694 695 for (;;) { 696 /* 697 * Take daemon off of free queue 698 */ 699 if (aiop->aioprocflags & AIOP_FREE) { 700 crit_enter(); 701 TAILQ_REMOVE(&aio_freeproc, aiop, list); 702 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); 703 aiop->aioprocflags &= ~AIOP_FREE; 704 crit_exit(); 705 } 706 aiop->aioprocflags &= ~AIOP_SCHED; 707 708 /* 709 * Check for jobs. 710 */ 711 while ((aiocbe = aio_selectjob(aiop)) != NULL) { 712 cb = &aiocbe->uaiocb; 713 userp = aiocbe->userproc; 714 715 aiocbe->jobstate = JOBST_JOBRUNNING; 716 717 /* 718 * Connect to process address space for user program. 719 */ 720 if (curvm != userp->p_vmspace) { 721 pmap_setlwpvm(mylwp, userp->p_vmspace); 722 if (curvm) 723 sysref_put(&curvm->vm_sysref); 724 curvm = userp->p_vmspace; 725 sysref_get(&curvm->vm_sysref); 726 } 727 728 ki = userp->p_aioinfo; 729 lj = aiocbe->lio; 730 731 /* Account for currently active jobs. */ 732 ki->kaio_active_count++; 733 734 /* Do the I/O function. */ 735 aio_process(aiocbe); 736 737 /* Decrement the active job count. */ 738 ki->kaio_active_count--; 739 740 /* 741 * Increment the completion count for wakeup/signal 742 * comparisons. 
743 */ 744 aiocbe->jobflags |= AIOCBLIST_DONE; 745 ki->kaio_queue_finished_count++; 746 if (lj) 747 lj->lioj_queue_finished_count++; 748 if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags 749 & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) { 750 ki->kaio_flags &= ~KAIO_WAKEUP; 751 wakeup(userp); 752 } 753 754 crit_enter(); 755 if (lj && (lj->lioj_flags & 756 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) { 757 if ((lj->lioj_queue_finished_count == 758 lj->lioj_queue_count) && 759 (lj->lioj_buffer_finished_count == 760 lj->lioj_buffer_count)) { 761 ksignal(userp, 762 lj->lioj_signal.sigev_signo); 763 lj->lioj_flags |= 764 LIOJ_SIGNAL_POSTED; 765 } 766 } 767 crit_exit(); 768 769 aiocbe->jobstate = JOBST_JOBFINISHED; 770 771 crit_enter(); 772 TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); 773 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, plist); 774 crit_exit(); 775 KNOTE(&aiocbe->klist, 0); 776 777 if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) { 778 wakeup(aiocbe); 779 aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN; 780 } 781 782 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) { 783 ksignal(userp, cb->aio_sigevent.sigev_signo); 784 } 785 } 786 787 /* 788 * Disconnect from user address space. 789 */ 790 if (curvm) { 791 /* swap our original address space back in */ 792 pmap_setlwpvm(mylwp, mycp->p_vmspace); 793 sysref_put(&curvm->vm_sysref); 794 curvm = NULL; 795 } 796 797 /* 798 * If we are the first to be put onto the free queue, wakeup 799 * anyone waiting for a daemon. 800 */ 801 crit_enter(); 802 TAILQ_REMOVE(&aio_activeproc, aiop, list); 803 if (TAILQ_EMPTY(&aio_freeproc)) 804 wakeup(&aio_freeproc); 805 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); 806 aiop->aioprocflags |= AIOP_FREE; 807 crit_exit(); 808 809 /* 810 * If daemon is inactive for a long time, allow it to exit, 811 * thereby freeing resources. 
812 */ 813 if (((aiop->aioprocflags & AIOP_SCHED) == 0) && tsleep(mycp, 814 0, "aiordy", aiod_lifetime)) { 815 crit_enter(); 816 if (TAILQ_EMPTY(&aio_jobs)) { 817 if ((aiop->aioprocflags & AIOP_FREE) && 818 (num_aio_procs > target_aio_procs)) { 819 TAILQ_REMOVE(&aio_freeproc, aiop, list); 820 crit_exit(); 821 objcache_put(aiop_oc, aiop); 822 num_aio_procs--; 823 #ifdef DIAGNOSTIC 824 if (mycp->p_vmspace->vm_sysref.refcnt <= 1) { 825 kprintf("AIOD: bad vm refcnt for" 826 " exiting daemon: %d\n", 827 mycp->p_vmspace->vm_sysref.refcnt); 828 } 829 #endif 830 exit1(0); 831 } 832 } 833 crit_exit(); 834 } 835 } 836 } 837 838 /* 839 * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The 840 * AIO daemon modifies its environment itself. 841 */ 842 static int 843 aio_newproc(void) 844 { 845 int error; 846 struct lwp *lp, *nlp; 847 struct proc *np; 848 849 lp = &lwp0; 850 error = fork1(lp, RFPROC|RFMEM|RFNOWAIT, &np); 851 if (error) 852 return error; 853 nlp = ONLY_LWP_IN_PROC(np); 854 cpu_set_fork_handler(nlp, aio_daemon, curproc); 855 start_forked_proc(lp, np); 856 857 /* 858 * Wait until daemon is started, but continue on just in case to 859 * handle error conditions. 860 */ 861 error = tsleep(np, 0, "aiosta", aiod_timeout); 862 num_aio_procs++; 863 864 return error; 865 } 866 867 /* 868 * Try the high-performance, low-overhead physio method for eligible 869 * VCHR devices. This method doesn't use an aio helper thread, and 870 * thus has very low overhead. 871 * 872 * Assumes that the caller, _aio_aqueue(), has incremented the file 873 * structure's reference count, preventing its deallocation for the 874 * duration of this call. 
 *
 * Return values:
 *	0	the request was handed to vn_strategy()
 *	-1	the request is not eligible for physio (not a vnode, not a
 *		disk, size not a multiple of the device's physical block
 *		size, too large for one buffer, or the per-process buffer
 *		limit is reached)
 *	>0	an errno value: either the one produced by vn_isdisk() or
 *		EFAULT when the user buffer could not be mapped
 */
static int
aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
{
	int error;
	struct aiocb *cb;
	struct file *fp;
	struct buf *bp;
	struct vnode *vp;
	struct kaioinfo *ki;
	struct aio_liojob *lj;
	int notify;

	cb = &aiocbe->uaiocb;
	fp = aiocbe->fd_file;

	if (fp->f_type != DTYPE_VNODE)
		return (-1);

	vp = (struct vnode *)fp->f_data;

	/*
	 * If its not a disk, we don't want to return a positive error.
	 * It causes the aio code to not fall through to try the thread
	 * way when you're talking to a regular file.
	 */
	if (!vn_isdisk(vp, &error)) {
		if (error == ENOTBLK)
			return (-1);
		else
			return (error);
	}

	/* Only whole physical blocks can be transferred this way. */
	if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
		return (-1);

	/* The transfer, including its page offset, must fit in MAXPHYS. */
	if (cb->aio_nbytes >
	    MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
		return (-1);

	ki = p->p_aioinfo;
	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
		return (-1);

	/* Accounting is undone at doerror if the mapping fails. */
	ki->kaio_buffer_count++;

	lj = aiocbe->lio;
	if (lj)
		lj->lioj_buffer_count++;

	/* Create and build a buffer header for a transfer. */
	bp = getpbuf_kva(NULL);
	BUF_KERNPROC(bp);

	/*
	 * Get a copy of the kva from the physical buffer.
	 */
	bp->b_bio1.bio_caller_info1.ptr = p;
	error = 0;

	bp->b_cmd = (cb->aio_lio_opcode == LIO_WRITE) ?
		    BUF_CMD_WRITE : BUF_CMD_READ;
	bp->b_bio1.bio_done = aio_physwakeup;
	bp->b_bio1.bio_flags |= BIO_SYNC;
	bp->b_bio1.bio_offset = cb->aio_offset;

	/* Bring buffer into kernel space. */
	if (vmapbuf(bp, __DEVOLATILE(char *, cb->aio_buf), cb->aio_nbytes) < 0) {
		error = EFAULT;
		goto doerror;
	}

	crit_enter();

	/* Link job and buffer together and queue the job as in-flight. */
	aiocbe->bp = bp;
	bp->b_bio1.bio_caller_info2.ptr = aiocbe;
	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
	aiocbe->jobstate = JOBST_JOBQBUF;
	cb->_aiocb_private.status = cb->aio_nbytes;
	num_buf_aio++;
	bp->b_error = 0;

	crit_exit();

	/*
	 * Perform the transfer.  vn_strategy must be used even though we
	 * know we have a device in order to deal with requests which exceed
	 * device DMA limitations.
	 */
	vn_strategy(vp, &bp->b_bio1);

	/*
	 * With the block below compiled out, notify stays 0 and the KNOTE()
	 * call after the critical section is never reached.
	 */
	notify = 0;
	crit_enter();

#if 0
	/*
	 * If we had an error invoking the request, or an error in processing
	 * the request before we have returned, we process it as an error in
	 * transfer.  Note that such an I/O error is not indicated immediately,
	 * but is returned using the aio_error mechanism.  In this case,
	 * aio_suspend will return immediately.
	 */
	if (bp->b_error || (bp->b_flags & B_ERROR)) {
		struct aiocb *job = aiocbe->uuaiocb;

		aiocbe->uaiocb._aiocb_private.status = 0;
		suword(&job->_aiocb_private.status, 0);
		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
		suword(&job->_aiocb_private.error, bp->b_error);

		ki->kaio_buffer_finished_count++;

		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
			aiocbe->jobstate = JOBST_JOBBFINISHED;
			aiocbe->jobflags |= AIOCBLIST_DONE;
			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
			notify = 1;
		}
	}
#endif
	crit_exit();
	if (notify)
		KNOTE(&aiocbe->klist, 0);
	return 0;

doerror:
	/* Undo the accounting and give the unused buffer back. */
	ki->kaio_buffer_count--;
	if (lj)
		lj->lioj_buffer_count--;
	aiocbe->bp = NULL;
	relpbuf(bp, NULL);
	return error;
}

/*
 * This waits/tests physio completion.
 *
 * Waits up to aiod_timeout for the job's buffer I/O.  If the wait
 * reports EWOULDBLOCK, EINPROGRESS is returned and the buffer stays
 * attached to the job.  Otherwise the buffer is unmapped, detached
 * from the job and released, and the buffer's b_error is returned if
 * B_ERROR is set (0 if it is not).
 */
static int
aio_fphysio(struct aiocblist *iocb)
{
	struct buf *bp;
	int error;

	bp = iocb->bp;

	error = biowait_timeout(&bp->b_bio1, "physstr", aiod_timeout);
	if (error == EWOULDBLOCK)
		return EINPROGRESS;

	/* Release mapping into kernel space. */
	vunmapbuf(bp);
	iocb->bp = 0;

	/* Any other result of the wait itself is discarded here. */
	error = 0;

	/* Check for an error. */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	relpbuf(bp, NULL);
	return (error);
}
#endif /* VFS_AIO */

/*
 * Wake up aio requests that may be serviceable now.
1044 */ 1045 void 1046 aio_swake(struct socket *so, struct signalsockbuf *ssb) 1047 { 1048 #ifndef VFS_AIO 1049 return; 1050 #else 1051 struct aiocblist *cb,*cbn; 1052 struct proc *p; 1053 struct kaioinfo *ki = NULL; 1054 int opcode, wakecount = 0; 1055 struct aioproclist *aiop; 1056 1057 if (ssb == &so->so_snd) { 1058 opcode = LIO_WRITE; 1059 atomic_clear_int(&so->so_snd.ssb_flags, SSB_AIO); 1060 } else { 1061 opcode = LIO_READ; 1062 atomic_clear_int(&so->so_rcv.ssb_flags, SSB_AIO); 1063 } 1064 1065 for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) { 1066 cbn = TAILQ_NEXT(cb, list); 1067 if (opcode == cb->uaiocb.aio_lio_opcode) { 1068 p = cb->userproc; 1069 ki = p->p_aioinfo; 1070 TAILQ_REMOVE(&so->so_aiojobq, cb, list); 1071 TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist); 1072 TAILQ_INSERT_TAIL(&aio_jobs, cb, list); 1073 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist); 1074 wakecount++; 1075 if (cb->jobstate != JOBST_JOBQGLOBAL) 1076 panic("invalid queue value"); 1077 } 1078 } 1079 1080 while (wakecount--) { 1081 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) { 1082 TAILQ_REMOVE(&aio_freeproc, aiop, list); 1083 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); 1084 aiop->aioprocflags &= ~AIOP_FREE; 1085 wakeup(aiop->aioproc); 1086 } 1087 } 1088 #endif /* VFS_AIO */ 1089 } 1090 1091 #ifdef VFS_AIO 1092 /* 1093 * Queue a new AIO request. Choosing either the threaded or direct physio VCHR 1094 * technique is done in this code. 
1095 */ 1096 static int 1097 _aio_aqueue(struct aiocb *job, struct aio_liojob *lj, int type) 1098 { 1099 struct proc *p = curproc; 1100 struct file *fp; 1101 unsigned int fd; 1102 struct socket *so; 1103 int error; 1104 int opcode, user_opcode; 1105 struct aiocblist *aiocbe; 1106 struct aioproclist *aiop; 1107 struct kaioinfo *ki; 1108 struct kevent kev; 1109 struct kqueue *kq; 1110 struct file *kq_fp; 1111 int fflags; 1112 1113 if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL) 1114 TAILQ_REMOVE(&aio_freejobs, aiocbe, list); 1115 else 1116 aiocbe = objcache_get(aiocb_oc, M_WAITOK); 1117 1118 aiocbe->inputcharge = 0; 1119 aiocbe->outputcharge = 0; 1120 callout_init(&aiocbe->timeout); 1121 SLIST_INIT(&aiocbe->klist); 1122 1123 suword(&job->_aiocb_private.status, -1); 1124 suword(&job->_aiocb_private.error, 0); 1125 suword(&job->_aiocb_private.kernelinfo, -1); 1126 1127 error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb)); 1128 if (error) { 1129 suword(&job->_aiocb_private.error, error); 1130 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1131 return error; 1132 } 1133 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL && 1134 !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) { 1135 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1136 return EINVAL; 1137 } 1138 1139 /* Save userspace address of the job info. */ 1140 aiocbe->uuaiocb = job; 1141 1142 /* Get the opcode. */ 1143 user_opcode = aiocbe->uaiocb.aio_lio_opcode; 1144 if (type != LIO_NOP) 1145 aiocbe->uaiocb.aio_lio_opcode = type; 1146 opcode = aiocbe->uaiocb.aio_lio_opcode; 1147 1148 /* 1149 * Range check file descriptor. 1150 */ 1151 fflags = (opcode == LIO_WRITE) ? 
FWRITE : FREAD; 1152 fd = aiocbe->uaiocb.aio_fildes; 1153 fp = holdfp(p->p_fd, fd, fflags); 1154 if (fp == NULL) { 1155 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1156 if (type == 0) 1157 suword(&job->_aiocb_private.error, EBADF); 1158 return EBADF; 1159 } 1160 1161 aiocbe->fd_file = fp; 1162 1163 if (aiocbe->uaiocb.aio_offset == -1LL) { 1164 error = EINVAL; 1165 goto aqueue_fail; 1166 } 1167 error = suword(&job->_aiocb_private.kernelinfo, jobrefid); 1168 if (error) { 1169 error = EINVAL; 1170 goto aqueue_fail; 1171 } 1172 aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid; 1173 if (jobrefid == LONG_MAX) 1174 jobrefid = 1; 1175 else 1176 jobrefid++; 1177 1178 if (opcode == LIO_NOP) { 1179 fdrop(fp); 1180 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1181 if (type == 0) { 1182 suword(&job->_aiocb_private.error, 0); 1183 suword(&job->_aiocb_private.status, 0); 1184 suword(&job->_aiocb_private.kernelinfo, 0); 1185 } 1186 return 0; 1187 } 1188 if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) { 1189 if (type == 0) 1190 suword(&job->_aiocb_private.status, 0); 1191 error = EINVAL; 1192 goto aqueue_fail; 1193 } 1194 1195 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) { 1196 kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue; 1197 kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr; 1198 } 1199 else { 1200 /* 1201 * This method for requesting kevent-based notification won't 1202 * work on the alpha, since we're passing in a pointer 1203 * via aio_lio_opcode, which is an int. Use the SIGEV_KEVENT- 1204 * based method instead. 
1205 */ 1206 if (user_opcode == LIO_NOP || user_opcode == LIO_READ || 1207 user_opcode == LIO_WRITE) 1208 goto no_kqueue; 1209 1210 error = copyin((struct kevent *)(uintptr_t)user_opcode, 1211 &kev, sizeof(kev)); 1212 if (error) 1213 goto aqueue_fail; 1214 } 1215 kq_fp = holdfp(p->p_fd, (int)kev.ident, -1); 1216 if (kq_fp == NULL || kq_fp->f_type != DTYPE_KQUEUE) { 1217 if (kq_fp) { 1218 fdrop(kq_fp); 1219 kq_fp = NULL; 1220 } 1221 error = EBADF; 1222 goto aqueue_fail; 1223 } 1224 kq = (struct kqueue *)kq_fp->f_data; 1225 kev.ident = (uintptr_t)aiocbe->uuaiocb; 1226 kev.filter = EVFILT_AIO; 1227 kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; 1228 kev.data = (intptr_t)aiocbe; 1229 error = kqueue_register(kq, &kev); 1230 fdrop(kq_fp); 1231 aqueue_fail: 1232 if (error) { 1233 fdrop(fp); 1234 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1235 if (type == 0) 1236 suword(&job->_aiocb_private.error, error); 1237 goto done; 1238 } 1239 no_kqueue: 1240 1241 suword(&job->_aiocb_private.error, EINPROGRESS); 1242 aiocbe->uaiocb._aiocb_private.error = EINPROGRESS; 1243 aiocbe->userproc = p; 1244 aiocbe->jobflags = 0; 1245 aiocbe->lio = lj; 1246 ki = p->p_aioinfo; 1247 1248 if (fp->f_type == DTYPE_SOCKET) { 1249 /* 1250 * Alternate queueing for socket ops: Reach down into the 1251 * descriptor to get the socket data. Then check to see if the 1252 * socket is ready to be read or written (based on the requested 1253 * operation). 1254 * 1255 * If it is not ready for io, then queue the aiocbe on the 1256 * socket, and set the flags so we get a call when ssb_notify() 1257 * happens. 
1258 */ 1259 so = (struct socket *)fp->f_data; 1260 crit_enter(); 1261 if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode == 1262 LIO_WRITE) && (!sowriteable(so)))) { 1263 TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list); 1264 TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist); 1265 if (opcode == LIO_READ) 1266 atomic_set_int(&so->so_rcv.ssb_flags, SSB_AIO); 1267 else 1268 atomic_set_int(&so->so_snd.ssb_flags, SSB_AIO); 1269 aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */ 1270 ki->kaio_queue_count++; 1271 num_queue_count++; 1272 crit_exit(); 1273 error = 0; 1274 goto done; 1275 } 1276 crit_exit(); 1277 } 1278 1279 if ((error = aio_qphysio(p, aiocbe)) == 0) 1280 goto done; 1281 if (error > 0) { 1282 suword(&job->_aiocb_private.status, 0); 1283 aiocbe->uaiocb._aiocb_private.error = error; 1284 suword(&job->_aiocb_private.error, error); 1285 goto done; 1286 } 1287 1288 /* No buffer for daemon I/O. */ 1289 aiocbe->bp = NULL; 1290 1291 ki->kaio_queue_count++; 1292 if (lj) 1293 lj->lioj_queue_count++; 1294 crit_enter(); 1295 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); 1296 TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list); 1297 crit_exit(); 1298 aiocbe->jobstate = JOBST_JOBQGLOBAL; 1299 1300 num_queue_count++; 1301 error = 0; 1302 1303 /* 1304 * If we don't have a free AIO process, and we are below our quota, then 1305 * start one. Otherwise, depend on the subsequent I/O completions to 1306 * pick-up this job. If we don't successfully create the new process 1307 * (thread) due to resource issues, we return an error for now (EAGAIN), 1308 * which is likely not the correct thing to do. 
1309 */ 1310 crit_enter(); 1311 retryproc: 1312 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { 1313 TAILQ_REMOVE(&aio_freeproc, aiop, list); 1314 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); 1315 aiop->aioprocflags &= ~AIOP_FREE; 1316 wakeup(aiop->aioproc); 1317 } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && 1318 ((ki->kaio_active_count + num_aio_resv_start) < 1319 ki->kaio_maxactive_count)) { 1320 num_aio_resv_start++; 1321 if ((error = aio_newproc()) == 0) { 1322 num_aio_resv_start--; 1323 goto retryproc; 1324 } 1325 num_aio_resv_start--; 1326 } 1327 crit_exit(); 1328 done: 1329 return error; 1330 } 1331 1332 /* 1333 * This routine queues an AIO request, checking for quotas. 1334 */ 1335 static int 1336 aio_aqueue(struct aiocb *job, int type) 1337 { 1338 struct proc *p = curproc; 1339 struct kaioinfo *ki; 1340 1341 if (p->p_aioinfo == NULL) 1342 aio_init_aioinfo(p); 1343 1344 if (num_queue_count >= max_queue_count) 1345 return EAGAIN; 1346 1347 ki = p->p_aioinfo; 1348 if (ki->kaio_queue_count >= ki->kaio_qallowed_count) 1349 return EAGAIN; 1350 1351 return _aio_aqueue(job, NULL, type); 1352 } 1353 #endif /* VFS_AIO */ 1354 1355 /* 1356 * Support the aio_return system call, as a side-effect, kernel resources are 1357 * released. 
1358 * 1359 * MPALMOSTSAFE 1360 */ 1361 int 1362 sys_aio_return(struct aio_return_args *uap) 1363 { 1364 #ifndef VFS_AIO 1365 return (ENOSYS); 1366 #else 1367 struct proc *p = curproc; 1368 struct lwp *lp = curthread->td_lwp; 1369 long jobref; 1370 struct aiocblist *cb, *ncb; 1371 struct aiocb *ujob; 1372 struct kaioinfo *ki; 1373 int error; 1374 1375 ki = p->p_aioinfo; 1376 if (ki == NULL) 1377 return EINVAL; 1378 1379 ujob = uap->aiocbp; 1380 1381 jobref = fuword(&ujob->_aiocb_private.kernelinfo); 1382 if (jobref == -1 || jobref == 0) 1383 return EINVAL; 1384 1385 get_mplock(); 1386 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { 1387 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == 1388 jobref) { 1389 if (ujob == cb->uuaiocb) { 1390 uap->sysmsg_result = 1391 cb->uaiocb._aiocb_private.status; 1392 } else { 1393 uap->sysmsg_result = EFAULT; 1394 } 1395 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { 1396 lp->lwp_ru.ru_oublock += cb->outputcharge; 1397 cb->outputcharge = 0; 1398 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { 1399 lp->lwp_ru.ru_inblock += cb->inputcharge; 1400 cb->inputcharge = 0; 1401 } 1402 aio_free_entry(cb); 1403 error = 0; 1404 goto done; 1405 } 1406 } 1407 crit_enter(); 1408 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) { 1409 ncb = TAILQ_NEXT(cb, plist); 1410 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) 1411 == jobref) { 1412 crit_exit(); 1413 if (ujob == cb->uuaiocb) { 1414 uap->sysmsg_result = 1415 cb->uaiocb._aiocb_private.status; 1416 } else { 1417 uap->sysmsg_result = EFAULT; 1418 } 1419 aio_free_entry(cb); 1420 error = 0; 1421 goto done; 1422 } 1423 } 1424 crit_exit(); 1425 error = EINVAL; 1426 done: 1427 rel_mplock(); 1428 return (error); 1429 #endif /* VFS_AIO */ 1430 } 1431 1432 /* 1433 * Allow a process to wakeup when any of the I/O requests are completed. 
1434 * 1435 * MPALMOSTSAFE 1436 */ 1437 int 1438 sys_aio_suspend(struct aio_suspend_args *uap) 1439 { 1440 #ifndef VFS_AIO 1441 return ENOSYS; 1442 #else 1443 struct proc *p = curproc; 1444 struct timeval atv; 1445 struct timespec ts; 1446 struct aiocb *const *cbptr, *cbp; 1447 struct kaioinfo *ki; 1448 struct aiocblist *cb; 1449 int i; 1450 int njoblist; 1451 int error, timo; 1452 long *ijoblist; 1453 struct aiocb **ujoblist; 1454 1455 if ((u_int)uap->nent > AIO_LISTIO_MAX) 1456 return EINVAL; 1457 1458 timo = 0; 1459 if (uap->timeout) { 1460 /* Get timespec struct. */ 1461 if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) 1462 return error; 1463 1464 if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) 1465 return (EINVAL); 1466 1467 TIMESPEC_TO_TIMEVAL(&atv, &ts); 1468 if (itimerfix(&atv)) 1469 return (EINVAL); 1470 timo = tvtohz_high(&atv); 1471 } 1472 1473 ki = p->p_aioinfo; 1474 if (ki == NULL) 1475 return EAGAIN; 1476 1477 get_mplock(); 1478 1479 njoblist = 0; 1480 ijoblist = objcache_get(aiol_oc, M_WAITOK); 1481 ujoblist = objcache_get(aiol_oc, M_WAITOK); 1482 cbptr = uap->aiocbp; 1483 1484 for (i = 0; i < uap->nent; i++) { 1485 cbp = (struct aiocb *)(intptr_t)fuword(&cbptr[i]); 1486 if (cbp == 0) 1487 continue; 1488 ujoblist[njoblist] = cbp; 1489 ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo); 1490 njoblist++; 1491 } 1492 1493 if (njoblist == 0) { 1494 objcache_put(aiol_oc, ijoblist); 1495 objcache_put(aiol_oc, ujoblist); 1496 error = 0; 1497 goto done; 1498 } 1499 1500 error = 0; 1501 for (;;) { 1502 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { 1503 for (i = 0; i < njoblist; i++) { 1504 if (((intptr_t) 1505 cb->uaiocb._aiocb_private.kernelinfo) == 1506 ijoblist[i]) { 1507 if (ujoblist[i] != cb->uuaiocb) 1508 error = EINVAL; 1509 objcache_put(aiol_oc, ijoblist); 1510 objcache_put(aiol_oc, ujoblist); 1511 goto done; 1512 } 1513 } 1514 } 1515 1516 crit_enter(); 1517 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = 1518 TAILQ_NEXT(cb, 
plist)) { 1519 for (i = 0; i < njoblist; i++) { 1520 if (((intptr_t) 1521 cb->uaiocb._aiocb_private.kernelinfo) == 1522 ijoblist[i]) { 1523 crit_exit(); 1524 if (ujoblist[i] != cb->uuaiocb) 1525 error = EINVAL; 1526 objcache_put(aiol_oc, ijoblist); 1527 objcache_put(aiol_oc, ujoblist); 1528 goto done; 1529 } 1530 } 1531 } 1532 1533 ki->kaio_flags |= KAIO_WAKEUP; 1534 error = tsleep(p, PCATCH, "aiospn", timo); 1535 crit_exit(); 1536 1537 if (error == ERESTART || error == EINTR) { 1538 objcache_put(aiol_oc, ijoblist); 1539 objcache_put(aiol_oc, ujoblist); 1540 error = EINTR; 1541 goto done; 1542 } else if (error == EWOULDBLOCK) { 1543 objcache_put(aiol_oc, ijoblist); 1544 objcache_put(aiol_oc, ujoblist); 1545 error = EAGAIN; 1546 goto done; 1547 } 1548 } 1549 1550 /* NOTREACHED */ 1551 error = EINVAL; 1552 done: 1553 rel_mplock(); 1554 return (error); 1555 #endif /* VFS_AIO */ 1556 } 1557 1558 /* 1559 * aio_cancel cancels any non-physio aio operations not currently in 1560 * progress. 1561 * 1562 * MPALMOSTSAFE 1563 */ 1564 int 1565 sys_aio_cancel(struct aio_cancel_args *uap) 1566 { 1567 #ifndef VFS_AIO 1568 return ENOSYS; 1569 #else 1570 struct proc *p = curproc; 1571 struct kaioinfo *ki; 1572 struct aiocblist *cbe, *cbn; 1573 struct file *fp; 1574 struct socket *so; 1575 struct proc *po; 1576 int error; 1577 int cancelled=0; 1578 int notcancelled=0; 1579 struct vnode *vp; 1580 1581 fp = holdfp(p->p_fd, uap->fd, -1); 1582 if (fp == NULL) 1583 return (EBADF); 1584 1585 get_mplock(); 1586 1587 if (fp->f_type == DTYPE_VNODE) { 1588 vp = (struct vnode *)fp->f_data; 1589 1590 if (vn_isdisk(vp,&error)) { 1591 uap->sysmsg_result = AIO_NOTCANCELED; 1592 error = 0; 1593 goto done2; 1594 } 1595 } else if (fp->f_type == DTYPE_SOCKET) { 1596 so = (struct socket *)fp->f_data; 1597 1598 crit_enter(); 1599 1600 for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) { 1601 cbn = TAILQ_NEXT(cbe, list); 1602 if ((uap->aiocbp == NULL) || 1603 (uap->aiocbp == cbe->uuaiocb) ) { 1604 
po = cbe->userproc; 1605 ki = po->p_aioinfo; 1606 TAILQ_REMOVE(&so->so_aiojobq, cbe, list); 1607 TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist); 1608 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist); 1609 if (ki->kaio_flags & KAIO_WAKEUP) { 1610 wakeup(po); 1611 } 1612 cbe->jobstate = JOBST_JOBFINISHED; 1613 cbe->uaiocb._aiocb_private.status=-1; 1614 cbe->uaiocb._aiocb_private.error=ECANCELED; 1615 cancelled++; 1616 /* XXX cancelled, knote? */ 1617 if (cbe->uaiocb.aio_sigevent.sigev_notify == 1618 SIGEV_SIGNAL) 1619 ksignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo); 1620 if (uap->aiocbp) 1621 break; 1622 } 1623 } 1624 crit_exit(); 1625 1626 if ((cancelled) && (uap->aiocbp)) { 1627 uap->sysmsg_result = AIO_CANCELED; 1628 error = 0; 1629 goto done2; 1630 } 1631 } 1632 ki=p->p_aioinfo; 1633 if (ki == NULL) 1634 goto done; 1635 crit_enter(); 1636 1637 for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) { 1638 cbn = TAILQ_NEXT(cbe, plist); 1639 1640 if ((uap->fd == cbe->uaiocb.aio_fildes) && 1641 ((uap->aiocbp == NULL ) || 1642 (uap->aiocbp == cbe->uuaiocb))) { 1643 1644 if (cbe->jobstate == JOBST_JOBQGLOBAL) { 1645 TAILQ_REMOVE(&aio_jobs, cbe, list); 1646 TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist); 1647 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, 1648 plist); 1649 cancelled++; 1650 ki->kaio_queue_finished_count++; 1651 cbe->jobstate = JOBST_JOBFINISHED; 1652 cbe->uaiocb._aiocb_private.status = -1; 1653 cbe->uaiocb._aiocb_private.error = ECANCELED; 1654 /* XXX cancelled, knote? 
*/ 1655 if (cbe->uaiocb.aio_sigevent.sigev_notify == 1656 SIGEV_SIGNAL) 1657 ksignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo); 1658 } else { 1659 notcancelled++; 1660 } 1661 } 1662 } 1663 crit_exit(); 1664 done: 1665 if (notcancelled) 1666 uap->sysmsg_result = AIO_NOTCANCELED; 1667 else if (cancelled) 1668 uap->sysmsg_result = AIO_CANCELED; 1669 else 1670 uap->sysmsg_result = AIO_ALLDONE; 1671 error = 0; 1672 done2: 1673 rel_mplock(); 1674 fdrop(fp); 1675 return error; 1676 #endif /* VFS_AIO */ 1677 } 1678 1679 /* 1680 * aio_error is implemented in the kernel level for compatibility purposes only. 1681 * For a user mode async implementation, it would be best to do it in a userland 1682 * subroutine. 1683 * 1684 * MPALMOSTSAFE 1685 */ 1686 int 1687 sys_aio_error(struct aio_error_args *uap) 1688 { 1689 #ifndef VFS_AIO 1690 return ENOSYS; 1691 #else 1692 struct proc *p = curproc; 1693 struct aiocblist *cb; 1694 struct kaioinfo *ki; 1695 long jobref; 1696 int error; 1697 1698 ki = p->p_aioinfo; 1699 if (ki == NULL) 1700 return EINVAL; 1701 1702 jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo); 1703 if ((jobref == -1) || (jobref == 0)) 1704 return EINVAL; 1705 1706 get_mplock(); 1707 error = 0; 1708 1709 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { 1710 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1711 jobref) { 1712 uap->sysmsg_result = cb->uaiocb._aiocb_private.error; 1713 goto done; 1714 } 1715 } 1716 1717 crit_enter(); 1718 1719 for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb, 1720 plist)) { 1721 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1722 jobref) { 1723 uap->sysmsg_result = EINPROGRESS; 1724 crit_exit(); 1725 goto done; 1726 } 1727 } 1728 1729 for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb, 1730 plist)) { 1731 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1732 jobref) { 1733 uap->sysmsg_result = EINPROGRESS; 1734 crit_exit(); 1735 goto done; 1736 } 1737 } 1738 
crit_exit(); 1739 1740 crit_enter(); 1741 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb, 1742 plist)) { 1743 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1744 jobref) { 1745 uap->sysmsg_result = cb->uaiocb._aiocb_private.error; 1746 crit_exit(); 1747 goto done; 1748 } 1749 } 1750 1751 for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb, 1752 plist)) { 1753 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) == 1754 jobref) { 1755 uap->sysmsg_result = EINPROGRESS; 1756 crit_exit(); 1757 goto done; 1758 } 1759 } 1760 crit_exit(); 1761 error = EINVAL; 1762 done: 1763 rel_mplock(); 1764 return (error); 1765 #endif /* VFS_AIO */ 1766 } 1767 1768 /* 1769 * syscall - asynchronous read from a file (REALTIME) 1770 * 1771 * MPALMOSTSAFE 1772 */ 1773 int 1774 sys_aio_read(struct aio_read_args *uap) 1775 { 1776 #ifndef VFS_AIO 1777 return ENOSYS; 1778 #else 1779 int error; 1780 1781 get_mplock(); 1782 error = aio_aqueue(uap->aiocbp, LIO_READ); 1783 rel_mplock(); 1784 return (error); 1785 #endif /* VFS_AIO */ 1786 } 1787 1788 /* 1789 * syscall - asynchronous write to a file (REALTIME) 1790 * 1791 * MPALMOSTSAFE 1792 */ 1793 int 1794 sys_aio_write(struct aio_write_args *uap) 1795 { 1796 #ifndef VFS_AIO 1797 return ENOSYS; 1798 #else 1799 int error; 1800 1801 get_mplock(); 1802 error = aio_aqueue(uap->aiocbp, LIO_WRITE); 1803 rel_mplock(); 1804 return (error); 1805 #endif /* VFS_AIO */ 1806 } 1807 1808 /* 1809 * syscall - XXX undocumented 1810 * 1811 * MPALMOSTSAFE 1812 */ 1813 int 1814 sys_lio_listio(struct lio_listio_args *uap) 1815 { 1816 #ifndef VFS_AIO 1817 return ENOSYS; 1818 #else 1819 struct proc *p = curproc; 1820 struct lwp *lp = curthread->td_lwp; 1821 int nent, nentqueued; 1822 struct aiocb *iocb, * const *cbptr; 1823 struct aiocblist *cb; 1824 struct kaioinfo *ki; 1825 struct aio_liojob *lj; 1826 int error, runningcode; 1827 int nerror; 1828 int i; 1829 1830 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) 1831 
return EINVAL; 1832 1833 nent = uap->nent; 1834 if (nent > AIO_LISTIO_MAX) 1835 return EINVAL; 1836 1837 get_mplock(); 1838 1839 if (p->p_aioinfo == NULL) 1840 aio_init_aioinfo(p); 1841 1842 if ((nent + num_queue_count) > max_queue_count) { 1843 error = EAGAIN; 1844 goto done; 1845 } 1846 1847 ki = p->p_aioinfo; 1848 if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) { 1849 error = EAGAIN; 1850 goto done; 1851 } 1852 1853 lj = objcache_get(aiolio_oc, M_WAITOK); 1854 if (lj == NULL) { 1855 error = EAGAIN; 1856 goto done; 1857 } 1858 1859 lj->lioj_flags = 0; 1860 lj->lioj_buffer_count = 0; 1861 lj->lioj_buffer_finished_count = 0; 1862 lj->lioj_queue_count = 0; 1863 lj->lioj_queue_finished_count = 0; 1864 lj->lioj_ki = ki; 1865 1866 /* 1867 * Setup signal. 1868 */ 1869 if (uap->sig && (uap->mode == LIO_NOWAIT)) { 1870 error = copyin(uap->sig, &lj->lioj_signal, 1871 sizeof(lj->lioj_signal)); 1872 if (error) { 1873 objcache_put(aiolio_oc, lj); 1874 goto done; 1875 } 1876 if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { 1877 objcache_put(aiolio_oc, lj); 1878 error = EINVAL; 1879 goto done; 1880 } 1881 lj->lioj_flags |= LIOJ_SIGNAL; 1882 lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED; 1883 } else 1884 lj->lioj_flags &= ~LIOJ_SIGNAL; 1885 1886 TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); 1887 /* 1888 * Get pointers to the list of I/O requests. 1889 */ 1890 nerror = 0; 1891 nentqueued = 0; 1892 cbptr = uap->acb_list; 1893 for (i = 0; i < uap->nent; i++) { 1894 iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]); 1895 if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) { 1896 error = _aio_aqueue(iocb, lj, 0); 1897 if (error == 0) 1898 nentqueued++; 1899 else 1900 nerror++; 1901 } 1902 } 1903 1904 /* 1905 * If we haven't queued any, then just return error. 1906 */ 1907 if (nentqueued == 0) { 1908 error = 0; 1909 goto done; 1910 } 1911 1912 /* 1913 * Calculate the appropriate error return. 
1914 */ 1915 runningcode = 0; 1916 if (nerror) 1917 runningcode = EIO; 1918 1919 if (uap->mode == LIO_WAIT) { 1920 int command, found, jobref; 1921 1922 for (;;) { 1923 found = 0; 1924 for (i = 0; i < uap->nent; i++) { 1925 /* 1926 * Fetch address of the control buf pointer in 1927 * user space. 1928 */ 1929 iocb = (struct aiocb *) 1930 (intptr_t)fuword(&cbptr[i]); 1931 if (((intptr_t)iocb == -1) || ((intptr_t)iocb 1932 == 0)) 1933 continue; 1934 1935 /* 1936 * Fetch the associated command from user space. 1937 */ 1938 command = fuword(&iocb->aio_lio_opcode); 1939 if (command == LIO_NOP) { 1940 found++; 1941 continue; 1942 } 1943 1944 jobref = fuword(&iocb->_aiocb_private.kernelinfo); 1945 1946 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { 1947 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) 1948 == jobref) { 1949 if (cb->uaiocb.aio_lio_opcode 1950 == LIO_WRITE) { 1951 lp->lwp_ru.ru_oublock += 1952 cb->outputcharge; 1953 cb->outputcharge = 0; 1954 } else if (cb->uaiocb.aio_lio_opcode 1955 == LIO_READ) { 1956 lp->lwp_ru.ru_inblock += 1957 cb->inputcharge; 1958 cb->inputcharge = 0; 1959 } 1960 found++; 1961 break; 1962 } 1963 } 1964 1965 crit_enter(); 1966 TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) { 1967 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) 1968 == jobref) { 1969 found++; 1970 break; 1971 } 1972 } 1973 crit_exit(); 1974 } 1975 1976 /* 1977 * If all I/Os have been disposed of, then we can 1978 * return. 1979 */ 1980 if (found == nentqueued) { 1981 error = runningcode; 1982 goto done; 1983 } 1984 1985 ki->kaio_flags |= KAIO_WAKEUP; 1986 error = tsleep(p, PCATCH, "aiospn", 0); 1987 1988 if (error == EINTR) { 1989 goto done; 1990 } else if (error == EWOULDBLOCK) { 1991 error = EAGAIN; 1992 goto done; 1993 } 1994 } 1995 } 1996 1997 error = runningcode; 1998 done: 1999 rel_mplock(); 2000 return (error); 2001 #endif /* VFS_AIO */ 2002 } 2003 2004 #ifdef VFS_AIO 2005 /* 2006 * This is a weird hack so that we can post a signal. 
It is safe to do so from 2007 * a timeout routine, but *not* from an interrupt routine. 2008 */ 2009 static void 2010 process_signal(void *aioj) 2011 { 2012 struct aiocblist *aiocbe = aioj; 2013 struct aio_liojob *lj = aiocbe->lio; 2014 struct aiocb *cb = &aiocbe->uaiocb; 2015 2016 if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) && 2017 (lj->lioj_queue_count == lj->lioj_queue_finished_count)) { 2018 ksignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo); 2019 lj->lioj_flags |= LIOJ_SIGNAL_POSTED; 2020 } 2021 2022 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) 2023 ksignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo); 2024 } 2025 2026 /* 2027 * Interrupt handler for physio, performs the necessary process wakeups, and 2028 * signals. 2029 */ 2030 static void 2031 aio_physwakeup(struct bio *bio) 2032 { 2033 struct buf *bp = bio->bio_buf; 2034 struct aiocblist *aiocbe; 2035 struct proc *p; 2036 struct kaioinfo *ki; 2037 struct aio_liojob *lj; 2038 2039 aiocbe = bio->bio_caller_info2.ptr; 2040 get_mplock(); 2041 2042 if (aiocbe) { 2043 p = bio->bio_caller_info1.ptr; 2044 2045 aiocbe->jobstate = JOBST_JOBBFINISHED; 2046 aiocbe->uaiocb._aiocb_private.status -= bp->b_resid; 2047 aiocbe->uaiocb._aiocb_private.error = 0; 2048 aiocbe->jobflags |= AIOCBLIST_DONE; 2049 2050 if (bp->b_flags & B_ERROR) 2051 aiocbe->uaiocb._aiocb_private.error = bp->b_error; 2052 2053 lj = aiocbe->lio; 2054 if (lj) { 2055 lj->lioj_buffer_finished_count++; 2056 2057 /* 2058 * wakeup/signal if all of the interrupt jobs are done. 2059 */ 2060 if (lj->lioj_buffer_finished_count == 2061 lj->lioj_buffer_count) { 2062 /* 2063 * Post a signal if it is called for. 
2064 */ 2065 if ((lj->lioj_flags & 2066 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == 2067 LIOJ_SIGNAL) { 2068 lj->lioj_flags |= LIOJ_SIGNAL_POSTED; 2069 callout_reset(&aiocbe->timeout, 0, 2070 process_signal, aiocbe); 2071 } 2072 } 2073 } 2074 2075 ki = p->p_aioinfo; 2076 if (ki) { 2077 ki->kaio_buffer_finished_count++; 2078 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); 2079 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); 2080 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); 2081 2082 KNOTE(&aiocbe->klist, 0); 2083 /* Do the wakeup. */ 2084 if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) { 2085 ki->kaio_flags &= ~KAIO_WAKEUP; 2086 wakeup(p); 2087 } 2088 } 2089 2090 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) { 2091 callout_reset(&aiocbe->timeout, 0, 2092 process_signal, aiocbe); 2093 } 2094 } 2095 biodone_sync(bio); 2096 rel_mplock(); 2097 } 2098 #endif /* VFS_AIO */ 2099 2100 /* 2101 * syscall - wait for the next completion of an aio request 2102 * 2103 * MPALMOSTSAFE 2104 */ 2105 int 2106 sys_aio_waitcomplete(struct aio_waitcomplete_args *uap) 2107 { 2108 #ifndef VFS_AIO 2109 return ENOSYS; 2110 #else 2111 struct proc *p = curproc; 2112 struct lwp *lp = curthread->td_lwp; 2113 struct timeval atv; 2114 struct timespec ts; 2115 struct kaioinfo *ki; 2116 struct aiocblist *cb = NULL; 2117 int error, timo; 2118 2119 suword(uap->aiocbp, (int)NULL); 2120 2121 timo = 0; 2122 if (uap->timeout) { 2123 /* Get timespec struct. 
*/ 2124 error = copyin(uap->timeout, &ts, sizeof(ts)); 2125 if (error) 2126 return error; 2127 2128 if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000)) 2129 return (EINVAL); 2130 2131 TIMESPEC_TO_TIMEVAL(&atv, &ts); 2132 if (itimerfix(&atv)) 2133 return (EINVAL); 2134 timo = tvtohz_high(&atv); 2135 } 2136 2137 ki = p->p_aioinfo; 2138 if (ki == NULL) 2139 return EAGAIN; 2140 2141 get_mplock(); 2142 2143 for (;;) { 2144 if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) { 2145 suword(uap->aiocbp, (uintptr_t)cb->uuaiocb); 2146 uap->sysmsg_result = cb->uaiocb._aiocb_private.status; 2147 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { 2148 lp->lwp_ru.ru_oublock += 2149 cb->outputcharge; 2150 cb->outputcharge = 0; 2151 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { 2152 lp->lwp_ru.ru_inblock += cb->inputcharge; 2153 cb->inputcharge = 0; 2154 } 2155 aio_free_entry(cb); 2156 error = cb->uaiocb._aiocb_private.error; 2157 break; 2158 } 2159 2160 crit_enter(); 2161 if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) { 2162 crit_exit(); 2163 suword(uap->aiocbp, (uintptr_t)cb->uuaiocb); 2164 uap->sysmsg_result = cb->uaiocb._aiocb_private.status; 2165 aio_free_entry(cb); 2166 error = cb->uaiocb._aiocb_private.error; 2167 break; 2168 } 2169 2170 ki->kaio_flags |= KAIO_WAKEUP; 2171 error = tsleep(p, PCATCH, "aiowc", timo); 2172 crit_exit(); 2173 2174 if (error == ERESTART) { 2175 error = EINTR; 2176 break; 2177 } 2178 if (error < 0) 2179 break; 2180 if (error == EINTR) 2181 break; 2182 if (error == EWOULDBLOCK) { 2183 error = EAGAIN; 2184 break; 2185 } 2186 } 2187 rel_mplock(); 2188 return (error); 2189 #endif /* VFS_AIO */ 2190 } 2191 2192 #ifndef VFS_AIO 2193 static int 2194 filt_aioattach(struct knote *kn) 2195 { 2196 2197 return (ENXIO); 2198 } 2199 2200 struct filterops aio_filtops = 2201 { 0, filt_aioattach, NULL, NULL }; 2202 2203 #else 2204 /* kqueue attach function */ 2205 static int 2206 filt_aioattach(struct knote *kn) 2207 { 2208 struct aiocblist *aiocbe = (struct 
aiocblist *)kn->kn_sdata; 2209 2210 /* 2211 * The aiocbe pointer must be validated before using it, so 2212 * registration is restricted to the kernel; the user cannot 2213 * set EV_FLAG1. 2214 */ 2215 if ((kn->kn_flags & EV_FLAG1) == 0) 2216 return (EPERM); 2217 kn->kn_flags &= ~EV_FLAG1; 2218 2219 knote_insert(&aiocbe->klist, kn); 2220 2221 return (0); 2222 } 2223 2224 /* kqueue detach function */ 2225 static void 2226 filt_aiodetach(struct knote *kn) 2227 { 2228 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata; 2229 2230 knote_remove(&aiocbe->klist, kn); 2231 } 2232 2233 /* kqueue filter function */ 2234 /*ARGSUSED*/ 2235 static int 2236 filt_aio(struct knote *kn, long hint) 2237 { 2238 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata; 2239 2240 kn->kn_data = aiocbe->uaiocb._aiocb_private.error; 2241 if (aiocbe->jobstate != JOBST_JOBFINISHED && 2242 aiocbe->jobstate != JOBST_JOBBFINISHED) 2243 return (0); 2244 kn->kn_flags |= EV_EOF; 2245 return (1); 2246 } 2247 2248 struct filterops aio_filtops = 2249 { 0, filt_aioattach, filt_aiodetach, filt_aio }; 2250 #endif /* VFS_AIO */ 2251