1f841f6adSraf /*
2f841f6adSraf * CDDL HEADER START
3f841f6adSraf *
4f841f6adSraf * The contents of this file are subject to the terms of the
5f841f6adSraf * Common Development and Distribution License (the "License").
6f841f6adSraf * You may not use this file except in compliance with the License.
7f841f6adSraf *
8f841f6adSraf * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9f841f6adSraf * or http://www.opensolaris.org/os/licensing.
10f841f6adSraf * See the License for the specific language governing permissions
11f841f6adSraf * and limitations under the License.
12f841f6adSraf *
13f841f6adSraf * When distributing Covered Code, include this CDDL HEADER in each
14f841f6adSraf * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15f841f6adSraf * If applicable, add the following below this CDDL HEADER, with the
16f841f6adSraf * fields enclosed by brackets "[]" replaced with your own identifying
17f841f6adSraf * information: Portions Copyright [yyyy] [name of copyright owner]
18f841f6adSraf *
19f841f6adSraf * CDDL HEADER END
20f841f6adSraf */
21f841f6adSraf
22f841f6adSraf /*
2375e1bcdeSPrakash Sangappa * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24f841f6adSraf * Use is subject to license terms.
25f841f6adSraf */
26f841f6adSraf
27f841f6adSraf /*
28f841f6adSraf * posix_aio.c implements the POSIX async. I/O functions.
29f841f6adSraf *
30f841f6adSraf * aio_read
31f841f6adSraf * aio_write
32f841f6adSraf * aio_error
33f841f6adSraf * aio_return
34f841f6adSraf * aio_suspend
35f841f6adSraf * lio_listio
36f841f6adSraf * aio_fsync
37f841f6adSraf * aio_cancel
38f841f6adSraf */
39f841f6adSraf
407257d1b4Sraf #include "lint.h"
41f841f6adSraf #include "thr_uberdata.h"
424763305eSRobert Mustacchi #include "libc.h"
43f841f6adSraf #include "asyncio.h"
44f841f6adSraf #include <atomic.h>
45f841f6adSraf #include <sys/file.h>
46f841f6adSraf #include <sys/port.h>
47f841f6adSraf
48f841f6adSraf cond_t _aio_waitn_cv = DEFAULTCV; /* wait for end of aio_waitn */
49f841f6adSraf
50f841f6adSraf static int _aio_check_timeout(const timespec_t *, timespec_t *, int *);
51f841f6adSraf
52f841f6adSraf /* defines for timedwait in __aio_waitn() and __aio_suspend() */
53f841f6adSraf #define AIO_TIMEOUT_INDEF -1
54f841f6adSraf #define AIO_TIMEOUT_POLL 0
55f841f6adSraf #define AIO_TIMEOUT_WAIT 1
56f841f6adSraf #define AIO_TIMEOUT_UNDEF 2
57f841f6adSraf
58f841f6adSraf /*
59f841f6adSraf * List I/O stuff
60f841f6adSraf */
61f841f6adSraf static void _lio_list_decr(aio_lio_t *);
62f841f6adSraf static long aio_list_max = 0;
63f841f6adSraf
64f841f6adSraf int
aio_read(aiocb_t * aiocbp)65f841f6adSraf aio_read(aiocb_t *aiocbp)
66f841f6adSraf {
676e628f27Sraf if (aiocbp == NULL || aiocbp->aio_reqprio != 0) {
68f841f6adSraf errno = EINVAL;
69f841f6adSraf return (-1);
70f841f6adSraf }
71f841f6adSraf if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
72f841f6adSraf errno = EBUSY;
73f841f6adSraf return (-1);
74f841f6adSraf }
75f841f6adSraf if (_aio_sigev_thread(aiocbp) != 0)
76f841f6adSraf return (-1);
77f841f6adSraf aiocbp->aio_lio_opcode = LIO_READ;
78f841f6adSraf return (_aio_rw(aiocbp, NULL, &__nextworker_rw, AIOAREAD,
79f841f6adSraf (AIO_KAIO | AIO_NO_DUPS)));
80f841f6adSraf }
81f841f6adSraf
82f841f6adSraf int
aio_write(aiocb_t * aiocbp)83f841f6adSraf aio_write(aiocb_t *aiocbp)
84f841f6adSraf {
856e628f27Sraf if (aiocbp == NULL || aiocbp->aio_reqprio != 0) {
86f841f6adSraf errno = EINVAL;
87f841f6adSraf return (-1);
88f841f6adSraf }
89f841f6adSraf if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
90f841f6adSraf errno = EBUSY;
91f841f6adSraf return (-1);
92f841f6adSraf }
93f841f6adSraf if (_aio_sigev_thread(aiocbp) != 0)
94f841f6adSraf return (-1);
95f841f6adSraf aiocbp->aio_lio_opcode = LIO_WRITE;
96f841f6adSraf return (_aio_rw(aiocbp, NULL, &__nextworker_rw, AIOAWRITE,
97f841f6adSraf (AIO_KAIO | AIO_NO_DUPS)));
98f841f6adSraf }
99f841f6adSraf
100f841f6adSraf /*
101f841f6adSraf * __lio_listio() cancellation handler.
102f841f6adSraf */
103f841f6adSraf /* ARGSUSED */
104f841f6adSraf static void
_lio_listio_cleanup(aio_lio_t * head)105f841f6adSraf _lio_listio_cleanup(aio_lio_t *head)
106f841f6adSraf {
107f841f6adSraf int freeit = 0;
108f841f6adSraf
109f841f6adSraf ASSERT(MUTEX_HELD(&head->lio_mutex));
110f841f6adSraf if (head->lio_refcnt == 0) {
111f841f6adSraf ASSERT(head->lio_nent == 0);
112f841f6adSraf freeit = 1;
113f841f6adSraf }
114f841f6adSraf head->lio_waiting = 0;
115f841f6adSraf sig_mutex_unlock(&head->lio_mutex);
116f841f6adSraf if (freeit)
117f841f6adSraf _aio_lio_free(head);
118f841f6adSraf }
119f841f6adSraf
120f841f6adSraf int
lio_listio(int mode,aiocb_t * _RESTRICT_KYWD const * _RESTRICT_KYWD list,int nent,struct sigevent * _RESTRICT_KYWD sigevp)121f841f6adSraf lio_listio(int mode, aiocb_t *_RESTRICT_KYWD const *_RESTRICT_KYWD list,
122f841f6adSraf int nent, struct sigevent *_RESTRICT_KYWD sigevp)
123f841f6adSraf {
124f841f6adSraf int aio_ufs = 0;
125f841f6adSraf int oerrno = 0;
126f841f6adSraf aio_lio_t *head = NULL;
127f841f6adSraf aiocb_t *aiocbp;
128f841f6adSraf int state = 0;
129f841f6adSraf int EIOflg = 0;
130f841f6adSraf int rw;
131f841f6adSraf int do_kaio = 0;
132f841f6adSraf int error;
133f841f6adSraf int i;
134f841f6adSraf
135f841f6adSraf if (!_kaio_ok)
136f841f6adSraf _kaio_init();
137f841f6adSraf
138f841f6adSraf if (aio_list_max == 0)
139f841f6adSraf aio_list_max = sysconf(_SC_AIO_LISTIO_MAX);
140f841f6adSraf
141f841f6adSraf if (nent <= 0 || nent > aio_list_max) {
142f841f6adSraf errno = EINVAL;
143f841f6adSraf return (-1);
144f841f6adSraf }
145f841f6adSraf
146f841f6adSraf switch (mode) {
147f841f6adSraf case LIO_WAIT:
148f841f6adSraf state = NOCHECK;
149f841f6adSraf break;
150f841f6adSraf case LIO_NOWAIT:
151f841f6adSraf state = CHECK;
152f841f6adSraf break;
153f841f6adSraf default:
154f841f6adSraf errno = EINVAL;
155f841f6adSraf return (-1);
156f841f6adSraf }
157f841f6adSraf
158f841f6adSraf for (i = 0; i < nent; i++) {
159f841f6adSraf if ((aiocbp = list[i]) == NULL)
160f841f6adSraf continue;
161f841f6adSraf if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
162f841f6adSraf errno = EBUSY;
163f841f6adSraf return (-1);
164f841f6adSraf }
165f841f6adSraf if (_aio_sigev_thread(aiocbp) != 0)
166f841f6adSraf return (-1);
167f841f6adSraf if (aiocbp->aio_lio_opcode == LIO_NOP)
168f841f6adSraf aiocbp->aio_state = NOCHECK;
169f841f6adSraf else {
170f841f6adSraf aiocbp->aio_state = state;
171f841f6adSraf if (KAIO_SUPPORTED(aiocbp->aio_fildes))
172f841f6adSraf do_kaio++;
173f841f6adSraf else
174f841f6adSraf aiocbp->aio_resultp.aio_errno = ENOTSUP;
175f841f6adSraf }
176f841f6adSraf }
177f841f6adSraf if (_aio_sigev_thread_init(sigevp) != 0)
178f841f6adSraf return (-1);
179f841f6adSraf
180f841f6adSraf if (do_kaio) {
181f841f6adSraf error = (int)_kaio(AIOLIO, mode, list, nent, sigevp);
182f841f6adSraf if (error == 0)
183f841f6adSraf return (0);
184f841f6adSraf oerrno = errno;
185f841f6adSraf } else {
186f841f6adSraf oerrno = errno = ENOTSUP;
187f841f6adSraf error = -1;
188f841f6adSraf }
189f841f6adSraf
190f841f6adSraf if (error == -1 && errno == ENOTSUP) {
191f841f6adSraf error = errno = 0;
192f841f6adSraf /*
193f841f6adSraf * If LIO_WAIT, or notification required, allocate a list head.
194f841f6adSraf */
195f841f6adSraf if (mode == LIO_WAIT ||
196f841f6adSraf (sigevp != NULL &&
197f841f6adSraf (sigevp->sigev_notify == SIGEV_SIGNAL ||
198f841f6adSraf sigevp->sigev_notify == SIGEV_THREAD ||
199f841f6adSraf sigevp->sigev_notify == SIGEV_PORT)))
200f841f6adSraf head = _aio_lio_alloc();
201f841f6adSraf if (head) {
202f841f6adSraf sig_mutex_lock(&head->lio_mutex);
203f841f6adSraf head->lio_mode = mode;
204f841f6adSraf head->lio_largefile = 0;
205f841f6adSraf if (mode == LIO_NOWAIT && sigevp != NULL) {
206f841f6adSraf if (sigevp->sigev_notify == SIGEV_THREAD) {
207f841f6adSraf head->lio_port = sigevp->sigev_signo;
208f841f6adSraf head->lio_event = AIOLIO;
209f841f6adSraf head->lio_sigevent = sigevp;
210f841f6adSraf head->lio_sigval.sival_ptr =
211f841f6adSraf sigevp->sigev_value.sival_ptr;
212f841f6adSraf } else if (sigevp->sigev_notify == SIGEV_PORT) {
213f841f6adSraf port_notify_t *pn =
214f841f6adSraf sigevp->sigev_value.sival_ptr;
215f841f6adSraf head->lio_port = pn->portnfy_port;
216f841f6adSraf head->lio_event = AIOLIO;
217f841f6adSraf head->lio_sigevent = sigevp;
218f841f6adSraf head->lio_sigval.sival_ptr =
219f841f6adSraf pn->portnfy_user;
220f841f6adSraf } else { /* SIGEV_SIGNAL */
221f841f6adSraf head->lio_signo = sigevp->sigev_signo;
222f841f6adSraf head->lio_sigval.sival_ptr =
223f841f6adSraf sigevp->sigev_value.sival_ptr;
224f841f6adSraf }
225f841f6adSraf }
226f841f6adSraf head->lio_nent = head->lio_refcnt = nent;
227f841f6adSraf sig_mutex_unlock(&head->lio_mutex);
228f841f6adSraf }
229f841f6adSraf /*
230f841f6adSraf * find UFS requests, errno == ENOTSUP/EBADFD,
231f841f6adSraf */
232f841f6adSraf for (i = 0; i < nent; i++) {
233f841f6adSraf if ((aiocbp = list[i]) == NULL ||
234f841f6adSraf aiocbp->aio_lio_opcode == LIO_NOP ||
235f841f6adSraf (aiocbp->aio_resultp.aio_errno != ENOTSUP &&
236f841f6adSraf aiocbp->aio_resultp.aio_errno != EBADFD)) {
237f841f6adSraf if (head)
238f841f6adSraf _lio_list_decr(head);
239f841f6adSraf continue;
240f841f6adSraf }
241f841f6adSraf if (aiocbp->aio_resultp.aio_errno == EBADFD)
242f841f6adSraf SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);
2436e628f27Sraf if (aiocbp->aio_reqprio != 0) {
244f841f6adSraf aiocbp->aio_resultp.aio_errno = EINVAL;
245f841f6adSraf aiocbp->aio_resultp.aio_return = -1;
246f841f6adSraf EIOflg = 1;
247f841f6adSraf if (head)
248f841f6adSraf _lio_list_decr(head);
249f841f6adSraf continue;
250f841f6adSraf }
251f841f6adSraf /*
252f841f6adSraf * submit an AIO request with flags AIO_NO_KAIO
253f841f6adSraf * to avoid the kaio() syscall in _aio_rw()
254f841f6adSraf */
255f841f6adSraf switch (aiocbp->aio_lio_opcode) {
256f841f6adSraf case LIO_READ:
257f841f6adSraf rw = AIOAREAD;
258f841f6adSraf break;
259f841f6adSraf case LIO_WRITE:
260f841f6adSraf rw = AIOAWRITE;
261f841f6adSraf break;
262f841f6adSraf }
263f841f6adSraf error = _aio_rw(aiocbp, head, &__nextworker_rw, rw,
264f841f6adSraf (AIO_NO_KAIO | AIO_NO_DUPS));
265f841f6adSraf if (error == 0)
266f841f6adSraf aio_ufs++;
267f841f6adSraf else {
268f841f6adSraf if (head)
269f841f6adSraf _lio_list_decr(head);
270f841f6adSraf aiocbp->aio_resultp.aio_errno = error;
271f841f6adSraf EIOflg = 1;
272f841f6adSraf }
273f841f6adSraf }
274f841f6adSraf }
275f841f6adSraf if (EIOflg) {
276f841f6adSraf errno = EIO;
277f841f6adSraf return (-1);
278f841f6adSraf }
279f841f6adSraf if (mode == LIO_WAIT && oerrno == ENOTSUP) {
280f841f6adSraf /*
281f841f6adSraf * call kaio(AIOLIOWAIT) to get all outstanding
282f841f6adSraf * kernel AIO requests
283f841f6adSraf */
284f841f6adSraf if ((nent - aio_ufs) > 0)
285f841f6adSraf (void) _kaio(AIOLIOWAIT, mode, list, nent, sigevp);
286f841f6adSraf if (head != NULL && head->lio_nent > 0) {
287f841f6adSraf sig_mutex_lock(&head->lio_mutex);
288f841f6adSraf while (head->lio_refcnt > 0) {
289f841f6adSraf int err;
290f841f6adSraf head->lio_waiting = 1;
291f841f6adSraf pthread_cleanup_push(_lio_listio_cleanup, head);
292f841f6adSraf err = sig_cond_wait(&head->lio_cond_cv,
293f841f6adSraf &head->lio_mutex);
294f841f6adSraf pthread_cleanup_pop(0);
295f841f6adSraf head->lio_waiting = 0;
296f841f6adSraf if (err && head->lio_nent > 0) {
297f841f6adSraf sig_mutex_unlock(&head->lio_mutex);
298f841f6adSraf errno = err;
299f841f6adSraf return (-1);
300f841f6adSraf }
301f841f6adSraf }
302f841f6adSraf sig_mutex_unlock(&head->lio_mutex);
303f841f6adSraf ASSERT(head->lio_nent == 0 && head->lio_refcnt == 0);
304f841f6adSraf _aio_lio_free(head);
305f841f6adSraf for (i = 0; i < nent; i++) {
306f841f6adSraf if ((aiocbp = list[i]) != NULL &&
307f841f6adSraf aiocbp->aio_resultp.aio_errno) {
308f841f6adSraf errno = EIO;
309f841f6adSraf return (-1);
310f841f6adSraf }
311f841f6adSraf }
312f841f6adSraf }
313f841f6adSraf return (0);
314f841f6adSraf }
315f841f6adSraf return (error);
316f841f6adSraf }
317f841f6adSraf
318f841f6adSraf static void
_lio_list_decr(aio_lio_t * head)319f841f6adSraf _lio_list_decr(aio_lio_t *head)
320f841f6adSraf {
321f841f6adSraf sig_mutex_lock(&head->lio_mutex);
322f841f6adSraf head->lio_nent--;
323f841f6adSraf head->lio_refcnt--;
324f841f6adSraf sig_mutex_unlock(&head->lio_mutex);
325f841f6adSraf }
326f841f6adSraf
327f841f6adSraf /*
328f841f6adSraf * __aio_suspend() cancellation handler.
329f841f6adSraf */
330f841f6adSraf /* ARGSUSED */
331f841f6adSraf static void
_aio_suspend_cleanup(int * counter)332f841f6adSraf _aio_suspend_cleanup(int *counter)
333f841f6adSraf {
334f841f6adSraf ASSERT(MUTEX_HELD(&__aio_mutex));
335f841f6adSraf (*counter)--; /* _aio_kernel_suspend or _aio_suscv_cnt */
336f841f6adSraf sig_mutex_unlock(&__aio_mutex);
337f841f6adSraf }
338f841f6adSraf
339f841f6adSraf static int
__aio_suspend(void ** list,int nent,const timespec_t * timo,int largefile)340f841f6adSraf __aio_suspend(void **list, int nent, const timespec_t *timo, int largefile)
341f841f6adSraf {
342f841f6adSraf int cv_err; /* error code from cond_xxx() */
343f841f6adSraf int kerr; /* error code from _kaio(AIOSUSPEND) */
344f841f6adSraf int i;
345f841f6adSraf timespec_t twait; /* copy of timo for internal calculations */
346f841f6adSraf timespec_t *wait = NULL;
347f841f6adSraf int timedwait;
348f841f6adSraf int req_outstanding;
349f841f6adSraf aiocb_t **listp;
350f841f6adSraf aiocb_t *aiocbp;
351f841f6adSraf #if !defined(_LP64)
352f841f6adSraf aiocb64_t **listp64;
353f841f6adSraf aiocb64_t *aiocbp64;
354f841f6adSraf #endif
355f841f6adSraf hrtime_t hrtstart;
356f841f6adSraf hrtime_t hrtend;
357f841f6adSraf hrtime_t hrtres;
358f841f6adSraf
359f841f6adSraf #if defined(_LP64)
360f841f6adSraf if (largefile)
361f841f6adSraf aio_panic("__aio_suspend: largefile set when _LP64 defined");
362f841f6adSraf #endif
363f841f6adSraf
364f841f6adSraf if (nent <= 0) {
365f841f6adSraf errno = EINVAL;
366f841f6adSraf return (-1);
367f841f6adSraf }
368f841f6adSraf
369f841f6adSraf if (timo) {
370f841f6adSraf if (timo->tv_sec < 0 || timo->tv_nsec < 0 ||
371f841f6adSraf timo->tv_nsec >= NANOSEC) {
372f841f6adSraf errno = EINVAL;
373f841f6adSraf return (-1);
374f841f6adSraf }
375f841f6adSraf /* Initialize start time if time monitoring desired */
376f841f6adSraf if (timo->tv_sec > 0 || timo->tv_nsec > 0) {
377f841f6adSraf timedwait = AIO_TIMEOUT_WAIT;
378f841f6adSraf hrtstart = gethrtime();
379f841f6adSraf } else {
380f841f6adSraf /* content of timeout = 0 : polling */
381f841f6adSraf timedwait = AIO_TIMEOUT_POLL;
382f841f6adSraf }
383f841f6adSraf } else {
384f841f6adSraf /* timeout pointer = NULL : wait indefinitely */
385f841f6adSraf timedwait = AIO_TIMEOUT_INDEF;
386f841f6adSraf }
387f841f6adSraf
388f841f6adSraf #if !defined(_LP64)
389f841f6adSraf if (largefile) {
390f841f6adSraf listp64 = (aiocb64_t **)list;
391f841f6adSraf for (i = 0; i < nent; i++) {
392f841f6adSraf if ((aiocbp64 = listp64[i]) != NULL &&
393f841f6adSraf aiocbp64->aio_state == CHECK)
394f841f6adSraf aiocbp64->aio_state = CHECKED;
395f841f6adSraf }
396f841f6adSraf } else
397f841f6adSraf #endif /* !_LP64 */
398f841f6adSraf {
399f841f6adSraf listp = (aiocb_t **)list;
400f841f6adSraf for (i = 0; i < nent; i++) {
401f841f6adSraf if ((aiocbp = listp[i]) != NULL &&
402f841f6adSraf aiocbp->aio_state == CHECK)
403f841f6adSraf aiocbp->aio_state = CHECKED;
404f841f6adSraf }
405f841f6adSraf }
406f841f6adSraf
407f841f6adSraf sig_mutex_lock(&__aio_mutex);
408f841f6adSraf
409f841f6adSraf /*
410f841f6adSraf * The next "if -case" is required to accelerate the
411f841f6adSraf * access to completed RAW-IO requests.
412f841f6adSraf */
413f841f6adSraf if ((_aio_doneq_cnt + _aio_outstand_cnt) == 0) {
414f841f6adSraf /* Only kernel requests pending */
415f841f6adSraf
416f841f6adSraf /*
417f841f6adSraf * _aio_kernel_suspend is used to detect completed non RAW-IO
418f841f6adSraf * requests.
419f841f6adSraf * As long as this thread resides in the kernel (_kaio) further
420f841f6adSraf * asynchronous non RAW-IO requests could be submitted.
421f841f6adSraf */
422f841f6adSraf _aio_kernel_suspend++;
423f841f6adSraf
424f841f6adSraf /*
425f841f6adSraf * Always do the kaio() call without using the KAIO_SUPPORTED()
426f841f6adSraf * checks because it is not mandatory to have a valid fd
427f841f6adSraf * set in the list entries, only the resultp must be set.
428f841f6adSraf *
429f841f6adSraf * _kaio(AIOSUSPEND ...) return values :
430f841f6adSraf * 0: everythink ok, completed request found
431f841f6adSraf * -1: error
432f841f6adSraf * 1: no error : _aiodone awaked the _kaio(AIOSUSPEND,,)
433f841f6adSraf * system call using _kaio(AIONOTIFY). It means, that some
434f841f6adSraf * non RAW-IOs completed inbetween.
435f841f6adSraf */
436f841f6adSraf
437f841f6adSraf pthread_cleanup_push(_aio_suspend_cleanup,
438f841f6adSraf &_aio_kernel_suspend);
439f841f6adSraf pthread_cleanup_push(sig_mutex_lock, &__aio_mutex);
440f841f6adSraf sig_mutex_unlock(&__aio_mutex);
441f841f6adSraf _cancel_prologue();
442f841f6adSraf kerr = (int)_kaio(largefile? AIOSUSPEND64 : AIOSUSPEND,
443f841f6adSraf list, nent, timo, -1);
444f841f6adSraf _cancel_epilogue();
445f841f6adSraf pthread_cleanup_pop(1); /* sig_mutex_lock(&__aio_mutex) */
446f841f6adSraf pthread_cleanup_pop(0);
447f841f6adSraf
448f841f6adSraf _aio_kernel_suspend--;
449f841f6adSraf
450f841f6adSraf if (!kerr) {
451f841f6adSraf sig_mutex_unlock(&__aio_mutex);
452f841f6adSraf return (0);
453f841f6adSraf }
454f841f6adSraf } else {
455f841f6adSraf kerr = 1; /* simulation: _kaio detected AIONOTIFY */
456f841f6adSraf }
457f841f6adSraf
458f841f6adSraf /*
459f841f6adSraf * Return kernel error code if no other IOs are outstanding.
460f841f6adSraf */
461f841f6adSraf req_outstanding = _aio_doneq_cnt + _aio_outstand_cnt;
462f841f6adSraf
463f841f6adSraf sig_mutex_unlock(&__aio_mutex);
464f841f6adSraf
465f841f6adSraf if (req_outstanding == 0) {
466f841f6adSraf /* no IOs outstanding in the thread pool */
467f841f6adSraf if (kerr == 1)
468f841f6adSraf /* return "no IOs completed" */
469f841f6adSraf errno = EAGAIN;
470f841f6adSraf return (-1);
471f841f6adSraf }
472f841f6adSraf
473f841f6adSraf /*
474f841f6adSraf * IOs using the thread pool are outstanding.
475f841f6adSraf */
476f841f6adSraf if (timedwait == AIO_TIMEOUT_WAIT) {
477f841f6adSraf /* time monitoring */
478f841f6adSraf hrtend = hrtstart + (hrtime_t)timo->tv_sec * (hrtime_t)NANOSEC +
479f841f6adSraf (hrtime_t)timo->tv_nsec;
480f841f6adSraf hrtres = hrtend - gethrtime();
481f841f6adSraf if (hrtres <= 0)
482f841f6adSraf hrtres = 1;
483f841f6adSraf twait.tv_sec = hrtres / (hrtime_t)NANOSEC;
484f841f6adSraf twait.tv_nsec = hrtres % (hrtime_t)NANOSEC;
485f841f6adSraf wait = &twait;
486f841f6adSraf } else if (timedwait == AIO_TIMEOUT_POLL) {
487f841f6adSraf twait = *timo; /* content of timo = 0 : polling */
488f841f6adSraf wait = &twait;
489f841f6adSraf }
490f841f6adSraf
491f841f6adSraf for (;;) {
492f841f6adSraf int error;
493f841f6adSraf int inprogress;
494f841f6adSraf
495f841f6adSraf /* first scan file system requests */
496f841f6adSraf inprogress = 0;
497f841f6adSraf for (i = 0; i < nent; i++) {
498f841f6adSraf #if !defined(_LP64)
499f841f6adSraf if (largefile) {
500f841f6adSraf if ((aiocbp64 = listp64[i]) == NULL)
501f841f6adSraf continue;
502f841f6adSraf error = aiocbp64->aio_resultp.aio_errno;
503f841f6adSraf } else
504f841f6adSraf #endif
505f841f6adSraf {
506f841f6adSraf if ((aiocbp = listp[i]) == NULL)
507f841f6adSraf continue;
508f841f6adSraf error = aiocbp->aio_resultp.aio_errno;
509f841f6adSraf }
510f841f6adSraf if (error == EINPROGRESS)
511f841f6adSraf inprogress = 1;
512f841f6adSraf else if (error != ECANCELED) {
513f841f6adSraf errno = 0;
514f841f6adSraf return (0);
515f841f6adSraf }
516f841f6adSraf }
517f841f6adSraf
518f841f6adSraf sig_mutex_lock(&__aio_mutex);
519f841f6adSraf
520f841f6adSraf /*
521f841f6adSraf * If there aren't outstanding I/Os in the thread pool then
522f841f6adSraf * we have to return here, provided that all kernel RAW-IOs
523f841f6adSraf * also completed.
524f841f6adSraf * If the kernel was notified to return, then we have to check
525f841f6adSraf * possible pending RAW-IOs.
526f841f6adSraf */
527f841f6adSraf if (_aio_outstand_cnt == 0 && inprogress == 0 && kerr != 1) {
528f841f6adSraf sig_mutex_unlock(&__aio_mutex);
529f841f6adSraf errno = EAGAIN;
530f841f6adSraf break;
531f841f6adSraf }
532f841f6adSraf
533f841f6adSraf /*
534f841f6adSraf * There are outstanding IOs in the thread pool or the kernel
535f841f6adSraf * was notified to return.
536f841f6adSraf * Check pending RAW-IOs first.
537f841f6adSraf */
538f841f6adSraf if (kerr == 1) {
539f841f6adSraf /*
540f841f6adSraf * _aiodone just notified the kernel about
541f841f6adSraf * completed non RAW-IOs (AIONOTIFY was detected).
542f841f6adSraf */
543f841f6adSraf if (timedwait == AIO_TIMEOUT_WAIT) {
544f841f6adSraf /* Update remaining timeout for the kernel */
545f841f6adSraf hrtres = hrtend - gethrtime();
546f841f6adSraf if (hrtres <= 0) {
547f841f6adSraf /* timer expired */
548f841f6adSraf sig_mutex_unlock(&__aio_mutex);
549f841f6adSraf errno = EAGAIN;
550f841f6adSraf break;
551f841f6adSraf }
552f841f6adSraf wait->tv_sec = hrtres / (hrtime_t)NANOSEC;
553f841f6adSraf wait->tv_nsec = hrtres % (hrtime_t)NANOSEC;
554f841f6adSraf }
555f841f6adSraf _aio_kernel_suspend++;
556f841f6adSraf
557f841f6adSraf pthread_cleanup_push(_aio_suspend_cleanup,
558f841f6adSraf &_aio_kernel_suspend);
559f841f6adSraf pthread_cleanup_push(sig_mutex_lock, &__aio_mutex);
560f841f6adSraf sig_mutex_unlock(&__aio_mutex);
561f841f6adSraf _cancel_prologue();
562f841f6adSraf kerr = (int)_kaio(largefile? AIOSUSPEND64 : AIOSUSPEND,
563f841f6adSraf list, nent, wait, -1);
564f841f6adSraf _cancel_epilogue();
565f841f6adSraf pthread_cleanup_pop(1);
566f841f6adSraf pthread_cleanup_pop(0);
567f841f6adSraf
568f841f6adSraf _aio_kernel_suspend--;
569f841f6adSraf
570f841f6adSraf if (!kerr) {
571f841f6adSraf sig_mutex_unlock(&__aio_mutex);
572f841f6adSraf return (0);
573f841f6adSraf }
574f841f6adSraf }
575f841f6adSraf
576f841f6adSraf if (timedwait == AIO_TIMEOUT_POLL) {
577f841f6adSraf sig_mutex_unlock(&__aio_mutex);
578f841f6adSraf errno = EAGAIN;
579f841f6adSraf break;
580f841f6adSraf }
581f841f6adSraf
582f841f6adSraf if (timedwait == AIO_TIMEOUT_WAIT) {
583f841f6adSraf /* Update remaining timeout */
584f841f6adSraf hrtres = hrtend - gethrtime();
585f841f6adSraf if (hrtres <= 0) {
586f841f6adSraf /* timer expired */
587f841f6adSraf sig_mutex_unlock(&__aio_mutex);
588f841f6adSraf errno = EAGAIN;
589f841f6adSraf break;
590f841f6adSraf }
591f841f6adSraf wait->tv_sec = hrtres / (hrtime_t)NANOSEC;
592f841f6adSraf wait->tv_nsec = hrtres % (hrtime_t)NANOSEC;
593f841f6adSraf }
594f841f6adSraf
595f841f6adSraf if (_aio_outstand_cnt == 0) {
596f841f6adSraf sig_mutex_unlock(&__aio_mutex);
597f841f6adSraf continue;
598f841f6adSraf }
599f841f6adSraf
600f841f6adSraf _aio_suscv_cnt++; /* ID for _aiodone (wake up) */
601f841f6adSraf
602f841f6adSraf pthread_cleanup_push(_aio_suspend_cleanup, &_aio_suscv_cnt);
603f841f6adSraf if (timedwait == AIO_TIMEOUT_WAIT) {
604f841f6adSraf cv_err = sig_cond_reltimedwait(&_aio_iowait_cv,
605f841f6adSraf &__aio_mutex, wait);
606f841f6adSraf if (cv_err == ETIME)
607f841f6adSraf cv_err = EAGAIN;
608f841f6adSraf } else {
609f841f6adSraf /* wait indefinitely */
610f841f6adSraf cv_err = sig_cond_wait(&_aio_iowait_cv, &__aio_mutex);
611f841f6adSraf }
612f841f6adSraf /* this decrements _aio_suscv_cnt and drops __aio_mutex */
613f841f6adSraf pthread_cleanup_pop(1);
614f841f6adSraf
615f841f6adSraf if (cv_err) {
616f841f6adSraf errno = cv_err;
617f841f6adSraf break;
618f841f6adSraf }
619f841f6adSraf }
620f841f6adSraf return (-1);
621f841f6adSraf }
622f841f6adSraf
623f841f6adSraf int
aio_suspend(const aiocb_t * const list[],int nent,const timespec_t * timeout)624f841f6adSraf aio_suspend(const aiocb_t * const list[], int nent,
625f841f6adSraf const timespec_t *timeout)
626f841f6adSraf {
627f841f6adSraf return (__aio_suspend((void **)list, nent, timeout, 0));
628f841f6adSraf }
629f841f6adSraf
630f841f6adSraf int
aio_error(const aiocb_t * aiocbp)631f841f6adSraf aio_error(const aiocb_t *aiocbp)
632f841f6adSraf {
633f841f6adSraf const aio_result_t *resultp = &aiocbp->aio_resultp;
63475e1bcdeSPrakash Sangappa aio_req_t *reqp;
635f841f6adSraf int error;
636f841f6adSraf
637f841f6adSraf if ((error = resultp->aio_errno) == EINPROGRESS) {
638f841f6adSraf if (aiocbp->aio_state == CHECK) {
639f841f6adSraf /*
640f841f6adSraf * Always do the kaio() call without using the
641f841f6adSraf * KAIO_SUPPORTED() checks because it is not
642f841f6adSraf * mandatory to have a valid fd set in the
643f841f6adSraf * aiocb, only the resultp must be set.
644f841f6adSraf */
645f841f6adSraf if ((int)_kaio(AIOERROR, aiocbp) == EINVAL) {
646f841f6adSraf errno = EINVAL;
647f841f6adSraf return (-1);
648f841f6adSraf }
649f841f6adSraf error = resultp->aio_errno;
650f841f6adSraf } else if (aiocbp->aio_state == CHECKED) {
651f841f6adSraf ((aiocb_t *)aiocbp)->aio_state = CHECK;
652f841f6adSraf }
65375e1bcdeSPrakash Sangappa } else if (aiocbp->aio_state == USERAIO) {
65475e1bcdeSPrakash Sangappa sig_mutex_lock(&__aio_mutex);
65575e1bcdeSPrakash Sangappa if ((reqp = _aio_hash_del((aio_result_t *)resultp)) == NULL) {
65675e1bcdeSPrakash Sangappa sig_mutex_unlock(&__aio_mutex);
65775e1bcdeSPrakash Sangappa ((aiocb_t *)aiocbp)->aio_state = CHECKED;
65875e1bcdeSPrakash Sangappa } else {
65975e1bcdeSPrakash Sangappa ((aiocb_t *)aiocbp)->aio_state = NOCHECK;
66075e1bcdeSPrakash Sangappa ASSERT(reqp->req_head == NULL);
66175e1bcdeSPrakash Sangappa (void) _aio_req_remove(reqp);
66275e1bcdeSPrakash Sangappa sig_mutex_unlock(&__aio_mutex);
66375e1bcdeSPrakash Sangappa _aio_req_free(reqp);
66475e1bcdeSPrakash Sangappa }
665f841f6adSraf }
666f841f6adSraf return (error);
667f841f6adSraf }
668f841f6adSraf
669f841f6adSraf ssize_t
aio_return(aiocb_t * aiocbp)670f841f6adSraf aio_return(aiocb_t *aiocbp)
671f841f6adSraf {
672f841f6adSraf aio_result_t *resultp = &aiocbp->aio_resultp;
673f841f6adSraf aio_req_t *reqp;
674f841f6adSraf int error;
675f841f6adSraf ssize_t retval;
676f841f6adSraf
677f841f6adSraf /*
678f841f6adSraf * The _aiodone() function stores resultp->aio_return before
679f841f6adSraf * storing resultp->aio_errno (with an membar_producer() in
680f841f6adSraf * between). We use membar_consumer() below to ensure proper
681f841f6adSraf * memory ordering between _aiodone() and ourself.
682f841f6adSraf */
683f841f6adSraf error = resultp->aio_errno;
684f841f6adSraf membar_consumer();
685f841f6adSraf retval = resultp->aio_return;
686f841f6adSraf
687f841f6adSraf /*
688f841f6adSraf * we use this condition to indicate either that
689f841f6adSraf * aio_return() has been called before or should
690f841f6adSraf * not have been called yet.
691f841f6adSraf */
692f841f6adSraf if ((retval == -1 && error == EINVAL) || error == EINPROGRESS) {
693f841f6adSraf errno = error;
694f841f6adSraf return (-1);
695f841f6adSraf }
696f841f6adSraf
697f841f6adSraf /*
698f841f6adSraf * Before we return, mark the result as being returned so that later
699f841f6adSraf * calls to aio_return() will return the fact that the result has
700f841f6adSraf * already been returned.
701f841f6adSraf */
702f841f6adSraf sig_mutex_lock(&__aio_mutex);
703f841f6adSraf /* retest, in case more than one thread actually got in here */
704f841f6adSraf if (resultp->aio_return == -1 && resultp->aio_errno == EINVAL) {
705f841f6adSraf sig_mutex_unlock(&__aio_mutex);
706f841f6adSraf errno = EINVAL;
707f841f6adSraf return (-1);
708f841f6adSraf }
709f841f6adSraf resultp->aio_return = -1;
710f841f6adSraf resultp->aio_errno = EINVAL;
711f841f6adSraf if ((reqp = _aio_hash_del(resultp)) == NULL)
712f841f6adSraf sig_mutex_unlock(&__aio_mutex);
713f841f6adSraf else {
714f841f6adSraf aiocbp->aio_state = NOCHECK;
715f841f6adSraf ASSERT(reqp->req_head == NULL);
716f841f6adSraf (void) _aio_req_remove(reqp);
717f841f6adSraf sig_mutex_unlock(&__aio_mutex);
718f841f6adSraf _aio_req_free(reqp);
719f841f6adSraf }
720f841f6adSraf
721f841f6adSraf if (retval == -1)
722f841f6adSraf errno = error;
723f841f6adSraf return (retval);
724f841f6adSraf }
725f841f6adSraf
726f841f6adSraf void
_lio_remove(aio_req_t * reqp)727f841f6adSraf _lio_remove(aio_req_t *reqp)
728f841f6adSraf {
729f841f6adSraf aio_lio_t *head;
730f841f6adSraf int refcnt;
731f841f6adSraf
732f841f6adSraf if ((head = reqp->req_head) != NULL) {
733f841f6adSraf sig_mutex_lock(&head->lio_mutex);
734f841f6adSraf ASSERT(head->lio_refcnt == head->lio_nent);
735f841f6adSraf refcnt = --head->lio_nent;
736f841f6adSraf head->lio_refcnt--;
737f841f6adSraf sig_mutex_unlock(&head->lio_mutex);
738f841f6adSraf if (refcnt == 0)
739f841f6adSraf _aio_lio_free(head);
740f841f6adSraf reqp->req_head = NULL;
741f841f6adSraf }
742f841f6adSraf }
743f841f6adSraf
744f841f6adSraf /*
745f841f6adSraf * This function returns the number of asynchronous I/O requests submitted.
746f841f6adSraf */
747f841f6adSraf static int
__aio_fsync_bar(aiocb_t * aiocbp,aio_lio_t * head,aio_worker_t * aiowp,int workerscnt)748f841f6adSraf __aio_fsync_bar(aiocb_t *aiocbp, aio_lio_t *head, aio_worker_t *aiowp,
749f841f6adSraf int workerscnt)
750f841f6adSraf {
751f841f6adSraf int i;
752f841f6adSraf int error;
753f841f6adSraf aio_worker_t *next = aiowp;
754f841f6adSraf
755f841f6adSraf for (i = 0; i < workerscnt; i++) {
756f841f6adSraf error = _aio_rw(aiocbp, head, &next, AIOFSYNC, AIO_NO_KAIO);
757f841f6adSraf if (error != 0) {
758f841f6adSraf sig_mutex_lock(&head->lio_mutex);
759f841f6adSraf head->lio_mode = LIO_DESTROY; /* ignore fsync */
760f841f6adSraf head->lio_nent -= workerscnt - i;
761f841f6adSraf head->lio_refcnt -= workerscnt - i;
762f841f6adSraf sig_mutex_unlock(&head->lio_mutex);
763f841f6adSraf errno = EAGAIN;
764f841f6adSraf return (i);
765f841f6adSraf }
766f841f6adSraf next = next->work_forw;
767f841f6adSraf }
768f841f6adSraf return (i);
769f841f6adSraf }
770f841f6adSraf
771f841f6adSraf int
aio_fsync(int op,aiocb_t * aiocbp)772f841f6adSraf aio_fsync(int op, aiocb_t *aiocbp)
773f841f6adSraf {
774f841f6adSraf aio_lio_t *head;
775f841f6adSraf struct stat statb;
776f841f6adSraf int fret;
777f841f6adSraf
778f841f6adSraf if (aiocbp == NULL)
779f841f6adSraf return (0);
7806e628f27Sraf if (op != O_DSYNC && op != O_SYNC) {
781f841f6adSraf errno = EINVAL;
782f841f6adSraf return (-1);
783f841f6adSraf }
784f841f6adSraf if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
785f841f6adSraf errno = EBUSY;
786f841f6adSraf return (-1);
787f841f6adSraf }
788f841f6adSraf if (fstat(aiocbp->aio_fildes, &statb) < 0)
789f841f6adSraf return (-1);
790f841f6adSraf if (_aio_sigev_thread(aiocbp) != 0)
791f841f6adSraf return (-1);
792f841f6adSraf
793f841f6adSraf /*
794f841f6adSraf * Kernel aio_fsync() is not supported.
795f841f6adSraf * We force user-level aio_fsync() just
796f841f6adSraf * for the notification side-effect.
797f841f6adSraf */
798f841f6adSraf if (!__uaio_ok && __uaio_init() == -1)
799f841f6adSraf return (-1);
800f841f6adSraf
801f841f6adSraf /*
802f841f6adSraf * The first asynchronous I/O request in the current process will
803f841f6adSraf * create a bunch of workers (via __uaio_init()). If the number
804f841f6adSraf * of workers is zero then the number of pending asynchronous I/O
805f841f6adSraf * requests is zero. In such a case only execute the standard
806*4b9db4f6SChris Fraire * fsync(3C) or fdatasync(3C) as appropriate.
807f841f6adSraf */
808f841f6adSraf if (__rw_workerscnt == 0) {
809f841f6adSraf if (op == O_DSYNC)
8104763305eSRobert Mustacchi return (__fdsync(aiocbp->aio_fildes, FDSYNC_DATA));
811f841f6adSraf else
8124763305eSRobert Mustacchi return (__fdsync(aiocbp->aio_fildes, FDSYNC_FILE));
813f841f6adSraf }
814f841f6adSraf
815f841f6adSraf /*
816f841f6adSraf * re-use aio_offset as the op field.
817f841f6adSraf * O_DSYNC - fdatasync()
818f841f6adSraf * O_SYNC - fsync()
819f841f6adSraf */
820f841f6adSraf aiocbp->aio_offset = op;
821f841f6adSraf aiocbp->aio_lio_opcode = AIOFSYNC;
822f841f6adSraf
823f841f6adSraf /*
824f841f6adSraf * Create a list of fsync requests. The worker that
825f841f6adSraf * gets the last request will do the fsync request.
826f841f6adSraf */
827f841f6adSraf head = _aio_lio_alloc();
828f841f6adSraf if (head == NULL) {
829f841f6adSraf errno = EAGAIN;
830f841f6adSraf return (-1);
831f841f6adSraf }
832f841f6adSraf head->lio_mode = LIO_FSYNC;
833f841f6adSraf head->lio_nent = head->lio_refcnt = __rw_workerscnt;
834f841f6adSraf head->lio_largefile = 0;
835f841f6adSraf
836f841f6adSraf /*
837f841f6adSraf * Insert an fsync request on every worker's queue.
838f841f6adSraf */
839f841f6adSraf fret = __aio_fsync_bar(aiocbp, head, __workers_rw, __rw_workerscnt);
840f841f6adSraf if (fret != __rw_workerscnt) {
841f841f6adSraf /*
842f841f6adSraf * Fewer fsync requests than workers means that it was
843f841f6adSraf * not possible to submit fsync requests to all workers.
844f841f6adSraf * Actions:
845f841f6adSraf * a) number of fsync requests submitted is 0:
846f841f6adSraf * => free allocated memory (aio_lio_t).
847f841f6adSraf * b) number of fsync requests submitted is > 0:
848f841f6adSraf * => the last worker executing the fsync request
849f841f6adSraf * will free the aio_lio_t struct.
850f841f6adSraf */
851f841f6adSraf if (fret == 0)
852f841f6adSraf _aio_lio_free(head);
853f841f6adSraf return (-1);
854f841f6adSraf }
855f841f6adSraf return (0);
856f841f6adSraf }
857f841f6adSraf
858f841f6adSraf int
aio_cancel(int fd,aiocb_t * aiocbp)859f841f6adSraf aio_cancel(int fd, aiocb_t *aiocbp)
860f841f6adSraf {
861f841f6adSraf aio_req_t *reqp;
862f841f6adSraf aio_worker_t *aiowp;
863f841f6adSraf int done = 0;
864f841f6adSraf int canceled = 0;
865f841f6adSraf struct stat buf;
866f841f6adSraf
867f841f6adSraf if (fstat(fd, &buf) < 0)
868f841f6adSraf return (-1);
869f841f6adSraf
870f841f6adSraf if (aiocbp != NULL) {
871f841f6adSraf if (fd != aiocbp->aio_fildes) {
872f841f6adSraf errno = EINVAL;
873f841f6adSraf return (-1);
874f841f6adSraf }
875f841f6adSraf if (aiocbp->aio_state == USERAIO) {
876f841f6adSraf sig_mutex_lock(&__aio_mutex);
877f841f6adSraf reqp = _aio_hash_find(&aiocbp->aio_resultp);
878f841f6adSraf if (reqp == NULL) {
879f841f6adSraf sig_mutex_unlock(&__aio_mutex);
880f841f6adSraf return (AIO_ALLDONE);
881f841f6adSraf }
882f841f6adSraf aiowp = reqp->req_worker;
883f841f6adSraf sig_mutex_lock(&aiowp->work_qlock1);
884f841f6adSraf (void) _aio_cancel_req(aiowp, reqp, &canceled, &done);
885f841f6adSraf sig_mutex_unlock(&aiowp->work_qlock1);
886f841f6adSraf sig_mutex_unlock(&__aio_mutex);
887f841f6adSraf if (done)
888f841f6adSraf return (AIO_ALLDONE);
889f841f6adSraf if (canceled)
890f841f6adSraf return (AIO_CANCELED);
891f841f6adSraf return (AIO_NOTCANCELED);
892f841f6adSraf }
893f841f6adSraf if (aiocbp->aio_state == USERAIO_DONE)
894f841f6adSraf return (AIO_ALLDONE);
895f841f6adSraf return ((int)_kaio(AIOCANCEL, fd, aiocbp));
896f841f6adSraf }
897f841f6adSraf
898f841f6adSraf return (aiocancel_all(fd));
899f841f6adSraf }
900f841f6adSraf
901f841f6adSraf /*
902f841f6adSraf * __aio_waitn() cancellation handler.
903f841f6adSraf */
904f841f6adSraf static void
_aio_waitn_cleanup(void * arg __unused)9054a38094cSToomas Soome _aio_waitn_cleanup(void *arg __unused)
906f841f6adSraf {
907f841f6adSraf ASSERT(MUTEX_HELD(&__aio_mutex));
908f841f6adSraf
909f841f6adSraf /* check for pending aio_waitn() calls */
910f841f6adSraf _aio_flags &= ~(AIO_LIB_WAITN | AIO_WAIT_INPROGRESS | AIO_IO_WAITING);
911f841f6adSraf if (_aio_flags & AIO_LIB_WAITN_PENDING) {
912f841f6adSraf _aio_flags &= ~AIO_LIB_WAITN_PENDING;
913f841f6adSraf (void) cond_signal(&_aio_waitn_cv);
914f841f6adSraf }
915f841f6adSraf
916f841f6adSraf sig_mutex_unlock(&__aio_mutex);
917f841f6adSraf }
918f841f6adSraf
919f841f6adSraf /*
920f841f6adSraf * aio_waitn can be used to reap the results of several I/O operations that
921f841f6adSraf * were submitted asynchronously. The submission of I/Os can be done using
922f841f6adSraf * existing POSIX interfaces: lio_listio, aio_write or aio_read.
923f841f6adSraf * aio_waitn waits until "nwait" I/Os (supplied as a parameter) have
924f841f6adSraf * completed and it returns the descriptors for these I/Os in "list". The
925f841f6adSraf * maximum size of this list is given by "nent" and the actual number of I/Os
926f841f6adSraf * completed is returned in "nwait". Otherwise aio_waitn might also
927f841f6adSraf * return if the timeout expires. Additionally, aio_waitn returns 0 if
928f841f6adSraf * successful or -1 if an error occurred.
929f841f6adSraf */
static int
__aio_waitn(void **list, uint_t nent, uint_t *nwait, const timespec_t *utimo)
{
	int error = 0;
	uint_t dnwait = 0;	/* amount of requests in the waitn-done list */
	uint_t kwaitcnt;	/* expected "done" requests from kernel */
	uint_t knentcnt;	/* max. expected "done" requests from kernel */
	int uerrno = 0;
	int kerrno = 0;		/* save errno from _kaio() call */
	int timedwait = AIO_TIMEOUT_UNDEF;
	aio_req_t *reqp;
	timespec_t end;
	timespec_t twait;	/* copy of utimo for internal calculations */
	timespec_t *wait = NULL;

	/* *nwait must be a positive request count no larger than nent */
	if (nent == 0 || *nwait == 0 || *nwait > nent) {
		errno = EINVAL;
		return (-1);
	}

	/*
	 * Only one running aio_waitn call per process allowed.
	 * Further calls will be blocked here until the running
	 * call finishes.
	 */

	sig_mutex_lock(&__aio_mutex);

	while (_aio_flags & AIO_LIB_WAITN) {
		/* with a zero timeout, do not queue behind the active call */
		if (utimo && utimo->tv_sec == 0 && utimo->tv_nsec == 0) {
			sig_mutex_unlock(&__aio_mutex);
			*nwait = 0;
			return (0);
		}
		_aio_flags |= AIO_LIB_WAITN_PENDING;
		pthread_cleanup_push(sig_mutex_unlock, &__aio_mutex);
		error = sig_cond_wait(&_aio_waitn_cv, &__aio_mutex);
		pthread_cleanup_pop(0);
		if (error != 0) {
			sig_mutex_unlock(&__aio_mutex);
			*nwait = 0;
			errno = error;
			return (-1);
		}
	}

	/* from here on, _aio_waitn_cleanup() owns flag/mutex teardown */
	pthread_cleanup_push(_aio_waitn_cleanup, NULL);

	_aio_flags |= AIO_LIB_WAITN;

	if (_aio_check_timeout(utimo, &end, &timedwait) != 0) {
		error = -1;
		dnwait = 0;
		goto out;
	}
	if (timedwait != AIO_TIMEOUT_INDEF) {
		twait = *utimo;
		wait = &twait;
	}

	/*
	 * If both counters are still set to zero, then only
	 * kernel requests are currently outstanding (raw-I/Os).
	 */
	if ((_aio_doneq_cnt + _aio_outstand_cnt) == 0) {
		for (;;) {
			/*
			 * The kernel accepts at most AIO_WAITN_MAXIOCBS
			 * per call, so large requests are chunked.
			 */
			kwaitcnt = *nwait - dnwait;
			knentcnt = nent - dnwait;
			if (knentcnt > AIO_WAITN_MAXIOCBS)
				knentcnt = AIO_WAITN_MAXIOCBS;
			kwaitcnt = (kwaitcnt > knentcnt) ? knentcnt : kwaitcnt;

			/*
			 * Drop __aio_mutex around the blocking _kaio()
			 * call; the cleanup handler re-takes it if we
			 * are cancelled while blocked.
			 */
			pthread_cleanup_push(sig_mutex_lock, &__aio_mutex);
			sig_mutex_unlock(&__aio_mutex);
			_cancel_prologue();
			error = (int)_kaio(AIOWAITN, &list[dnwait], knentcnt,
			    &kwaitcnt, wait);
			_cancel_epilogue();
			pthread_cleanup_pop(1);

			if (error == 0) {
				dnwait += kwaitcnt;
				if (dnwait >= *nwait ||
				    *nwait < AIO_WAITN_MAXIOCBS)
					break;
				if (timedwait == AIO_TIMEOUT_WAIT) {
					error = _aio_get_timedelta(&end, wait);
					if (error == -1) {
						/* timer expired */
						errno = ETIME;
						break;
					}
				}
				continue;
			}
			if (errno == EAGAIN) {
				/* partial success still counts as success */
				if (dnwait > 0)
					error = 0;
				break;
			}
			if (errno == ETIME || errno == EINTR) {
				dnwait += kwaitcnt;
				break;
			}
			/* fatal error */
			break;
		}

		goto out;
	}

	/* File system I/Os outstanding ... */

	if (timedwait == AIO_TIMEOUT_UNDEF) {
		if (_aio_check_timeout(utimo, &end, &timedwait) != 0) {
			error = -1;
			dnwait = 0;
			goto out;
		}
		if (timedwait != AIO_TIMEOUT_INDEF) {
			twait = *utimo;
			wait = &twait;
		}
	}

	for (;;) {
		uint_t sum_reqs;

		/*
		 * Calculate sum of active non RAW-IO requests (sum_reqs).
		 * If the expected amount of completed requests (*nwait) is
		 * greater than the calculated sum (sum_reqs) then
		 * use _kaio to check pending RAW-IO requests.
		 */
		sum_reqs = _aio_doneq_cnt + dnwait + _aio_outstand_cnt;
		kwaitcnt = (*nwait > sum_reqs) ? *nwait - sum_reqs : 0;

		if (kwaitcnt != 0) {
			/* possibly some kernel I/Os outstanding */
			knentcnt = nent - dnwait;
			if (knentcnt > AIO_WAITN_MAXIOCBS)
				knentcnt = AIO_WAITN_MAXIOCBS;
			kwaitcnt = (kwaitcnt > knentcnt) ? knentcnt : kwaitcnt;

			_aio_flags |= AIO_WAIT_INPROGRESS;

			pthread_cleanup_push(sig_mutex_lock, &__aio_mutex);
			sig_mutex_unlock(&__aio_mutex);
			_cancel_prologue();
			error = (int)_kaio(AIOWAITN, &list[dnwait], knentcnt,
			    &kwaitcnt, wait);
			_cancel_epilogue();
			pthread_cleanup_pop(1);

			_aio_flags &= ~AIO_WAIT_INPROGRESS;

			if (error == 0) {
				dnwait += kwaitcnt;
			} else {
				switch (errno) {
				case EINVAL:
				case EAGAIN:
					/* don't wait for kernel I/Os */
					kerrno = 0; /* ignore _kaio() errno */
					*nwait = _aio_doneq_cnt +
					    _aio_outstand_cnt + dnwait;
					error = 0;
					break;
				case EINTR:
				case ETIME:
					/* just scan for completed LIB I/Os */
					dnwait += kwaitcnt;
					timedwait = AIO_TIMEOUT_POLL;
					kerrno = errno;	/* save _kaio() errno */
					error = 0;
					break;
				default:
					kerrno = errno;	/* save _kaio() errno */
					break;
				}
			}
			if (error)
				break;		/* fatal kernel error */
		}

		/* check completed FS requests in the "done" queue */

		while (_aio_doneq_cnt && dnwait < nent) {
			/* get done requests */
			if ((reqp = _aio_req_remove(NULL)) != NULL) {
				(void) _aio_hash_del(reqp->req_resultp);
				list[dnwait++] = reqp->req_aiocbp;
				_aio_req_mark_done(reqp);
				_lio_remove(reqp);
				_aio_req_free(reqp);
			}
		}

		if (dnwait >= *nwait) {
			/* min. requested amount of completed I/Os satisfied */
			break;
		}
		if (timedwait == AIO_TIMEOUT_WAIT &&
		    (error = _aio_get_timedelta(&end, wait)) == -1) {
			/* timer expired */
			uerrno = ETIME;
			break;
		}

		/*
		 * If some I/Os are outstanding and we have to wait for them,
		 * then sleep here.  _aiodone() will call _aio_waitn_wakeup()
		 * to wakeup this thread as soon as the required amount of
		 * completed I/Os is done.
		 */
		if (_aio_outstand_cnt > 0 && timedwait != AIO_TIMEOUT_POLL) {
			/*
			 * _aio_waitn_wakeup() will wake up this thread when:
			 * - _aio_waitncnt requests are completed or
			 * - _aio_outstand_cnt becomes zero.
			 * sig_cond_reltimedwait() could also return with
			 * a timeout error (ETIME).
			 */
			if (*nwait < _aio_outstand_cnt)
				_aio_waitncnt = *nwait;
			else
				_aio_waitncnt = _aio_outstand_cnt;

			_aio_flags |= AIO_IO_WAITING;

			if (wait)
				uerrno = sig_cond_reltimedwait(&_aio_iowait_cv,
				    &__aio_mutex, wait);
			else
				uerrno = sig_cond_wait(&_aio_iowait_cv,
				    &__aio_mutex);

			_aio_flags &= ~AIO_IO_WAITING;

			if (uerrno == ETIME) {
				timedwait = AIO_TIMEOUT_POLL;
				continue;
			}
			if (uerrno != 0)
				timedwait = AIO_TIMEOUT_POLL;
		}

		if (timedwait == AIO_TIMEOUT_POLL) {
			/* polling or timer expired */
			break;
		}
	}

	/* a user-level error takes precedence over a saved kernel errno */
	errno = uerrno == 0 ? kerrno : uerrno;
	if (errno)
		error = -1;
	else
		error = 0;

out:
	*nwait = dnwait;

	pthread_cleanup_pop(1);		/* drops __aio_mutex */

	return (error);
}
1196f841f6adSraf
1197f841f6adSraf int
aio_waitn(aiocb_t * list[],uint_t nent,uint_t * nwait,const timespec_t * timeout)1198f841f6adSraf aio_waitn(aiocb_t *list[], uint_t nent, uint_t *nwait,
1199f841f6adSraf const timespec_t *timeout)
1200f841f6adSraf {
1201f841f6adSraf return (__aio_waitn((void **)list, nent, nwait, timeout));
1202f841f6adSraf }
1203f841f6adSraf
1204f841f6adSraf void
_aio_waitn_wakeup(void)1205f841f6adSraf _aio_waitn_wakeup(void)
1206f841f6adSraf {
1207f841f6adSraf /*
1208f841f6adSraf * __aio_waitn() sets AIO_IO_WAITING to notify _aiodone() that
1209f841f6adSraf * it is waiting for completed I/Os. The number of required
1210f841f6adSraf * completed I/Os is stored into "_aio_waitncnt".
1211f841f6adSraf * aio_waitn() is woken up when
1212f841f6adSraf * - there are no further outstanding I/Os
1213f841f6adSraf * (_aio_outstand_cnt == 0) or
1214f841f6adSraf * - the expected number of I/Os has completed.
1215f841f6adSraf * Only one __aio_waitn() function waits for completed I/Os at
1216f841f6adSraf * a time.
1217f841f6adSraf *
1218f841f6adSraf * __aio_suspend() increments "_aio_suscv_cnt" to notify
1219f841f6adSraf * _aiodone() that at least one __aio_suspend() call is
1220f841f6adSraf * waiting for completed I/Os.
1221f841f6adSraf * There could be more than one __aio_suspend() function
1222f841f6adSraf * waiting for completed I/Os. Because every function should
1223f841f6adSraf * be waiting for different I/Os, _aiodone() has to wake up all
1224f841f6adSraf * __aio_suspend() functions each time.
1225f841f6adSraf * Every __aio_suspend() function will compare the recently
1226f841f6adSraf * completed I/O with its own list.
1227f841f6adSraf */
1228f841f6adSraf ASSERT(MUTEX_HELD(&__aio_mutex));
1229f841f6adSraf if (_aio_flags & AIO_IO_WAITING) {
1230f841f6adSraf if (_aio_waitncnt > 0)
1231f841f6adSraf _aio_waitncnt--;
1232f841f6adSraf if (_aio_outstand_cnt == 0 || _aio_waitncnt == 0 ||
1233f841f6adSraf _aio_suscv_cnt > 0)
1234f841f6adSraf (void) cond_broadcast(&_aio_iowait_cv);
1235f841f6adSraf } else {
1236f841f6adSraf /* Wake up waiting aio_suspend calls */
1237f841f6adSraf if (_aio_suscv_cnt > 0)
1238f841f6adSraf (void) cond_broadcast(&_aio_iowait_cv);
1239f841f6adSraf }
1240f841f6adSraf }
1241f841f6adSraf
1242f841f6adSraf /*
1243f841f6adSraf * timedwait values :
1244f841f6adSraf * AIO_TIMEOUT_POLL : polling
1245f841f6adSraf * AIO_TIMEOUT_WAIT : timeout
1246f841f6adSraf * AIO_TIMEOUT_INDEF : wait indefinitely
1247f841f6adSraf */
1248f841f6adSraf static int
_aio_check_timeout(const timespec_t * utimo,timespec_t * end,int * timedwait)1249f841f6adSraf _aio_check_timeout(const timespec_t *utimo, timespec_t *end, int *timedwait)
1250f841f6adSraf {
1251f841f6adSraf struct timeval curtime;
1252f841f6adSraf
1253f841f6adSraf if (utimo) {
1254f841f6adSraf if (utimo->tv_sec < 0 || utimo->tv_nsec < 0 ||
1255f841f6adSraf utimo->tv_nsec >= NANOSEC) {
1256f841f6adSraf errno = EINVAL;
1257f841f6adSraf return (-1);
1258f841f6adSraf }
1259f841f6adSraf if (utimo->tv_sec > 0 || utimo->tv_nsec > 0) {
1260f841f6adSraf (void) gettimeofday(&curtime, NULL);
1261f841f6adSraf end->tv_sec = utimo->tv_sec + curtime.tv_sec;
1262f841f6adSraf end->tv_nsec = utimo->tv_nsec + 1000 * curtime.tv_usec;
1263f841f6adSraf if (end->tv_nsec >= NANOSEC) {
1264f841f6adSraf end->tv_nsec -= NANOSEC;
1265f841f6adSraf end->tv_sec += 1;
1266f841f6adSraf }
1267f841f6adSraf *timedwait = AIO_TIMEOUT_WAIT;
1268f841f6adSraf } else {
1269f841f6adSraf /* polling */
1270f841f6adSraf *timedwait = AIO_TIMEOUT_POLL;
1271f841f6adSraf }
1272f841f6adSraf } else {
1273f841f6adSraf *timedwait = AIO_TIMEOUT_INDEF; /* wait indefinitely */
1274f841f6adSraf }
1275f841f6adSraf return (0);
1276f841f6adSraf }
1277f841f6adSraf
1278f841f6adSraf #if !defined(_LP64)
1279f841f6adSraf
1280f841f6adSraf int
aio_read64(aiocb64_t * aiocbp)1281f841f6adSraf aio_read64(aiocb64_t *aiocbp)
1282f841f6adSraf {
12836e628f27Sraf if (aiocbp == NULL || aiocbp->aio_reqprio != 0) {
1284f841f6adSraf errno = EINVAL;
1285f841f6adSraf return (-1);
1286f841f6adSraf }
1287f841f6adSraf if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
1288f841f6adSraf errno = EBUSY;
1289f841f6adSraf return (-1);
1290f841f6adSraf }
1291f841f6adSraf if (_aio_sigev_thread64(aiocbp) != 0)
1292f841f6adSraf return (-1);
1293f841f6adSraf aiocbp->aio_lio_opcode = LIO_READ;
1294f841f6adSraf return (_aio_rw64(aiocbp, NULL, &__nextworker_rw, AIOAREAD64,
1295f841f6adSraf (AIO_KAIO | AIO_NO_DUPS)));
1296f841f6adSraf }
1297f841f6adSraf
1298f841f6adSraf int
aio_write64(aiocb64_t * aiocbp)1299f841f6adSraf aio_write64(aiocb64_t *aiocbp)
1300f841f6adSraf {
13016e628f27Sraf if (aiocbp == NULL || aiocbp->aio_reqprio != 0) {
1302f841f6adSraf errno = EINVAL;
1303f841f6adSraf return (-1);
1304f841f6adSraf }
1305f841f6adSraf if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
1306f841f6adSraf errno = EBUSY;
1307f841f6adSraf return (-1);
1308f841f6adSraf }
1309f841f6adSraf if (_aio_sigev_thread64(aiocbp) != 0)
1310f841f6adSraf return (-1);
1311f841f6adSraf aiocbp->aio_lio_opcode = LIO_WRITE;
1312f841f6adSraf return (_aio_rw64(aiocbp, NULL, &__nextworker_rw, AIOAWRITE64,
1313f841f6adSraf (AIO_KAIO | AIO_NO_DUPS)));
1314f841f6adSraf }
1315f841f6adSraf
1316f841f6adSraf int
lio_listio64(int mode,aiocb64_t * _RESTRICT_KYWD const * _RESTRICT_KYWD list,int nent,struct sigevent * _RESTRICT_KYWD sigevp)1317f841f6adSraf lio_listio64(int mode, aiocb64_t *_RESTRICT_KYWD const *_RESTRICT_KYWD list,
1318f841f6adSraf int nent, struct sigevent *_RESTRICT_KYWD sigevp)
1319f841f6adSraf {
1320f841f6adSraf int aio_ufs = 0;
1321f841f6adSraf int oerrno = 0;
1322f841f6adSraf aio_lio_t *head = NULL;
1323f841f6adSraf aiocb64_t *aiocbp;
1324f841f6adSraf int state = 0;
1325f841f6adSraf int EIOflg = 0;
1326f841f6adSraf int rw;
1327f841f6adSraf int do_kaio = 0;
1328f841f6adSraf int error;
1329f841f6adSraf int i;
1330f841f6adSraf
1331f841f6adSraf if (!_kaio_ok)
1332f841f6adSraf _kaio_init();
1333f841f6adSraf
1334f841f6adSraf if (aio_list_max == 0)
1335f841f6adSraf aio_list_max = sysconf(_SC_AIO_LISTIO_MAX);
1336f841f6adSraf
1337f841f6adSraf if (nent <= 0 || nent > aio_list_max) {
1338f841f6adSraf errno = EINVAL;
1339f841f6adSraf return (-1);
1340f841f6adSraf }
1341f841f6adSraf
1342f841f6adSraf switch (mode) {
1343f841f6adSraf case LIO_WAIT:
1344f841f6adSraf state = NOCHECK;
1345f841f6adSraf break;
1346f841f6adSraf case LIO_NOWAIT:
1347f841f6adSraf state = CHECK;
1348f841f6adSraf break;
1349f841f6adSraf default:
1350f841f6adSraf errno = EINVAL;
1351f841f6adSraf return (-1);
1352f841f6adSraf }
1353f841f6adSraf
1354f841f6adSraf for (i = 0; i < nent; i++) {
1355f841f6adSraf if ((aiocbp = list[i]) == NULL)
1356f841f6adSraf continue;
1357f841f6adSraf if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
1358f841f6adSraf errno = EBUSY;
1359f841f6adSraf return (-1);
1360f841f6adSraf }
1361f841f6adSraf if (_aio_sigev_thread64(aiocbp) != 0)
1362f841f6adSraf return (-1);
1363f841f6adSraf if (aiocbp->aio_lio_opcode == LIO_NOP)
1364f841f6adSraf aiocbp->aio_state = NOCHECK;
1365f841f6adSraf else {
1366f841f6adSraf aiocbp->aio_state = state;
1367f841f6adSraf if (KAIO_SUPPORTED(aiocbp->aio_fildes))
1368f841f6adSraf do_kaio++;
1369f841f6adSraf else
1370f841f6adSraf aiocbp->aio_resultp.aio_errno = ENOTSUP;
1371f841f6adSraf }
1372f841f6adSraf }
1373f841f6adSraf if (_aio_sigev_thread_init(sigevp) != 0)
1374f841f6adSraf return (-1);
1375f841f6adSraf
1376f841f6adSraf if (do_kaio) {
1377f841f6adSraf error = (int)_kaio(AIOLIO64, mode, list, nent, sigevp);
1378f841f6adSraf if (error == 0)
1379f841f6adSraf return (0);
1380f841f6adSraf oerrno = errno;
1381f841f6adSraf } else {
1382f841f6adSraf oerrno = errno = ENOTSUP;
1383f841f6adSraf error = -1;
1384f841f6adSraf }
1385f841f6adSraf
1386f841f6adSraf if (error == -1 && errno == ENOTSUP) {
1387f841f6adSraf error = errno = 0;
1388f841f6adSraf /*
1389f841f6adSraf * If LIO_WAIT, or notification required, allocate a list head.
1390f841f6adSraf */
1391f841f6adSraf if (mode == LIO_WAIT ||
1392f841f6adSraf (sigevp != NULL &&
1393f841f6adSraf (sigevp->sigev_notify == SIGEV_SIGNAL ||
1394f841f6adSraf sigevp->sigev_notify == SIGEV_THREAD ||
1395f841f6adSraf sigevp->sigev_notify == SIGEV_PORT)))
1396f841f6adSraf head = _aio_lio_alloc();
1397f841f6adSraf if (head) {
1398f841f6adSraf sig_mutex_lock(&head->lio_mutex);
1399f841f6adSraf head->lio_mode = mode;
1400f841f6adSraf head->lio_largefile = 1;
1401f841f6adSraf if (mode == LIO_NOWAIT && sigevp != NULL) {
1402f841f6adSraf if (sigevp->sigev_notify == SIGEV_THREAD) {
1403f841f6adSraf head->lio_port = sigevp->sigev_signo;
1404f841f6adSraf head->lio_event = AIOLIO64;
1405f841f6adSraf head->lio_sigevent = sigevp;
1406f841f6adSraf head->lio_sigval.sival_ptr =
1407f841f6adSraf sigevp->sigev_value.sival_ptr;
1408f841f6adSraf } else if (sigevp->sigev_notify == SIGEV_PORT) {
1409f841f6adSraf port_notify_t *pn =
1410f841f6adSraf sigevp->sigev_value.sival_ptr;
1411f841f6adSraf head->lio_port = pn->portnfy_port;
1412f841f6adSraf head->lio_event = AIOLIO64;
1413f841f6adSraf head->lio_sigevent = sigevp;
1414f841f6adSraf head->lio_sigval.sival_ptr =
1415f841f6adSraf pn->portnfy_user;
1416f841f6adSraf } else { /* SIGEV_SIGNAL */
1417f841f6adSraf head->lio_signo = sigevp->sigev_signo;
1418f841f6adSraf head->lio_sigval.sival_ptr =
1419f841f6adSraf sigevp->sigev_value.sival_ptr;
1420f841f6adSraf }
1421f841f6adSraf }
1422f841f6adSraf head->lio_nent = head->lio_refcnt = nent;
1423f841f6adSraf sig_mutex_unlock(&head->lio_mutex);
1424f841f6adSraf }
1425f841f6adSraf /*
1426f841f6adSraf * find UFS requests, errno == ENOTSUP/EBADFD,
1427f841f6adSraf */
1428f841f6adSraf for (i = 0; i < nent; i++) {
1429f841f6adSraf if ((aiocbp = list[i]) == NULL ||
1430f841f6adSraf aiocbp->aio_lio_opcode == LIO_NOP ||
1431f841f6adSraf (aiocbp->aio_resultp.aio_errno != ENOTSUP &&
1432f841f6adSraf aiocbp->aio_resultp.aio_errno != EBADFD)) {
1433f841f6adSraf if (head)
1434f841f6adSraf _lio_list_decr(head);
1435f841f6adSraf continue;
1436f841f6adSraf }
1437f841f6adSraf if (aiocbp->aio_resultp.aio_errno == EBADFD)
1438f841f6adSraf SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);
14396e628f27Sraf if (aiocbp->aio_reqprio != 0) {
1440f841f6adSraf aiocbp->aio_resultp.aio_errno = EINVAL;
1441f841f6adSraf aiocbp->aio_resultp.aio_return = -1;
1442f841f6adSraf EIOflg = 1;
1443f841f6adSraf if (head)
1444f841f6adSraf _lio_list_decr(head);
1445f841f6adSraf continue;
1446f841f6adSraf }
1447f841f6adSraf /*
1448f841f6adSraf * submit an AIO request with flags AIO_NO_KAIO
1449f841f6adSraf * to avoid the kaio() syscall in _aio_rw()
1450f841f6adSraf */
1451f841f6adSraf switch (aiocbp->aio_lio_opcode) {
1452f841f6adSraf case LIO_READ:
1453f841f6adSraf rw = AIOAREAD64;
1454f841f6adSraf break;
1455f841f6adSraf case LIO_WRITE:
1456f841f6adSraf rw = AIOAWRITE64;
1457f841f6adSraf break;
1458f841f6adSraf }
1459f841f6adSraf error = _aio_rw64(aiocbp, head, &__nextworker_rw, rw,
1460f841f6adSraf (AIO_NO_KAIO | AIO_NO_DUPS));
1461f841f6adSraf if (error == 0)
1462f841f6adSraf aio_ufs++;
1463f841f6adSraf else {
1464f841f6adSraf if (head)
1465f841f6adSraf _lio_list_decr(head);
1466f841f6adSraf aiocbp->aio_resultp.aio_errno = error;
1467f841f6adSraf EIOflg = 1;
1468f841f6adSraf }
1469f841f6adSraf }
1470f841f6adSraf }
1471f841f6adSraf if (EIOflg) {
1472f841f6adSraf errno = EIO;
1473f841f6adSraf return (-1);
1474f841f6adSraf }
1475f841f6adSraf if (mode == LIO_WAIT && oerrno == ENOTSUP) {
1476f841f6adSraf /*
1477f841f6adSraf * call kaio(AIOLIOWAIT) to get all outstanding
1478f841f6adSraf * kernel AIO requests
1479f841f6adSraf */
1480f841f6adSraf if ((nent - aio_ufs) > 0)
1481f841f6adSraf (void) _kaio(AIOLIOWAIT, mode, list, nent, sigevp);
1482f841f6adSraf if (head != NULL && head->lio_nent > 0) {
1483f841f6adSraf sig_mutex_lock(&head->lio_mutex);
1484f841f6adSraf while (head->lio_refcnt > 0) {
1485f841f6adSraf int err;
1486f841f6adSraf head->lio_waiting = 1;
1487f841f6adSraf pthread_cleanup_push(_lio_listio_cleanup, head);
1488f841f6adSraf err = sig_cond_wait(&head->lio_cond_cv,
1489f841f6adSraf &head->lio_mutex);
1490f841f6adSraf pthread_cleanup_pop(0);
1491f841f6adSraf head->lio_waiting = 0;
1492f841f6adSraf if (err && head->lio_nent > 0) {
1493f841f6adSraf sig_mutex_unlock(&head->lio_mutex);
1494f841f6adSraf errno = err;
1495f841f6adSraf return (-1);
1496f841f6adSraf }
1497f841f6adSraf }
1498f841f6adSraf sig_mutex_unlock(&head->lio_mutex);
1499f841f6adSraf ASSERT(head->lio_nent == 0 && head->lio_refcnt == 0);
1500f841f6adSraf _aio_lio_free(head);
1501f841f6adSraf for (i = 0; i < nent; i++) {
1502f841f6adSraf if ((aiocbp = list[i]) != NULL &&
1503f841f6adSraf aiocbp->aio_resultp.aio_errno) {
1504f841f6adSraf errno = EIO;
1505f841f6adSraf return (-1);
1506f841f6adSraf }
1507f841f6adSraf }
1508f841f6adSraf }
1509f841f6adSraf return (0);
1510f841f6adSraf }
1511f841f6adSraf return (error);
1512f841f6adSraf }
1513f841f6adSraf
1514f841f6adSraf int
aio_suspend64(const aiocb64_t * const list[],int nent,const timespec_t * timeout)1515f841f6adSraf aio_suspend64(const aiocb64_t * const list[], int nent,
1516f841f6adSraf const timespec_t *timeout)
1517f841f6adSraf {
1518f841f6adSraf return (__aio_suspend((void **)list, nent, timeout, 1));
1519f841f6adSraf }
1520f841f6adSraf
1521f841f6adSraf int
aio_error64(const aiocb64_t * aiocbp)1522f841f6adSraf aio_error64(const aiocb64_t *aiocbp)
1523f841f6adSraf {
1524f841f6adSraf const aio_result_t *resultp = &aiocbp->aio_resultp;
1525f841f6adSraf int error;
1526f841f6adSraf
1527f841f6adSraf if ((error = resultp->aio_errno) == EINPROGRESS) {
1528f841f6adSraf if (aiocbp->aio_state == CHECK) {
1529f841f6adSraf /*
1530f841f6adSraf * Always do the kaio() call without using the
1531f841f6adSraf * KAIO_SUPPORTED() checks because it is not
1532f841f6adSraf * mandatory to have a valid fd set in the
1533f841f6adSraf * aiocb, only the resultp must be set.
1534f841f6adSraf */
1535f841f6adSraf if ((int)_kaio(AIOERROR64, aiocbp) == EINVAL) {
1536f841f6adSraf errno = EINVAL;
1537f841f6adSraf return (-1);
1538f841f6adSraf }
1539f841f6adSraf error = resultp->aio_errno;
1540f841f6adSraf } else if (aiocbp->aio_state == CHECKED) {
1541f841f6adSraf ((aiocb64_t *)aiocbp)->aio_state = CHECK;
1542f841f6adSraf }
1543f841f6adSraf }
1544f841f6adSraf return (error);
1545f841f6adSraf }
1546f841f6adSraf
ssize_t
aio_return64(aiocb64_t *aiocbp)
{
	aio_result_t *resultp = &aiocbp->aio_resultp;
	aio_req_t *reqp;
	int error;
	ssize_t retval;

	/*
	 * The _aiodone() function stores resultp->aio_return before
	 * storing resultp->aio_errno (with an membar_producer() in
	 * between).  We use membar_consumer() below to ensure proper
	 * memory ordering between _aiodone() and ourself.
	 */
	error = resultp->aio_errno;
	membar_consumer();
	retval = resultp->aio_return;

	/*
	 * we use this condition to indicate either that
	 * aio_return() has been called before or should
	 * not have been called yet.
	 */
	if ((retval == -1 && error == EINVAL) || error == EINPROGRESS) {
		errno = error;
		return (-1);
	}

	/*
	 * Before we return, mark the result as being returned so that later
	 * calls to aio_return() will return the fact that the result has
	 * already been returned.
	 */
	sig_mutex_lock(&__aio_mutex);
	/* retest, in case more than one thread actually got in here */
	if (resultp->aio_return == -1 && resultp->aio_errno == EINVAL) {
		/* another thread raced us to collect this result */
		sig_mutex_unlock(&__aio_mutex);
		errno = EINVAL;
		return (-1);
	}
	/* (-1, EINVAL) is the sentinel for "result already returned" */
	resultp->aio_return = -1;
	resultp->aio_errno = EINVAL;
	if ((reqp = _aio_hash_del(resultp)) == NULL)
		sig_mutex_unlock(&__aio_mutex);
	else {
		/* library-tracked request: unlink it and recycle it */
		aiocbp->aio_state = NOCHECK;
		ASSERT(reqp->req_head == NULL);
		(void) _aio_req_remove(reqp);
		sig_mutex_unlock(&__aio_mutex);
		_aio_req_free(reqp);
	}

	if (retval == -1)
		errno = error;
	return (retval);
}
1603f841f6adSraf
1604f841f6adSraf static int
__aio_fsync_bar64(aiocb64_t * aiocbp,aio_lio_t * head,aio_worker_t * aiowp,int workerscnt)1605f841f6adSraf __aio_fsync_bar64(aiocb64_t *aiocbp, aio_lio_t *head, aio_worker_t *aiowp,
1606f841f6adSraf int workerscnt)
1607f841f6adSraf {
1608f841f6adSraf int i;
1609f841f6adSraf int error;
1610f841f6adSraf aio_worker_t *next = aiowp;
1611f841f6adSraf
1612f841f6adSraf for (i = 0; i < workerscnt; i++) {
1613f841f6adSraf error = _aio_rw64(aiocbp, head, &next, AIOFSYNC, AIO_NO_KAIO);
1614f841f6adSraf if (error != 0) {
1615f841f6adSraf sig_mutex_lock(&head->lio_mutex);
1616f841f6adSraf head->lio_mode = LIO_DESTROY; /* ignore fsync */
1617f841f6adSraf head->lio_nent -= workerscnt - i;
1618f841f6adSraf head->lio_refcnt -= workerscnt - i;
1619f841f6adSraf sig_mutex_unlock(&head->lio_mutex);
1620f841f6adSraf errno = EAGAIN;
1621f841f6adSraf return (i);
1622f841f6adSraf }
1623f841f6adSraf next = next->work_forw;
1624f841f6adSraf }
1625f841f6adSraf return (i);
1626f841f6adSraf }
1627f841f6adSraf
/*
 * aio_fsync64(): large-file version of aio_fsync(3C).
 * Asynchronously force aiocbp->aio_fildes to stable storage —
 * data and metadata for op == O_SYNC, data only for op == O_DSYNC —
 * with completion notification delivered via the aiocb's sigevent.
 * Returns 0 if the request was queued, -1 with errno set otherwise.
 */
int
aio_fsync64(int op, aiocb64_t *aiocbp)
{
	aio_lio_t *head;
	struct stat64 statb;
	int fret;

	/* A NULL aiocb is treated as a successful no-op. */
	if (aiocbp == NULL)
		return (0);
	/* Only O_DSYNC (fdatasync) and O_SYNC (fsync) are valid ops. */
	if (op != O_DSYNC && op != O_SYNC) {
		errno = EINVAL;
		return (-1);
	}
	/* The aiocb must not already have a request outstanding. */
	if (_aio_hash_find(&aiocbp->aio_resultp) != NULL) {
		errno = EBUSY;
		return (-1);
	}
	/* Validate the file descriptor (fstat64 sets errno, e.g. EBADF). */
	if (fstat64(aiocbp->aio_fildes, &statb) < 0)
		return (-1);
	/*
	 * Perform sigevent notification setup (presumably the
	 * SIGEV_THREAD case, given the name) — propagate failure.
	 */
	if (_aio_sigev_thread64(aiocbp) != 0)
		return (-1);

	/*
	 * Kernel aio_fsync() is not supported.
	 * We force user-level aio_fsync() just
	 * for the notification side-effect.
	 */
	if (!__uaio_ok && __uaio_init() == -1)
		return (-1);

	/*
	 * The first asynchronous I/O request in the current process will
	 * create a bunch of workers (via __uaio_init()).  If the number
	 * of workers is zero then the number of pending asynchronous I/O
	 * requests is zero.  In such a case only execute the standard
	 * fsync(3C) or fdatasync(3C) as appropriate.
	 */
	if (__rw_workerscnt == 0) {
		if (op == O_DSYNC)
			return (__fdsync(aiocbp->aio_fildes, FDSYNC_DATA));
		else
			return (__fdsync(aiocbp->aio_fildes, FDSYNC_FILE));
	}

	/*
	 * re-use aio_offset as the op field.
	 *	O_DSYNC - fdatasync()
	 *	O_SYNC - fsync()
	 */
	aiocbp->aio_offset = op;
	aiocbp->aio_lio_opcode = AIOFSYNC;

	/*
	 * Create a list of fsync requests.  The worker that
	 * gets the last request will do the fsync request.
	 */
	head = _aio_lio_alloc();
	if (head == NULL) {
		errno = EAGAIN;
		return (-1);
	}
	head->lio_mode = LIO_FSYNC;
	head->lio_nent = head->lio_refcnt = __rw_workerscnt;
	head->lio_largefile = 1;	/* requests carry an aiocb64_t */

	/*
	 * Insert an fsync request on every worker's queue.
	 */
	fret = __aio_fsync_bar64(aiocbp, head, __workers_rw, __rw_workerscnt);
	if (fret != __rw_workerscnt) {
		/*
		 * Fewer fsync requests than workers means that it was
		 * not possible to submit fsync requests to all workers.
		 * Actions:
		 * a) number of fsync requests submitted is 0:
		 *    => free allocated memory (aio_lio_t).
		 * b) number of fsync requests submitted is > 0:
		 *    => the last worker executing the fsync request
		 *	 will free the aio_lio_t struct.
		 */
		if (fret == 0)
			_aio_lio_free(head);
		return (-1);
	}
	return (0);
}
1714f841f6adSraf
/*
 * aio_cancel64(): large-file version of aio_cancel(3C).
 * Attempt to cancel the asynchronous request described by 'aiocbp'
 * on file descriptor 'fd', or all outstanding requests on 'fd'
 * when aiocbp is NULL.  Returns AIO_CANCELED, AIO_NOTCANCELED,
 * AIO_ALLDONE, or -1 with errno set.
 */
int
aio_cancel64(int fd, aiocb64_t *aiocbp)
{
	aio_req_t *reqp;
	aio_worker_t *aiowp;
	int done = 0;
	int canceled = 0;
	struct stat64 buf;

	/* Validate the file descriptor (fstat64 sets errno, e.g. EBADF). */
	if (fstat64(fd, &buf) < 0)
		return (-1);

	if (aiocbp != NULL) {
		/* The aiocb must refer to the fd that was passed in. */
		if (fd != aiocbp->aio_fildes) {
			errno = EINVAL;
			return (-1);
		}
		if (aiocbp->aio_state == USERAIO) {
			/*
			 * User-level request: look it up under __aio_mutex
			 * and try to pull it off its worker's queue.
			 * Lock order here is __aio_mutex first, then the
			 * worker's work_qlock1.
			 */
			sig_mutex_lock(&__aio_mutex);
			reqp = _aio_hash_find(&aiocbp->aio_resultp);
			if (reqp == NULL) {
				/* Not in the hash: already completed. */
				sig_mutex_unlock(&__aio_mutex);
				return (AIO_ALLDONE);
			}
			aiowp = reqp->req_worker;
			sig_mutex_lock(&aiowp->work_qlock1);
			(void) _aio_cancel_req(aiowp, reqp, &canceled, &done);
			sig_mutex_unlock(&aiowp->work_qlock1);
			sig_mutex_unlock(&__aio_mutex);
			if (done)
				return (AIO_ALLDONE);
			if (canceled)
				return (AIO_CANCELED);
			return (AIO_NOTCANCELED);
		}
		if (aiocbp->aio_state == USERAIO_DONE)
			return (AIO_ALLDONE);
		/* Otherwise it is a kernel (kaio) request; cancel there. */
		return ((int)_kaio(AIOCANCEL, fd, aiocbp));
	}

	/* aiocbp == NULL: cancel everything outstanding on this fd. */
	return (aiocancel_all(fd));
}
1757f841f6adSraf
1758f841f6adSraf int
aio_waitn64(aiocb64_t * list[],uint_t nent,uint_t * nwait,const timespec_t * timeout)1759f841f6adSraf aio_waitn64(aiocb64_t *list[], uint_t nent, uint_t *nwait,
1760f841f6adSraf const timespec_t *timeout)
1761f841f6adSraf {
1762f841f6adSraf return (__aio_waitn((void **)list, nent, nwait, timeout));
1763f841f6adSraf }
1764f841f6adSraf
1765f841f6adSraf #endif /* !defined(_LP64) */
1766