1*abbd98eaSmiod /* $OpenBSD: sys_futex.c,v 1.22 2023/08/14 07:42:34 miod Exp $ */
2004d4497Smpi
3004d4497Smpi /*
4004d4497Smpi * Copyright (c) 2016-2017 Martin Pieuchot
5004d4497Smpi *
6004d4497Smpi * Permission to use, copy, modify, and distribute this software for any
7004d4497Smpi * purpose with or without fee is hereby granted, provided that the above
8004d4497Smpi * copyright notice and this permission notice appear in all copies.
9004d4497Smpi *
10004d4497Smpi * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11004d4497Smpi * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12004d4497Smpi * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13004d4497Smpi * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14004d4497Smpi * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15004d4497Smpi * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16004d4497Smpi * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17004d4497Smpi */
18004d4497Smpi
19004d4497Smpi #include <sys/param.h>
20004d4497Smpi #include <sys/systm.h>
21004d4497Smpi #include <sys/proc.h>
22004d4497Smpi #include <sys/mount.h>
23004d4497Smpi #include <sys/syscallargs.h>
24004d4497Smpi #include <sys/pool.h>
25004d4497Smpi #include <sys/time.h>
26004d4497Smpi #include <sys/rwlock.h>
27004d4497Smpi #include <sys/futex.h>
28004d4497Smpi
29004d4497Smpi #ifdef KTRACE
30004d4497Smpi #include <sys/ktrace.h>
31004d4497Smpi #endif
32004d4497Smpi
33672a12b3Skettenis #include <uvm/uvm.h>
34672a12b3Skettenis
35004d4497Smpi /*
36004d4497Smpi * Kernel representation of a futex.
37004d4497Smpi */
38004d4497Smpi struct futex {
39004d4497Smpi LIST_ENTRY(futex) ft_list; /* list of all futexes */
40004d4497Smpi TAILQ_HEAD(, proc) ft_threads; /* sleeping queue */
41672a12b3Skettenis struct uvm_object *ft_obj; /* UVM object */
42e6eeb55aSkettenis struct vm_amap *ft_amap; /* UVM amap */
43672a12b3Skettenis voff_t ft_off; /* UVM offset */
44004d4497Smpi unsigned int ft_refcnt; /* # of references */
45004d4497Smpi };
46004d4497Smpi
/*
 * Flags understood by futex_get().
 */
#define FT_CREATE	0x1	/* Allocate the futex when none matches. */
#define FT_PRIVATE	0x2	/* Skip the shared-mapping lookup. */

/* Futex lookup and reference counting. */
struct futex	*futex_get(uint32_t *, int);
void		 futex_put(struct futex *);

/* Workers behind the individual futex(2) operations. */
int	futex_wait(uint32_t *, uint32_t, const struct timespec *, int);
int	futex_wake(uint32_t *, uint32_t, int);
int	futex_requeue(uint32_t *, uint32_t, uint32_t *, uint32_t, int);
58004d4497Smpi
59004d4497Smpi /*
60dbb9e73fSvisa * The global futex lock serializes futex(2) calls so that no wakeup
61dbb9e73fSvisa * event is lost, and protects all futex lists and futex states.
62004d4497Smpi */
63004d4497Smpi struct rwlock ftlock = RWLOCK_INITIALIZER("futex");
64dbb9e73fSvisa static struct futex_list ftlist_shared =
65dbb9e73fSvisa LIST_HEAD_INITIALIZER(ftlist_shared);
66004d4497Smpi struct pool ftpool;
67004d4497Smpi
68004d4497Smpi
69004d4497Smpi void
futex_init(void)70004d4497Smpi futex_init(void)
71004d4497Smpi {
72703735beSguenther pool_init(&ftpool, sizeof(struct futex), 0, IPL_NONE,
73703735beSguenther PR_WAITOK | PR_RWLOCK, "futexpl", NULL);
74004d4497Smpi }
75004d4497Smpi
76004d4497Smpi int
sys_futex(struct proc * p,void * v,register_t * retval)77004d4497Smpi sys_futex(struct proc *p, void *v, register_t *retval)
78004d4497Smpi {
79004d4497Smpi struct sys_futex_args /* {
80004d4497Smpi syscallarg(uint32_t *) f;
81004d4497Smpi syscallarg(int) op;
82004d4497Smpi syscallarg(inr) val;
83004d4497Smpi syscallarg(const struct timespec *) timeout;
84004d4497Smpi syscallarg(uint32_t *) g;
85004d4497Smpi } */ *uap = v;
86004d4497Smpi uint32_t *uaddr = SCARG(uap, f);
87004d4497Smpi int op = SCARG(uap, op);
88004d4497Smpi uint32_t val = SCARG(uap, val);
89004d4497Smpi const struct timespec *timeout = SCARG(uap, timeout);
90004d4497Smpi void *g = SCARG(uap, g);
91672a12b3Skettenis int flags = 0;
923288ea8fSkettenis int error = 0;
93672a12b3Skettenis
94672a12b3Skettenis if (op & FUTEX_PRIVATE_FLAG)
95672a12b3Skettenis flags |= FT_PRIVATE;
96004d4497Smpi
97adec12fbSvisa rw_enter_write(&ftlock);
98004d4497Smpi switch (op) {
99004d4497Smpi case FUTEX_WAIT:
100672a12b3Skettenis case FUTEX_WAIT_PRIVATE:
1013288ea8fSkettenis error = futex_wait(uaddr, val, timeout, flags);
102004d4497Smpi break;
103004d4497Smpi case FUTEX_WAKE:
104672a12b3Skettenis case FUTEX_WAKE_PRIVATE:
105672a12b3Skettenis *retval = futex_wake(uaddr, val, flags);
106004d4497Smpi break;
107004d4497Smpi case FUTEX_REQUEUE:
108672a12b3Skettenis case FUTEX_REQUEUE_PRIVATE:
109672a12b3Skettenis *retval = futex_requeue(uaddr, val, g, (u_long)timeout, flags);
110004d4497Smpi break;
111004d4497Smpi default:
1123288ea8fSkettenis error = ENOSYS;
113004d4497Smpi break;
114004d4497Smpi }
115adec12fbSvisa rw_exit_write(&ftlock);
116004d4497Smpi
1173288ea8fSkettenis return error;
118004d4497Smpi }
119004d4497Smpi
120004d4497Smpi /*
121004d4497Smpi * Return an existing futex matching userspace address ``uaddr''.
122004d4497Smpi *
123004d4497Smpi * If such futex does not exist and FT_CREATE is given, create it.
124004d4497Smpi */
125004d4497Smpi struct futex *
futex_get(uint32_t * uaddr,int flags)126672a12b3Skettenis futex_get(uint32_t *uaddr, int flags)
127004d4497Smpi {
1283b07b489Sderaadt struct proc *p = curproc;
129672a12b3Skettenis vm_map_t map = &p->p_vmspace->vm_map;
130672a12b3Skettenis vm_map_entry_t entry;
131672a12b3Skettenis struct uvm_object *obj = NULL;
132e6eeb55aSkettenis struct vm_amap *amap = NULL;
133672a12b3Skettenis voff_t off = (vaddr_t)uaddr;
134004d4497Smpi struct futex *f;
135dbb9e73fSvisa struct futex_list *ftlist = &p->p_p->ps_ftlist;
136004d4497Smpi
137004d4497Smpi rw_assert_wrlock(&ftlock);
138004d4497Smpi
139672a12b3Skettenis if (!(flags & FT_PRIVATE)) {
140672a12b3Skettenis vm_map_lock_read(map);
141672a12b3Skettenis if (uvm_map_lookup_entry(map, (vaddr_t)uaddr, &entry) &&
142672a12b3Skettenis entry->inheritance == MAP_INHERIT_SHARE) {
143e6eeb55aSkettenis if (UVM_ET_ISOBJ(entry)) {
144dbb9e73fSvisa ftlist = &ftlist_shared;
145672a12b3Skettenis obj = entry->object.uvm_obj;
146e6eeb55aSkettenis off = entry->offset +
147e6eeb55aSkettenis ((vaddr_t)uaddr - entry->start);
148e6eeb55aSkettenis } else if (entry->aref.ar_amap) {
149e6eeb55aSkettenis ftlist = &ftlist_shared;
150e6eeb55aSkettenis amap = entry->aref.ar_amap;
151e6eeb55aSkettenis off = ptoa(entry->aref.ar_pageoff) +
152e6eeb55aSkettenis ((vaddr_t)uaddr - entry->start);
153e6eeb55aSkettenis }
154672a12b3Skettenis }
155672a12b3Skettenis vm_map_unlock_read(map);
156672a12b3Skettenis }
157672a12b3Skettenis
158dbb9e73fSvisa LIST_FOREACH(f, ftlist, ft_list) {
159e6eeb55aSkettenis if (f->ft_obj == obj && f->ft_amap == amap &&
160e6eeb55aSkettenis f->ft_off == off) {
161004d4497Smpi f->ft_refcnt++;
162004d4497Smpi break;
163004d4497Smpi }
164004d4497Smpi }
165004d4497Smpi
166672a12b3Skettenis if ((f == NULL) && (flags & FT_CREATE)) {
167004d4497Smpi /*
168004d4497Smpi * We rely on the rwlock to ensure that no other thread
169004d4497Smpi * create the same futex.
170004d4497Smpi */
171004d4497Smpi f = pool_get(&ftpool, PR_WAITOK);
172004d4497Smpi TAILQ_INIT(&f->ft_threads);
173672a12b3Skettenis f->ft_obj = obj;
174e6eeb55aSkettenis f->ft_amap = amap;
175672a12b3Skettenis f->ft_off = off;
176004d4497Smpi f->ft_refcnt = 1;
177dbb9e73fSvisa LIST_INSERT_HEAD(ftlist, f, ft_list);
178004d4497Smpi }
179004d4497Smpi
180004d4497Smpi return f;
181004d4497Smpi }
182004d4497Smpi
183004d4497Smpi /*
184004d4497Smpi * Release a given futex.
185004d4497Smpi */
186004d4497Smpi void
futex_put(struct futex * f)187004d4497Smpi futex_put(struct futex *f)
188004d4497Smpi {
189004d4497Smpi rw_assert_wrlock(&ftlock);
190004d4497Smpi
191004d4497Smpi KASSERT(f->ft_refcnt > 0);
192004d4497Smpi
193004d4497Smpi --f->ft_refcnt;
194004d4497Smpi if (f->ft_refcnt == 0) {
195004d4497Smpi KASSERT(TAILQ_EMPTY(&f->ft_threads));
196004d4497Smpi LIST_REMOVE(f, ft_list);
197004d4497Smpi pool_put(&ftpool, f);
198004d4497Smpi }
199004d4497Smpi }
200004d4497Smpi
201004d4497Smpi /*
202004d4497Smpi * Put the current thread on the sleep queue of the futex at address
203004d4497Smpi * ``uaddr''. Let it sleep for the specified ``timeout'' time, or
204678831beSjsg * indefinitely if the argument is NULL.
205004d4497Smpi */
206004d4497Smpi int
futex_wait(uint32_t * uaddr,uint32_t val,const struct timespec * timeout,int flags)207672a12b3Skettenis futex_wait(uint32_t *uaddr, uint32_t val, const struct timespec *timeout,
208672a12b3Skettenis int flags)
209004d4497Smpi {
210004d4497Smpi struct proc *p = curproc;
211004d4497Smpi struct futex *f;
212e80014c6Smpi uint64_t nsecs = INFSLP;
213004d4497Smpi uint32_t cval;
214004d4497Smpi int error;
215004d4497Smpi
216004d4497Smpi /*
217004d4497Smpi * After reading the value a race is still possible but
218004d4497Smpi * we deal with it by serializing all futex syscalls.
219004d4497Smpi */
220004d4497Smpi rw_assert_wrlock(&ftlock);
221004d4497Smpi
222004d4497Smpi /*
223004d4497Smpi * Read user space futex value
224004d4497Smpi */
225607fea69Skettenis if ((error = copyin32(uaddr, &cval)))
226004d4497Smpi return error;
227004d4497Smpi
228004d4497Smpi /* If the value changed, stop here. */
229004d4497Smpi if (cval != val)
230004d4497Smpi return EAGAIN;
231004d4497Smpi
232004d4497Smpi if (timeout != NULL) {
233004d4497Smpi struct timespec ts;
234004d4497Smpi
235004d4497Smpi if ((error = copyin(timeout, &ts, sizeof(ts))))
236004d4497Smpi return error;
237004d4497Smpi #ifdef KTRACE
238004d4497Smpi if (KTRPOINT(p, KTR_STRUCT))
239a51004aeSmpi ktrreltimespec(p, &ts);
240004d4497Smpi #endif
2419448fb1bScheloha if (ts.tv_sec < 0 || !timespecisvalid(&ts))
2421cd39fe8Scheloha return EINVAL;
24359fcf85eScheloha nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(&ts), MAXTSLP));
244004d4497Smpi }
245004d4497Smpi
246672a12b3Skettenis f = futex_get(uaddr, flags | FT_CREATE);
247004d4497Smpi TAILQ_INSERT_TAIL(&f->ft_threads, p, p_fut_link);
248004d4497Smpi p->p_futex = f;
249004d4497Smpi
250e80014c6Smpi error = rwsleep_nsec(p, &ftlock, PWAIT|PCATCH, "fsleep", nsecs);
251004d4497Smpi if (error == ERESTART)
252e6f936aeSpirofti error = ECANCELED;
253004d4497Smpi else if (error == EWOULDBLOCK) {
254678831beSjsg /* A race occurred between a wakeup and a timeout. */
255004d4497Smpi if (p->p_futex == NULL)
256004d4497Smpi error = 0;
257004d4497Smpi else
258004d4497Smpi error = ETIMEDOUT;
259004d4497Smpi }
260004d4497Smpi
261004d4497Smpi /* Remove ourself if we haven't been awaken. */
262004d4497Smpi if ((f = p->p_futex) != NULL) {
263004d4497Smpi p->p_futex = NULL;
264004d4497Smpi TAILQ_REMOVE(&f->ft_threads, p, p_fut_link);
265004d4497Smpi futex_put(f);
266004d4497Smpi }
267004d4497Smpi
268004d4497Smpi return error;
269004d4497Smpi }
270004d4497Smpi
271004d4497Smpi /*
272004d4497Smpi * Wakeup at most ``n'' sibling threads sleeping on a futex at address
273004d4497Smpi * ``uaddr'' and requeue at most ``m'' sibling threads on a futex at
274004d4497Smpi * address ``uaddr2''.
275004d4497Smpi */
276004d4497Smpi int
futex_requeue(uint32_t * uaddr,uint32_t n,uint32_t * uaddr2,uint32_t m,int flags)277672a12b3Skettenis futex_requeue(uint32_t *uaddr, uint32_t n, uint32_t *uaddr2, uint32_t m,
278672a12b3Skettenis int flags)
279004d4497Smpi {
280004d4497Smpi struct futex *f, *g;
281004d4497Smpi struct proc *p;
282004d4497Smpi uint32_t count = 0;
283004d4497Smpi
284004d4497Smpi rw_assert_wrlock(&ftlock);
285004d4497Smpi
286672a12b3Skettenis f = futex_get(uaddr, flags);
287004d4497Smpi if (f == NULL)
288004d4497Smpi return 0;
289004d4497Smpi
290004d4497Smpi while ((p = TAILQ_FIRST(&f->ft_threads)) != NULL && (count < (n + m))) {
291004d4497Smpi p->p_futex = NULL;
292004d4497Smpi TAILQ_REMOVE(&f->ft_threads, p, p_fut_link);
293004d4497Smpi futex_put(f);
294004d4497Smpi
295004d4497Smpi if (count < n) {
296004d4497Smpi wakeup_one(p);
297004d4497Smpi } else if (uaddr2 != NULL) {
298004d4497Smpi g = futex_get(uaddr2, FT_CREATE);
299004d4497Smpi TAILQ_INSERT_TAIL(&g->ft_threads, p, p_fut_link);
300004d4497Smpi p->p_futex = g;
301004d4497Smpi }
302004d4497Smpi count++;
303004d4497Smpi }
304004d4497Smpi
305004d4497Smpi futex_put(f);
306004d4497Smpi
307004d4497Smpi return count;
308004d4497Smpi }
309004d4497Smpi
310004d4497Smpi /*
311004d4497Smpi * Wakeup at most ``n'' sibling threads sleeping on a futex at address
312004d4497Smpi * ``uaddr''.
313004d4497Smpi */
314004d4497Smpi int
futex_wake(uint32_t * uaddr,uint32_t n,int flags)315672a12b3Skettenis futex_wake(uint32_t *uaddr, uint32_t n, int flags)
316004d4497Smpi {
317672a12b3Skettenis return futex_requeue(uaddr, n, NULL, 0, flags);
318004d4497Smpi }
319