xref: /openbsd-src/sys/kern/sys_futex.c (revision abbd98eac20dc6a00c56b71aeec2ac32f8e8f95a)
1*abbd98eaSmiod /*	$OpenBSD: sys_futex.c,v 1.22 2023/08/14 07:42:34 miod Exp $ */
2004d4497Smpi 
3004d4497Smpi /*
4004d4497Smpi  * Copyright (c) 2016-2017 Martin Pieuchot
5004d4497Smpi  *
6004d4497Smpi  * Permission to use, copy, modify, and distribute this software for any
7004d4497Smpi  * purpose with or without fee is hereby granted, provided that the above
8004d4497Smpi  * copyright notice and this permission notice appear in all copies.
9004d4497Smpi  *
10004d4497Smpi  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11004d4497Smpi  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12004d4497Smpi  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13004d4497Smpi  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14004d4497Smpi  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15004d4497Smpi  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16004d4497Smpi  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17004d4497Smpi  */
18004d4497Smpi 
19004d4497Smpi #include <sys/param.h>
20004d4497Smpi #include <sys/systm.h>
21004d4497Smpi #include <sys/proc.h>
22004d4497Smpi #include <sys/mount.h>
23004d4497Smpi #include <sys/syscallargs.h>
24004d4497Smpi #include <sys/pool.h>
25004d4497Smpi #include <sys/time.h>
26004d4497Smpi #include <sys/rwlock.h>
27004d4497Smpi #include <sys/futex.h>
28004d4497Smpi 
29004d4497Smpi #ifdef KTRACE
30004d4497Smpi #include <sys/ktrace.h>
31004d4497Smpi #endif
32004d4497Smpi 
33672a12b3Skettenis #include <uvm/uvm.h>
34672a12b3Skettenis 
35004d4497Smpi /*
36004d4497Smpi  * Kernel representation of a futex.
37004d4497Smpi  */
38004d4497Smpi struct futex {
39004d4497Smpi 	LIST_ENTRY(futex)	 ft_list;	/* list of all futexes */
40004d4497Smpi 	TAILQ_HEAD(, proc)	 ft_threads;	/* sleeping queue */
41672a12b3Skettenis 	struct uvm_object	*ft_obj;	/* UVM object */
42e6eeb55aSkettenis 	struct vm_amap		*ft_amap;	/* UVM amap */
43672a12b3Skettenis 	voff_t			 ft_off;	/* UVM offset */
44004d4497Smpi 	unsigned int		 ft_refcnt;	/* # of references */
45004d4497Smpi };
46004d4497Smpi 
47004d4497Smpi /* Syscall helpers. */
48672a12b3Skettenis int	 futex_wait(uint32_t *, uint32_t, const struct timespec *, int);
49672a12b3Skettenis int	 futex_wake(uint32_t *, uint32_t, int);
50672a12b3Skettenis int	 futex_requeue(uint32_t *, uint32_t, uint32_t *, uint32_t, int);
51004d4497Smpi 
52004d4497Smpi /* Flags for futex_get(). */
53004d4497Smpi #define FT_CREATE	0x1	/* Create a futex if it doesn't exist. */
54672a12b3Skettenis #define FT_PRIVATE	0x2	/* Futex is process-private. */
55004d4497Smpi 
56004d4497Smpi struct futex *futex_get(uint32_t *, int);
57004d4497Smpi void	 futex_put(struct futex *);
58004d4497Smpi 
59004d4497Smpi /*
60dbb9e73fSvisa  * The global futex lock serializes futex(2) calls so that no wakeup
61dbb9e73fSvisa  * event is lost, and protects all futex lists and futex states.
62004d4497Smpi  */
63004d4497Smpi struct rwlock			ftlock = RWLOCK_INITIALIZER("futex");
64dbb9e73fSvisa static struct futex_list	ftlist_shared =
65dbb9e73fSvisa 				    LIST_HEAD_INITIALIZER(ftlist_shared);
66004d4497Smpi struct pool			ftpool;
67004d4497Smpi 
68004d4497Smpi 
69004d4497Smpi void
futex_init(void)70004d4497Smpi futex_init(void)
71004d4497Smpi {
72703735beSguenther 	pool_init(&ftpool, sizeof(struct futex), 0, IPL_NONE,
73703735beSguenther 	    PR_WAITOK | PR_RWLOCK, "futexpl", NULL);
74004d4497Smpi }
75004d4497Smpi 
76004d4497Smpi int
sys_futex(struct proc * p,void * v,register_t * retval)77004d4497Smpi sys_futex(struct proc *p, void *v, register_t *retval)
78004d4497Smpi {
79004d4497Smpi 	struct sys_futex_args /* {
80004d4497Smpi 		syscallarg(uint32_t *) f;
81004d4497Smpi 		syscallarg(int) op;
82004d4497Smpi 		syscallarg(inr) val;
83004d4497Smpi 		syscallarg(const struct timespec *) timeout;
84004d4497Smpi 		syscallarg(uint32_t *) g;
85004d4497Smpi 	} */ *uap = v;
86004d4497Smpi 	uint32_t *uaddr = SCARG(uap, f);
87004d4497Smpi 	int op = SCARG(uap, op);
88004d4497Smpi 	uint32_t val = SCARG(uap, val);
89004d4497Smpi 	const struct timespec *timeout = SCARG(uap, timeout);
90004d4497Smpi 	void *g = SCARG(uap, g);
91672a12b3Skettenis 	int flags = 0;
923288ea8fSkettenis 	int error = 0;
93672a12b3Skettenis 
94672a12b3Skettenis 	if (op & FUTEX_PRIVATE_FLAG)
95672a12b3Skettenis 		flags |= FT_PRIVATE;
96004d4497Smpi 
97adec12fbSvisa 	rw_enter_write(&ftlock);
98004d4497Smpi 	switch (op) {
99004d4497Smpi 	case FUTEX_WAIT:
100672a12b3Skettenis 	case FUTEX_WAIT_PRIVATE:
1013288ea8fSkettenis 		error = futex_wait(uaddr, val, timeout, flags);
102004d4497Smpi 		break;
103004d4497Smpi 	case FUTEX_WAKE:
104672a12b3Skettenis 	case FUTEX_WAKE_PRIVATE:
105672a12b3Skettenis 		*retval = futex_wake(uaddr, val, flags);
106004d4497Smpi 		break;
107004d4497Smpi 	case FUTEX_REQUEUE:
108672a12b3Skettenis 	case FUTEX_REQUEUE_PRIVATE:
109672a12b3Skettenis 		*retval = futex_requeue(uaddr, val, g, (u_long)timeout, flags);
110004d4497Smpi 		break;
111004d4497Smpi 	default:
1123288ea8fSkettenis 		error = ENOSYS;
113004d4497Smpi 		break;
114004d4497Smpi 	}
115adec12fbSvisa 	rw_exit_write(&ftlock);
116004d4497Smpi 
1173288ea8fSkettenis 	return error;
118004d4497Smpi }
119004d4497Smpi 
120004d4497Smpi /*
121004d4497Smpi  * Return an existing futex matching userspace address ``uaddr''.
122004d4497Smpi  *
123004d4497Smpi  * If such futex does not exist and FT_CREATE is given, create it.
124004d4497Smpi  */
125004d4497Smpi struct futex *
futex_get(uint32_t * uaddr,int flags)126672a12b3Skettenis futex_get(uint32_t *uaddr, int flags)
127004d4497Smpi {
1283b07b489Sderaadt 	struct proc *p = curproc;
129672a12b3Skettenis 	vm_map_t map = &p->p_vmspace->vm_map;
130672a12b3Skettenis 	vm_map_entry_t entry;
131672a12b3Skettenis 	struct uvm_object *obj = NULL;
132e6eeb55aSkettenis 	struct vm_amap *amap = NULL;
133672a12b3Skettenis 	voff_t off = (vaddr_t)uaddr;
134004d4497Smpi 	struct futex *f;
135dbb9e73fSvisa 	struct futex_list *ftlist = &p->p_p->ps_ftlist;
136004d4497Smpi 
137004d4497Smpi 	rw_assert_wrlock(&ftlock);
138004d4497Smpi 
139672a12b3Skettenis 	if (!(flags & FT_PRIVATE)) {
140672a12b3Skettenis 		vm_map_lock_read(map);
141672a12b3Skettenis 		if (uvm_map_lookup_entry(map, (vaddr_t)uaddr, &entry) &&
142672a12b3Skettenis 		    entry->inheritance == MAP_INHERIT_SHARE) {
143e6eeb55aSkettenis 			if (UVM_ET_ISOBJ(entry)) {
144dbb9e73fSvisa 				ftlist = &ftlist_shared;
145672a12b3Skettenis 				obj = entry->object.uvm_obj;
146e6eeb55aSkettenis 				off = entry->offset +
147e6eeb55aSkettenis 				    ((vaddr_t)uaddr - entry->start);
148e6eeb55aSkettenis 			} else if (entry->aref.ar_amap) {
149e6eeb55aSkettenis 				ftlist = &ftlist_shared;
150e6eeb55aSkettenis 				amap = entry->aref.ar_amap;
151e6eeb55aSkettenis 				off = ptoa(entry->aref.ar_pageoff) +
152e6eeb55aSkettenis 				    ((vaddr_t)uaddr - entry->start);
153e6eeb55aSkettenis 			}
154672a12b3Skettenis 		}
155672a12b3Skettenis 		vm_map_unlock_read(map);
156672a12b3Skettenis 	}
157672a12b3Skettenis 
158dbb9e73fSvisa 	LIST_FOREACH(f, ftlist, ft_list) {
159e6eeb55aSkettenis 		if (f->ft_obj == obj && f->ft_amap == amap &&
160e6eeb55aSkettenis 		    f->ft_off == off) {
161004d4497Smpi 			f->ft_refcnt++;
162004d4497Smpi 			break;
163004d4497Smpi 		}
164004d4497Smpi 	}
165004d4497Smpi 
166672a12b3Skettenis 	if ((f == NULL) && (flags & FT_CREATE)) {
167004d4497Smpi 		/*
168004d4497Smpi 		 * We rely on the rwlock to ensure that no other thread
169004d4497Smpi 		 * create the same futex.
170004d4497Smpi 		 */
171004d4497Smpi 		f = pool_get(&ftpool, PR_WAITOK);
172004d4497Smpi 		TAILQ_INIT(&f->ft_threads);
173672a12b3Skettenis 		f->ft_obj = obj;
174e6eeb55aSkettenis 		f->ft_amap = amap;
175672a12b3Skettenis 		f->ft_off = off;
176004d4497Smpi 		f->ft_refcnt = 1;
177dbb9e73fSvisa 		LIST_INSERT_HEAD(ftlist, f, ft_list);
178004d4497Smpi 	}
179004d4497Smpi 
180004d4497Smpi 	return f;
181004d4497Smpi }
182004d4497Smpi 
183004d4497Smpi /*
184004d4497Smpi  * Release a given futex.
185004d4497Smpi  */
186004d4497Smpi void
futex_put(struct futex * f)187004d4497Smpi futex_put(struct futex *f)
188004d4497Smpi {
189004d4497Smpi 	rw_assert_wrlock(&ftlock);
190004d4497Smpi 
191004d4497Smpi 	KASSERT(f->ft_refcnt > 0);
192004d4497Smpi 
193004d4497Smpi 	--f->ft_refcnt;
194004d4497Smpi 	if (f->ft_refcnt == 0) {
195004d4497Smpi 		KASSERT(TAILQ_EMPTY(&f->ft_threads));
196004d4497Smpi 		LIST_REMOVE(f, ft_list);
197004d4497Smpi 		pool_put(&ftpool, f);
198004d4497Smpi 	}
199004d4497Smpi }
200004d4497Smpi 
201004d4497Smpi /*
202004d4497Smpi  * Put the current thread on the sleep queue of the futex at address
203004d4497Smpi  * ``uaddr''.  Let it sleep for the specified ``timeout'' time, or
204678831beSjsg  * indefinitely if the argument is NULL.
205004d4497Smpi  */
206004d4497Smpi int
futex_wait(uint32_t * uaddr,uint32_t val,const struct timespec * timeout,int flags)207672a12b3Skettenis futex_wait(uint32_t *uaddr, uint32_t val, const struct timespec *timeout,
208672a12b3Skettenis     int flags)
209004d4497Smpi {
210004d4497Smpi 	struct proc *p = curproc;
211004d4497Smpi 	struct futex *f;
212e80014c6Smpi 	uint64_t nsecs = INFSLP;
213004d4497Smpi 	uint32_t cval;
214004d4497Smpi 	int error;
215004d4497Smpi 
216004d4497Smpi 	/*
217004d4497Smpi 	 * After reading the value a race is still possible but
218004d4497Smpi 	 * we deal with it by serializing all futex syscalls.
219004d4497Smpi 	 */
220004d4497Smpi 	rw_assert_wrlock(&ftlock);
221004d4497Smpi 
222004d4497Smpi 	/*
223004d4497Smpi 	 * Read user space futex value
224004d4497Smpi 	 */
225607fea69Skettenis 	if ((error = copyin32(uaddr, &cval)))
226004d4497Smpi 		return error;
227004d4497Smpi 
228004d4497Smpi 	/* If the value changed, stop here. */
229004d4497Smpi 	if (cval != val)
230004d4497Smpi 		return EAGAIN;
231004d4497Smpi 
232004d4497Smpi 	if (timeout != NULL) {
233004d4497Smpi 		struct timespec ts;
234004d4497Smpi 
235004d4497Smpi 		if ((error = copyin(timeout, &ts, sizeof(ts))))
236004d4497Smpi 			return error;
237004d4497Smpi #ifdef KTRACE
238004d4497Smpi 		if (KTRPOINT(p, KTR_STRUCT))
239a51004aeSmpi 			ktrreltimespec(p, &ts);
240004d4497Smpi #endif
2419448fb1bScheloha 		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
2421cd39fe8Scheloha 			return EINVAL;
24359fcf85eScheloha 		nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(&ts), MAXTSLP));
244004d4497Smpi 	}
245004d4497Smpi 
246672a12b3Skettenis 	f = futex_get(uaddr, flags | FT_CREATE);
247004d4497Smpi 	TAILQ_INSERT_TAIL(&f->ft_threads, p, p_fut_link);
248004d4497Smpi 	p->p_futex = f;
249004d4497Smpi 
250e80014c6Smpi 	error = rwsleep_nsec(p, &ftlock, PWAIT|PCATCH, "fsleep", nsecs);
251004d4497Smpi 	if (error == ERESTART)
252e6f936aeSpirofti 		error = ECANCELED;
253004d4497Smpi 	else if (error == EWOULDBLOCK) {
254678831beSjsg 		/* A race occurred between a wakeup and a timeout. */
255004d4497Smpi 		if (p->p_futex == NULL)
256004d4497Smpi 			error = 0;
257004d4497Smpi 		else
258004d4497Smpi 			error = ETIMEDOUT;
259004d4497Smpi 	}
260004d4497Smpi 
261004d4497Smpi 	/* Remove ourself if we haven't been awaken. */
262004d4497Smpi 	if ((f = p->p_futex) != NULL) {
263004d4497Smpi 		p->p_futex = NULL;
264004d4497Smpi 		TAILQ_REMOVE(&f->ft_threads, p, p_fut_link);
265004d4497Smpi 		futex_put(f);
266004d4497Smpi 	}
267004d4497Smpi 
268004d4497Smpi 	return error;
269004d4497Smpi }
270004d4497Smpi 
271004d4497Smpi /*
272004d4497Smpi  * Wakeup at most ``n'' sibling threads sleeping on a futex at address
273004d4497Smpi  * ``uaddr'' and requeue at most ``m'' sibling threads on a futex at
274004d4497Smpi  * address ``uaddr2''.
275004d4497Smpi  */
276004d4497Smpi int
futex_requeue(uint32_t * uaddr,uint32_t n,uint32_t * uaddr2,uint32_t m,int flags)277672a12b3Skettenis futex_requeue(uint32_t *uaddr, uint32_t n, uint32_t *uaddr2, uint32_t m,
278672a12b3Skettenis     int flags)
279004d4497Smpi {
280004d4497Smpi 	struct futex *f, *g;
281004d4497Smpi 	struct proc *p;
282004d4497Smpi 	uint32_t count = 0;
283004d4497Smpi 
284004d4497Smpi 	rw_assert_wrlock(&ftlock);
285004d4497Smpi 
286672a12b3Skettenis 	f = futex_get(uaddr, flags);
287004d4497Smpi 	if (f == NULL)
288004d4497Smpi 		return 0;
289004d4497Smpi 
290004d4497Smpi 	while ((p = TAILQ_FIRST(&f->ft_threads)) != NULL && (count < (n + m))) {
291004d4497Smpi 		p->p_futex = NULL;
292004d4497Smpi 		TAILQ_REMOVE(&f->ft_threads, p, p_fut_link);
293004d4497Smpi 		futex_put(f);
294004d4497Smpi 
295004d4497Smpi 		if (count < n) {
296004d4497Smpi 			wakeup_one(p);
297004d4497Smpi 		} else if (uaddr2 != NULL) {
298004d4497Smpi 			g = futex_get(uaddr2, FT_CREATE);
299004d4497Smpi 			TAILQ_INSERT_TAIL(&g->ft_threads, p, p_fut_link);
300004d4497Smpi 			p->p_futex = g;
301004d4497Smpi 		}
302004d4497Smpi 		count++;
303004d4497Smpi 	}
304004d4497Smpi 
305004d4497Smpi 	futex_put(f);
306004d4497Smpi 
307004d4497Smpi 	return count;
308004d4497Smpi }
309004d4497Smpi 
310004d4497Smpi /*
311004d4497Smpi  * Wakeup at most ``n'' sibling threads sleeping on a futex at address
312004d4497Smpi  * ``uaddr''.
313004d4497Smpi  */
314004d4497Smpi int
futex_wake(uint32_t * uaddr,uint32_t n,int flags)315672a12b3Skettenis futex_wake(uint32_t *uaddr, uint32_t n, int flags)
316004d4497Smpi {
317672a12b3Skettenis 	return futex_requeue(uaddr, n, NULL, 0, flags);
318004d4497Smpi }
319