/*	$NetBSD: sys_memfd.c,v 1.11 2023/08/12 23:22:49 christos Exp $	*/

/*-
 * Copyright (c) 2023 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Theodore Preduta.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_memfd.c,v 1.11 2023/08/12 23:22:49 christos Exp $");

#include <sys/param.h>
#include <sys/types.h>

#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/memfd.h>
#include <sys/mman.h>
#include <sys/syscallargs.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>

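/*
 * F_SEAL_ANY_WRITE groups the seals that deny write access in any form;
 * MFD_KNOWN_SEALS is the set of seal bits accepted by F_ADD_SEALS.
 */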
#define F_SEAL_ANY_WRITE	(F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)
#define MFD_KNOWN_SEALS		(F_SEAL_SEAL|F_SEAL_SHRINK|F_SEAL_GROW \
				|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)

static const char memfd_prefix[] = "memfd:";

static int memfd_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
static int memfd_write(file_t *, off_t *, struct uio *, kauth_cred_t, int);
static int memfd_ioctl(file_t *, u_long, void *);
static int memfd_fcntl(file_t *, u_int, void *);
static int memfd_stat(file_t *, struct stat *);
static int memfd_close(file_t *);
static int memfd_mmap(file_t *, off_t *, size_t, int, int *, int *,
    struct uvm_object **, int *);
static int memfd_seek(file_t *, off_t, int, off_t *, int);
static int memfd_truncate_locked(file_t *, off_t);
static int memfd_truncate(file_t *, off_t);

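/*
 * File operations vector backing every file descriptor returned by
 * memfd_create(2).
 */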
static const struct fileops memfd_fileops = {
	.fo_name = "memfd",
	.fo_read = memfd_read,
	.fo_write = memfd_write,
	.fo_ioctl = memfd_ioctl,
	.fo_fcntl = memfd_fcntl,
	.fo_poll = fnullop_poll,
	.fo_stat = memfd_stat,
	.fo_close = memfd_close,
	.fo_kqfilter = fnullop_kqfilter,
	.fo_restart = fnullop_restart,
	.fo_mmap = memfd_mmap,
	.fo_seek = memfd_seek,
	.fo_fpathconf = (void *)eopnotsupp,
	.fo_posix_fadvise = (void *)eopnotsupp,
	.fo_truncate = memfd_truncate,
};

/*
 * memfd_create(2).  Create a file descriptor associated with anonymous
 * memory.
 */
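/*
 * Illustrative userland usage (a sketch, not part of this file):
 *
 *	int fd = memfd_create("example", MFD_CLOEXEC|MFD_ALLOW_SEALING);
 *	ftruncate(fd, len);
 *	void *p = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
 *	...
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_SEAL);
 */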
int
sys_memfd_create(struct lwp *l, const struct sys_memfd_create_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(const char *) name;
		syscallarg(unsigned int) flags;
	} */
	int error, fd;
	file_t *fp;
	struct memfd *mfd;
	struct proc *p = l->l_proc;
	const unsigned int flags = SCARG(uap, flags);

	if (flags & ~(MFD_CLOEXEC|MFD_ALLOW_SEALING))
		return EINVAL;

	mfd = kmem_zalloc(sizeof(*mfd), KM_SLEEP);
	mfd->mfd_size = 0;
	mfd->mfd_uobj = uao_create(INT64_MAX - PAGE_SIZE, 0); /* same as tmpfs */

	CTASSERT(sizeof(memfd_prefix) < NAME_MAX); /* sanity check */
	strcpy(mfd->mfd_name, memfd_prefix);
	error = copyinstr(SCARG(uap, name),
	    &mfd->mfd_name[sizeof(memfd_prefix) - 1],
	    sizeof(mfd->mfd_name) - sizeof(memfd_prefix), NULL);
	if (error != 0)
		goto leave;

	getnanotime(&mfd->mfd_btime);

	if ((flags & MFD_ALLOW_SEALING) == 0)
		mfd->mfd_seals |= F_SEAL_SEAL;

	error = fd_allocfile(&fp, &fd);
	if (error != 0)
		goto leave;

	fp->f_flag = FREAD|FWRITE;
	fp->f_type = DTYPE_MEMFD;
	fp->f_ops = &memfd_fileops;
	fp->f_memfd = mfd;
	fd_set_exclose(l, fd, (flags & MFD_CLOEXEC) != 0);
	fd_affix(p, fp, fd);

	*retval = fd;
	return 0;

leave:
	uao_detach(mfd->mfd_uobj);
	kmem_free(mfd, sizeof(*mfd));
	return error;
}

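/*
 * read(2) on a memfd: copy data out of the backing anonymous UVM object.
 * Reading at or past the current size transfers nothing.
 */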
static int
memfd_read(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	int error;
	vsize_t todo;
	struct memfd *mfd = fp->f_memfd;

	mutex_enter(&fp->f_lock);

	if (*offp < 0) {
		error = EINVAL;
		goto leave;
	}

	/* Trying to read past the end does nothing. */
	if (*offp >= mfd->mfd_size) {
		error = 0;
		goto leave;
	}

	uio->uio_offset = *offp;
	todo = MIN(uio->uio_resid, mfd->mfd_size - *offp);
	error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
	    UBC_READ|UBC_PARTIALOK);
	if (flags & FOF_UPDATE_OFFSET)
		*offp = uio->uio_offset;

leave:
	getnanotime(&mfd->mfd_atime);

	mutex_exit(&fp->f_lock);

	return error;
}

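/*
 * write(2) on a memfd: refused outright once a write seal is in place.
 * With F_SEAL_GROW the write is clipped to the current size; otherwise
 * the object is grown to accommodate the request.
 */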
static int
memfd_write(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	int error;
	vsize_t todo;
	struct memfd *mfd = fp->f_memfd;

	mutex_enter(&fp->f_lock);

	if (mfd->mfd_seals & F_SEAL_ANY_WRITE) {
		error = EPERM;
		goto leave;
	}

	if (*offp < 0) {
		error = EINVAL;
		goto leave;
	}

	uio->uio_offset = *offp;
	todo = uio->uio_resid;

	if (mfd->mfd_seals & F_SEAL_GROW) {
		if (*offp >= mfd->mfd_size) {
			error = EPERM;
			goto leave;
		}

		/* Truncate the write to fit in mfd_size */
		if (*offp + uio->uio_resid >= mfd->mfd_size)
			todo = mfd->mfd_size - *offp;
	} else if (*offp + uio->uio_resid >= mfd->mfd_size) {
		/* Grow to accommodate the write request. */
		error = memfd_truncate_locked(fp, *offp + uio->uio_resid);
		if (error != 0)
			goto leave;
	}

	error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
	    UBC_WRITE|UBC_PARTIALOK);
	if (flags & FOF_UPDATE_OFFSET)
		*offp = uio->uio_offset;

	getnanotime(&mfd->mfd_mtime);

leave:
	mutex_exit(&fp->f_lock);

	return error;
}

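/*
 * No ioctls are supported on memfds.
 */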
static int
memfd_ioctl(file_t *fp, u_long cmd, void *data)
{

	return EINVAL;
}

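/*
 * fcntl(2) on a memfd: F_GETPATH reports the "memfd:..." name, while
 * F_ADD_SEALS and F_GET_SEALS manipulate the seal set.  Adding
 * F_SEAL_WRITE is refused while the backing object is still mapped.
 */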
static int
memfd_fcntl(file_t *fp, u_int cmd, void *data)
{
	struct memfd *mfd = fp->f_memfd;
	int error = 0;

	switch (cmd) {
	case F_GETPATH:
		strncpy(data, mfd->mfd_name, MAXPATHLEN);
		return 0;

	case F_ADD_SEALS:
		mutex_enter(&fp->f_lock);

		if (mfd->mfd_seals & F_SEAL_SEAL) {
			error = EPERM;
			goto leave_add_seals;
		}

		if (*(int *)data & ~MFD_KNOWN_SEALS) {
			error = EINVAL;
			goto leave_add_seals;
		}

		/*
		 * Can only add F_SEAL_WRITE if there are no currently
		 * open mmaps.
		 *
		 * XXX should only disallow this if there is a currently
		 * open mmap with PROT_WRITE.
		 */
		if ((mfd->mfd_seals & F_SEAL_WRITE) == 0 &&
		    (*(int *)data & F_SEAL_WRITE) != 0 &&
		    mfd->mfd_uobj->uo_refs > 1)
		{
			error = EBUSY;
			goto leave_add_seals;
		}

		mfd->mfd_seals |= *(int *)data;

leave_add_seals:
		mutex_exit(&fp->f_lock);
		return error;

	case F_GET_SEALS:
		mutex_enter(&fp->f_lock);
		*(int *)data = mfd->mfd_seals;
		mutex_exit(&fp->f_lock);
		return 0;

	default:
		return EINVAL;
	}
}

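/*
 * stat(2) on a memfd: synthesize attributes from the in-memory state.
 */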
static int
memfd_stat(file_t *fp, struct stat *st)
{
	struct memfd *mfd = fp->f_memfd;

	mutex_enter(&fp->f_lock);

	memset(st, 0, sizeof(*st));
	st->st_uid = kauth_cred_geteuid(fp->f_cred);
	st->st_gid = kauth_cred_getegid(fp->f_cred);
	st->st_size = mfd->mfd_size;

	st->st_mode = S_IREAD;
	if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) == 0)
		st->st_mode |= S_IWRITE;

	st->st_birthtimespec = mfd->mfd_btime;
	st->st_ctimespec = mfd->mfd_mtime;
	st->st_atimespec = mfd->mfd_atime;
	st->st_mtimespec = mfd->mfd_mtime;

	mutex_exit(&fp->f_lock);

	return 0;
}

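/*
 * Last close: drop the reference on the backing UVM object and free the
 * memfd state.
 */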
static int
memfd_close(file_t *fp)
{
	struct memfd *mfd = fp->f_memfd;

	uao_detach(mfd->mfd_uobj);

	kmem_free(mfd, sizeof(*mfd));
	fp->f_memfd = NULL;

	return 0;
}

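/*
 * mmap(2) on a memfd: hand back a new reference to the backing UVM
 * object.  Writable shared mappings are refused once a write seal is
 * set, and the requested range must lie within the current size.
 */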
static int
memfd_mmap(file_t *fp, off_t *offp, size_t size, int prot, int *flagsp,
    int *advicep, struct uvm_object **uobjp, int *maxprotp)
{
	struct memfd *mfd = fp->f_memfd;
	int error = 0;

	/* uvm_mmap guarantees page-aligned offset and size. */
	KASSERT(*offp == round_page(*offp));
	KASSERT(size == round_page(size));
	KASSERT(size > 0);

	mutex_enter(&fp->f_lock);

	if (*offp < 0) {
		error = EINVAL;
		goto leave;
	}
	if (*offp + size > mfd->mfd_size) {
		error = EINVAL;
		goto leave;
	}

	if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) &&
	    (prot & VM_PROT_WRITE) && (*flagsp & MAP_PRIVATE) == 0) {
		error = EPERM;
		goto leave;
	}

	uao_reference(fp->f_memfd->mfd_uobj);
	*uobjp = fp->f_memfd->mfd_uobj;

	*maxprotp = prot;
	*advicep = UVM_ADV_RANDOM;

leave:
	mutex_exit(&fp->f_lock);

	return error;
}

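/*
 * lseek(2) on a memfd: compute the new offset relative to the start,
 * the current position or the current size.
 */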
static int
memfd_seek(file_t *fp, off_t delta, int whence, off_t *newoffp,
    int flags)
{
	off_t newoff;
	int error = 0;

	mutex_enter(&fp->f_lock);

	switch (whence) {
	case SEEK_CUR:
		newoff = fp->f_offset + delta;
		break;

	case SEEK_END:
		newoff = fp->f_memfd->mfd_size + delta;
		break;

	case SEEK_SET:
		newoff = delta;
		break;

	default:
		error = EINVAL;
		goto leave;
	}

	if (newoffp)
		*newoffp = newoff;
	if (flags & FOF_UPDATE_OFFSET)
		fp->f_offset = newoff;

leave:
	mutex_exit(&fp->f_lock);

	return error;
}

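/*
 * Resize the memfd with fp->f_lock already held.  Growing zero-fills the
 * new range; shrinking frees any now-unused pages of the backing object.
 * F_SEAL_SHRINK and F_SEAL_GROW forbid the corresponding direction.
 */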
static int
memfd_truncate_locked(file_t *fp, off_t length)
{
	struct memfd *mfd = fp->f_memfd;
	voff_t start, end;
	int error = 0;

	KASSERT(mutex_owned(&fp->f_lock));

	if (length < 0)
		return EINVAL;
	if (length == mfd->mfd_size)
		return 0;

	if ((mfd->mfd_seals & F_SEAL_SHRINK) && length < mfd->mfd_size)
		return EPERM;
	if ((mfd->mfd_seals & F_SEAL_GROW) && length > mfd->mfd_size)
		return EPERM;

	if (length > mfd->mfd_size)
		ubc_zerorange(mfd->mfd_uobj, mfd->mfd_size,
		    length - mfd->mfd_size, 0);
	else {
		/* length < mfd->mfd_size, so try to get rid of excess pages */
		start = round_page(length);
		end = round_page(mfd->mfd_size);

		if (start < end) { /* we actually have pages to remove */
			rw_enter(mfd->mfd_uobj->vmobjlock, RW_WRITER);
			error = (*mfd->mfd_uobj->pgops->pgo_put)(mfd->mfd_uobj,
			    start, end, PGO_FREE);
			/* pgo_put drops vmobjlock */
		}
	}

	getnanotime(&mfd->mfd_mtime);
	mfd->mfd_size = length;

	return error;
}

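/*
 * ftruncate(2) on a memfd: take fp->f_lock and resize.
 */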
static int
memfd_truncate(file_t *fp, off_t length)
{
	int error;

	mutex_enter(&fp->f_lock);
	error = memfd_truncate_locked(fp, length);
	mutex_exit(&fp->f_lock);
	return error;
}