1 /* $NetBSD: sys_memfd.c,v 1.11 2023/08/12 23:22:49 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 2023 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Theodore Preduta. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: sys_memfd.c,v 1.11 2023/08/12 23:22:49 christos Exp $"); 34 35 #include <sys/param.h> 36 #include <sys/types.h> 37 38 #include <sys/fcntl.h> 39 #include <sys/file.h> 40 #include <sys/filedesc.h> 41 #include <sys/memfd.h> 42 #include <sys/mman.h> 43 #include <sys/syscallargs.h> 44 45 #include <uvm/uvm_extern.h> 46 #include <uvm/uvm_object.h> 47 48 #define F_SEAL_ANY_WRITE (F_SEAL_WRITE|F_SEAL_FUTURE_WRITE) 49 #define MFD_KNOWN_SEALS (F_SEAL_SEAL|F_SEAL_SHRINK|F_SEAL_GROW \ 50 |F_SEAL_WRITE|F_SEAL_FUTURE_WRITE) 51 52 static const char memfd_prefix[] = "memfd:"; 53 54 static int memfd_read(file_t *, off_t *, struct uio *, kauth_cred_t, int); 55 static int memfd_write(file_t *, off_t *, struct uio *, kauth_cred_t, int); 56 static int memfd_ioctl(file_t *, u_long, void *); 57 static int memfd_fcntl(file_t *, u_int, void *); 58 static int memfd_stat(file_t *, struct stat *); 59 static int memfd_close(file_t *); 60 static int memfd_mmap(file_t *, off_t *, size_t, int, int *, int *, 61 struct uvm_object **, int *); 62 static int memfd_seek(file_t *, off_t, int, off_t *, int); 63 static int memfd_truncate_locked(file_t *, off_t); 64 static int memfd_truncate(file_t *, off_t); 65 66 static const struct fileops memfd_fileops = { 67 .fo_name = "memfd", 68 .fo_read = memfd_read, 69 .fo_write = memfd_write, 70 .fo_ioctl = memfd_ioctl, 71 .fo_fcntl = memfd_fcntl, 72 .fo_poll = fnullop_poll, 73 .fo_stat = memfd_stat, 74 .fo_close = memfd_close, 75 .fo_kqfilter = fnullop_kqfilter, 76 .fo_restart = fnullop_restart, 77 .fo_mmap = memfd_mmap, 78 .fo_seek = memfd_seek, 79 .fo_fpathconf = (void *)eopnotsupp, 80 .fo_posix_fadvise = (void *)eopnotsupp, 81 .fo_truncate = memfd_truncate, 82 }; 83 84 /* 85 * memfd_create(2). Creat a file descriptor associated with anonymous 86 * memory. 87 */ 88 int 89 sys_memfd_create(struct lwp *l, const struct sys_memfd_create_args *uap, 90 register_t *retval) 91 { 92 /* { 93 syscallarg(const char *) name; 94 syscallarg(unsigned int) flags; 95 } */ 96 int error, fd; 97 file_t *fp; 98 struct memfd *mfd; 99 struct proc *p = l->l_proc; 100 const unsigned int flags = SCARG(uap, flags); 101 102 if (flags & ~(MFD_CLOEXEC|MFD_ALLOW_SEALING)) 103 return EINVAL; 104 105 mfd = kmem_zalloc(sizeof(*mfd), KM_SLEEP); 106 mfd->mfd_size = 0; 107 mfd->mfd_uobj = uao_create(INT64_MAX - PAGE_SIZE, 0); /* same as tmpfs */ 108 109 CTASSERT(sizeof(memfd_prefix) < NAME_MAX); /* sanity check */ 110 strcpy(mfd->mfd_name, memfd_prefix); 111 error = copyinstr(SCARG(uap, name), 112 &mfd->mfd_name[sizeof(memfd_prefix) - 1], 113 sizeof(mfd->mfd_name) - sizeof(memfd_prefix), NULL); 114 if (error != 0) 115 goto leave; 116 117 getnanotime(&mfd->mfd_btime); 118 119 if ((flags & MFD_ALLOW_SEALING) == 0) 120 mfd->mfd_seals |= F_SEAL_SEAL; 121 122 error = fd_allocfile(&fp, &fd); 123 if (error != 0) 124 goto leave; 125 126 fp->f_flag = FREAD|FWRITE; 127 fp->f_type = DTYPE_MEMFD; 128 fp->f_ops = &memfd_fileops; 129 fp->f_memfd = mfd; 130 fd_set_exclose(l, fd, (flags & MFD_CLOEXEC) != 0); 131 fd_affix(p, fp, fd); 132 133 *retval = fd; 134 return 0; 135 136 leave: 137 uao_detach(mfd->mfd_uobj); 138 kmem_free(mfd, sizeof(*mfd)); 139 return error; 140 } 141 142 static int 143 memfd_read(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred, 144 int flags) 145 { 146 int error; 147 vsize_t todo; 148 struct memfd *mfd = fp->f_memfd; 149 150 mutex_enter(&fp->f_lock); 151 152 if (*offp < 0) { 153 error = EINVAL; 154 goto leave; 155 } 156 157 /* Trying to read past the end does nothing. */ 158 if (*offp >= mfd->mfd_size) { 159 error = 0; 160 goto leave; 161 } 162 163 uio->uio_offset = *offp; 164 todo = MIN(uio->uio_resid, mfd->mfd_size - *offp); 165 error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL, 166 UBC_READ|UBC_PARTIALOK); 167 if (flags & FOF_UPDATE_OFFSET) 168 *offp = uio->uio_offset; 169 170 leave: 171 getnanotime(&mfd->mfd_atime); 172 173 174 mutex_exit(&fp->f_lock); 175 176 return error; 177 } 178 179 static int 180 memfd_write(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred, 181 int flags) 182 { 183 int error; 184 vsize_t todo; 185 struct memfd *mfd = fp->f_memfd; 186 187 mutex_enter(&fp->f_lock); 188 189 if (mfd->mfd_seals & F_SEAL_ANY_WRITE) { 190 error = EPERM; 191 goto leave; 192 } 193 194 if (*offp < 0) { 195 error = EINVAL; 196 goto leave; 197 } 198 199 uio->uio_offset = *offp; 200 todo = uio->uio_resid; 201 202 if (mfd->mfd_seals & F_SEAL_GROW) { 203 if (*offp >= mfd->mfd_size) { 204 error = EPERM; 205 goto leave; 206 } 207 208 /* Truncate the write to fit in mfd_size */ 209 if (*offp + uio->uio_resid >= mfd->mfd_size) 210 todo = mfd->mfd_size - *offp; 211 } else if (*offp + uio->uio_resid >= mfd->mfd_size) { 212 /* Grow to accommodate the write request. */ 213 error = memfd_truncate_locked(fp, *offp + uio->uio_resid); 214 if (error != 0) 215 goto leave; 216 } 217 218 error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL, 219 UBC_WRITE|UBC_PARTIALOK); 220 if (flags & FOF_UPDATE_OFFSET) 221 *offp = uio->uio_offset; 222 223 getnanotime(&mfd->mfd_mtime); 224 225 leave: 226 mutex_exit(&fp->f_lock); 227 228 return error; 229 } 230 231 static int 232 memfd_ioctl(file_t *fp, u_long cmd, void *data) 233 { 234 235 return EINVAL; 236 } 237 238 static int 239 memfd_fcntl(file_t *fp, u_int cmd, void *data) 240 { 241 struct memfd *mfd = fp->f_memfd; 242 int error = 0; 243 244 switch (cmd) { 245 case F_GETPATH: 246 strncpy(data, mfd->mfd_name, MAXPATHLEN); 247 return 0; 248 249 case F_ADD_SEALS: 250 mutex_enter(&fp->f_lock); 251 252 if (mfd->mfd_seals & F_SEAL_SEAL) { 253 error = EPERM; 254 goto leave_add_seals; 255 } 256 257 if (*(int *)data & ~MFD_KNOWN_SEALS) { 258 error = EINVAL; 259 goto leave_add_seals; 260 } 261 262 /* 263 * Can only add F_SEAL_WRITE if there are no currently 264 * open mmaps. 265 * 266 * XXX should only disallow if there are no currently 267 * open mmaps with PROT_WRITE. 268 */ 269 if ((mfd->mfd_seals & F_SEAL_WRITE) == 0 && 270 (*(int *)data & F_SEAL_WRITE) != 0 && 271 mfd->mfd_uobj->uo_refs > 1) 272 { 273 error = EBUSY; 274 goto leave_add_seals; 275 } 276 277 mfd->mfd_seals |= *(int *)data; 278 279 leave_add_seals: 280 mutex_exit(&fp->f_lock); 281 return error; 282 283 case F_GET_SEALS: 284 mutex_enter(&fp->f_lock); 285 *(int *)data = mfd->mfd_seals; 286 mutex_exit(&fp->f_lock); 287 return 0; 288 289 default: 290 return EINVAL; 291 } 292 } 293 294 static int 295 memfd_stat(file_t *fp, struct stat *st) 296 { 297 struct memfd *mfd = fp->f_memfd; 298 299 mutex_enter(&fp->f_lock); 300 301 memset(st, 0, sizeof(*st)); 302 st->st_uid = kauth_cred_geteuid(fp->f_cred); 303 st->st_gid = kauth_cred_getegid(fp->f_cred); 304 st->st_size = mfd->mfd_size; 305 306 st->st_mode = S_IREAD; 307 if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) == 0) 308 st->st_mode |= S_IWRITE; 309 310 st->st_birthtimespec = mfd->mfd_btime; 311 st->st_ctimespec = mfd->mfd_mtime; 312 st->st_atimespec = mfd->mfd_atime; 313 st->st_mtimespec = mfd->mfd_mtime; 314 315 mutex_exit(&fp->f_lock); 316 317 return 0; 318 } 319 320 static int 321 memfd_close(file_t *fp) 322 { 323 struct memfd *mfd = fp->f_memfd; 324 325 uao_detach(mfd->mfd_uobj); 326 327 kmem_free(mfd, sizeof(*mfd)); 328 fp->f_memfd = NULL; 329 330 return 0; 331 } 332 333 static int 334 memfd_mmap(file_t *fp, off_t *offp, size_t size, int prot, int *flagsp, 335 int *advicep, struct uvm_object **uobjp, int *maxprotp) 336 { 337 struct memfd *mfd = fp->f_memfd; 338 int error = 0; 339 340 /* uvm_mmap guarantees page-aligned offset and size. */ 341 KASSERT(*offp == round_page(*offp)); 342 KASSERT(size == round_page(size)); 343 KASSERT(size > 0); 344 345 mutex_enter(&fp->f_lock); 346 347 if (*offp < 0) { 348 error = EINVAL; 349 goto leave; 350 } 351 if (*offp + size > mfd->mfd_size) { 352 error = EINVAL; 353 goto leave; 354 } 355 356 if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) && 357 (prot & VM_PROT_WRITE) && (*flagsp & MAP_PRIVATE) == 0) { 358 error = EPERM; 359 goto leave; 360 } 361 362 uao_reference(fp->f_memfd->mfd_uobj); 363 *uobjp = fp->f_memfd->mfd_uobj; 364 365 *maxprotp = prot; 366 *advicep = UVM_ADV_RANDOM; 367 368 leave: 369 mutex_exit(&fp->f_lock); 370 371 return error; 372 } 373 374 static int 375 memfd_seek(file_t *fp, off_t delta, int whence, off_t *newoffp, 376 int flags) 377 { 378 off_t newoff; 379 int error = 0; 380 381 mutex_enter(&fp->f_lock); 382 383 switch (whence) { 384 case SEEK_CUR: 385 newoff = fp->f_offset + delta; 386 break; 387 388 case SEEK_END: 389 newoff = fp->f_memfd->mfd_size + delta; 390 break; 391 392 case SEEK_SET: 393 newoff = delta; 394 break; 395 396 default: 397 error = EINVAL; 398 goto leave; 399 } 400 401 if (newoffp) 402 *newoffp = newoff; 403 if (flags & FOF_UPDATE_OFFSET) 404 fp->f_offset = newoff; 405 406 leave: 407 mutex_exit(&fp->f_lock); 408 409 return error; 410 } 411 412 static int 413 memfd_truncate_locked(file_t *fp, off_t length) 414 { 415 struct memfd *mfd = fp->f_memfd; 416 voff_t start, end; 417 int error = 0; 418 419 KASSERT(mutex_owned(&fp->f_lock)); 420 421 if (length < 0) 422 return EINVAL; 423 if (length == mfd->mfd_size) 424 return 0; 425 426 if ((mfd->mfd_seals & F_SEAL_SHRINK) && length < mfd->mfd_size) 427 return EPERM; 428 if ((mfd->mfd_seals & F_SEAL_GROW) && length > mfd->mfd_size) 429 return EPERM; 430 431 if (length > mfd->mfd_size) 432 ubc_zerorange(mfd->mfd_uobj, mfd->mfd_size, 433 length - mfd->mfd_size, 0); 434 else { 435 /* length < mfd->mfd_size, so try to get rid of excess pages */ 436 start = round_page(length); 437 end = round_page(mfd->mfd_size); 438 439 if (start < end) { /* we actually have pages to remove */ 440 rw_enter(mfd->mfd_uobj->vmobjlock, RW_WRITER); 441 error = (*mfd->mfd_uobj->pgops->pgo_put)(mfd->mfd_uobj, 442 start, end, PGO_FREE); 443 /* pgo_put drops vmobjlock */ 444 } 445 } 446 447 getnanotime(&mfd->mfd_mtime); 448 mfd->mfd_size = length; 449 450 return error; 451 } 452 453 static int 454 memfd_truncate(file_t *fp, off_t length) 455 { 456 int error; 457 458 mutex_enter(&fp->f_lock); 459 error = memfd_truncate_locked(fp, length); 460 mutex_exit(&fp->f_lock); 461 return error; 462 } 463