/*	$NetBSD: sys_memfd.c,v 1.8 2023/07/29 23:59:59 rin Exp $	*/

/*-
 * Copyright (c) 2023 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Theodore Preduta.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_memfd.c,v 1.8 2023/07/29 23:59:59 rin Exp $");

#include <sys/param.h>
#include <sys/types.h>

#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/memfd.h>
#include <sys/mman.h>
#include <sys/syscallargs.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>

#define F_SEAL_ANY_WRITE	(F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)
#define MFD_KNOWN_SEALS		(F_SEAL_SEAL|F_SEAL_SHRINK|F_SEAL_GROW \
				|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)

static const char memfd_prefix[] = "memfd:";

static int memfd_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
static int memfd_write(file_t *, off_t *, struct uio *, kauth_cred_t, int);
static int memfd_ioctl(file_t *, u_long, void *);
static int memfd_fcntl(file_t *, u_int, void *);
static int memfd_stat(file_t *, struct stat *);
static int memfd_close(file_t *);
static int memfd_mmap(file_t *, off_t *, size_t, int, int *, int *,
    struct uvm_object **, int *);
static int memfd_seek(file_t *, off_t, int, off_t *, int);
static int memfd_truncate_locked(file_t *, off_t);
static int memfd_truncate(file_t *, off_t);

static const struct fileops memfd_fileops = {
	.fo_name = "memfd",
	.fo_read = memfd_read,
	.fo_write = memfd_write,
	.fo_ioctl = memfd_ioctl,
	.fo_fcntl = memfd_fcntl,
	.fo_poll = fnullop_poll,
	.fo_stat = memfd_stat,
	.fo_close = memfd_close,
	.fo_kqfilter = fnullop_kqfilter,
	.fo_restart = fnullop_restart,
	.fo_mmap = memfd_mmap,
	.fo_seek = memfd_seek,
	.fo_fpathconf = (void *)eopnotsupp,
	.fo_posix_fadvise = (void *)eopnotsupp,
	.fo_truncate = memfd_truncate,
};

/*
 * memfd_create(2).  Create a file descriptor associated with anonymous
 * memory.
 */
int
sys_memfd_create(struct lwp *l, const struct sys_memfd_create_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(const char *) name;
		syscallarg(unsigned int) flags;
	} */
	int error, fd;
	file_t *fp;
	struct memfd *mfd;
	struct proc *p = l->l_proc;
	const unsigned int flags = SCARG(uap, flags);

	if (flags & ~(MFD_CLOEXEC|MFD_ALLOW_SEALING))
		return EINVAL;

	mfd = kmem_zalloc(sizeof(*mfd), KM_SLEEP);
	mfd->mfd_size = 0;
	mfd->mfd_uobj = uao_create(INT64_MAX - PAGE_SIZE, 0); /* same as tmpfs */

	CTASSERT(sizeof(memfd_prefix) < NAME_MAX); /* sanity check */
	strcpy(mfd->mfd_name, memfd_prefix);
	error = copyinstr(SCARG(uap, name),
	    &mfd->mfd_name[sizeof(memfd_prefix) - 1],
	    sizeof(mfd->mfd_name) - sizeof(memfd_prefix), NULL);
	if (error != 0)
		goto leave;

	getnanotime(&mfd->mfd_btime);

	if ((flags & MFD_ALLOW_SEALING) == 0)
		mfd->mfd_seals |= F_SEAL_SEAL;

	error = fd_allocfile(&fp, &fd);
	if (error != 0)
		goto leave;

	fp->f_flag = FREAD|FWRITE;
	fp->f_type = DTYPE_MEMFD;
	fp->f_ops = &memfd_fileops;
	fp->f_memfd = mfd;
	fd_set_exclose(l, fd, (flags & MFD_CLOEXEC) != 0);
	fd_affix(p, fp, fd);

	*retval = fd;
	return 0;

leave:
	uao_detach(mfd->mfd_uobj);
	kmem_free(mfd, sizeof(*mfd));
	return error;
}
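
/*
 * Illustrative userland sketch (a hypothetical helper, not part of this
 * file): typical use of memfd_create(2).  The name is a debugging aid
 * only; the kernel stores it with the "memfd:" prefix prepended.  The
 * descriptor starts empty and, absent F_SEAL_GROW, grows on write as
 * memfd_write() below implements.
 *
 *	#include <sys/mman.h>
 *	#include <err.h>
 *	#include <unistd.h>
 *
 *	int
 *	make_scratch_fd(void)
 *	{
 *		int fd;
 *
 *		fd = memfd_create("scratch", MFD_CLOEXEC|MFD_ALLOW_SEALING);
 *		if (fd == -1)
 *			err(1, "memfd_create");
 *		if (write(fd, "hello", 5) != 5)
 *			err(1, "write");
 *		return fd;
 *	}
 */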

static int
memfd_read(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	int error;
	vsize_t todo;
	struct memfd *mfd = fp->f_memfd;

	mutex_enter(&fp->f_lock);

	if (*offp < 0) {
		error = EINVAL;
		goto leave;
	}

	/* Trying to read past the end does nothing. */
	if (*offp >= mfd->mfd_size) {
		error = 0;
		goto leave;
	}

	uio->uio_offset = *offp;
	todo = MIN(uio->uio_resid, mfd->mfd_size - *offp);
	error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
	    UBC_READ|UBC_PARTIALOK);
	if (flags & FOF_UPDATE_OFFSET)
		*offp = uio->uio_offset;

leave:
	getnanotime(&mfd->mfd_atime);

	mutex_exit(&fp->f_lock);

	return error;
}

static int
memfd_write(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	int error;
	vsize_t todo;
	struct memfd *mfd = fp->f_memfd;

	mutex_enter(&fp->f_lock);

	if (mfd->mfd_seals & F_SEAL_ANY_WRITE) {
		error = EPERM;
		goto leave;
	}

	if (*offp < 0) {
		error = EINVAL;
		goto leave;
	}

	uio->uio_offset = *offp;
	todo = uio->uio_resid;

	if (mfd->mfd_seals & F_SEAL_GROW) {
		if (*offp >= mfd->mfd_size) {
			error = EPERM;
			goto leave;
		}

		/* Truncate the write to fit in mfd_size. */
		if (*offp + uio->uio_resid >= mfd->mfd_size)
			todo = mfd->mfd_size - *offp;
	} else if (*offp + uio->uio_resid >= mfd->mfd_size) {
		/* Grow to accommodate the write request. */
		error = memfd_truncate_locked(fp, *offp + uio->uio_resid);
		if (error != 0)
			goto leave;
	}

	error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
	    UBC_WRITE|UBC_PARTIALOK);
	if (flags & FOF_UPDATE_OFFSET)
		*offp = uio->uio_offset;

	getnanotime(&mfd->mfd_mtime);

leave:
	mutex_exit(&fp->f_lock);

	return error;
}

static int
memfd_ioctl(file_t *fp, u_long cmd, void *data)
{

	return EINVAL;
}

static int
memfd_fcntl(file_t *fp, u_int cmd, void *data)
{
	struct memfd *mfd = fp->f_memfd;
	int error = 0;

	switch (cmd) {
	case F_ADD_SEALS:
		mutex_enter(&fp->f_lock);

		if (mfd->mfd_seals & F_SEAL_SEAL) {
			error = EPERM;
			goto leave_add_seals;
		}

		if (*(int *)data & ~MFD_KNOWN_SEALS) {
			error = EINVAL;
			goto leave_add_seals;
		}

		/*
		 * Can only add F_SEAL_WRITE if there are no currently
		 * open mmaps.
		 *
		 * XXX should only disallow if there are currently
		 * open mmaps with PROT_WRITE.
		 */
		if ((mfd->mfd_seals & F_SEAL_WRITE) == 0 &&
		    (*(int *)data & F_SEAL_WRITE) != 0 &&
		    mfd->mfd_uobj->uo_refs > 1)
		{
			error = EBUSY;
			goto leave_add_seals;
		}

		mfd->mfd_seals |= *(int *)data;

leave_add_seals:
		mutex_exit(&fp->f_lock);
		return error;

	case F_GET_SEALS:
		mutex_enter(&fp->f_lock);
		*(int *)data = mfd->mfd_seals;
		mutex_exit(&fp->f_lock);
		return 0;

	default:
		return EINVAL;
	}
}
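
/*
 * Illustrative userland sketch (a hypothetical helper, not part of this
 * file): querying and adding seals via fcntl(2).  Per the F_ADD_SEALS
 * logic above, adding F_SEAL_WRITE fails with EBUSY while the memory
 * object is still mapped, and further F_ADD_SEALS requests fail with
 * EPERM once F_SEAL_SEAL is set.
 *
 *	#include <fcntl.h>
 *	#include <err.h>
 *
 *	void
 *	seal_read_only(int fd)
 *	{
 *		int seals = fcntl(fd, F_GET_SEALS);
 *
 *		if (seals == -1)
 *			err(1, "F_GET_SEALS");
 *		if (seals & F_SEAL_SEAL)
 *			errx(1, "sealing already disabled");
 *		if (fcntl(fd, F_ADD_SEALS,
 *		    F_SEAL_WRITE|F_SEAL_SHRINK|F_SEAL_GROW) == -1)
 *			err(1, "F_ADD_SEALS");
 *	}
 */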

static int
memfd_stat(file_t *fp, struct stat *st)
{
	struct memfd *mfd = fp->f_memfd;

	mutex_enter(&fp->f_lock);

	memset(st, 0, sizeof(*st));
	st->st_uid = kauth_cred_geteuid(fp->f_cred);
	st->st_gid = kauth_cred_getegid(fp->f_cred);
	st->st_size = mfd->mfd_size;

	st->st_mode = S_IREAD;
	if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) == 0)
		st->st_mode |= S_IWRITE;

	st->st_birthtimespec = mfd->mfd_btime;
	st->st_ctimespec = mfd->mfd_mtime;
	st->st_atimespec = mfd->mfd_atime;
	st->st_mtimespec = mfd->mfd_mtime;

	mutex_exit(&fp->f_lock);

	return 0;
}

static int
memfd_close(file_t *fp)
{
	struct memfd *mfd = fp->f_memfd;

	uao_detach(mfd->mfd_uobj);

	kmem_free(mfd, sizeof(*mfd));
	fp->f_memfd = NULL;

	return 0;
}

static int
memfd_mmap(file_t *fp, off_t *offp, size_t size, int prot, int *flagsp,
    int *advicep, struct uvm_object **uobjp, int *maxprotp)
{
	struct memfd *mfd = fp->f_memfd;
	int error = 0;

	/* uvm_mmap guarantees page-aligned offset and size. */
	KASSERT(*offp == round_page(*offp));
	KASSERT(size == round_page(size));
	KASSERT(size > 0);

	mutex_enter(&fp->f_lock);

	if (*offp < 0) {
		error = EINVAL;
		goto leave;
	}
	if (*offp + size > mfd->mfd_size) {
		error = EINVAL;
		goto leave;
	}

	if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) &&
	    (prot & VM_PROT_WRITE) && (*flagsp & MAP_PRIVATE) == 0) {
		error = EPERM;
		goto leave;
	}

	uao_reference(fp->f_memfd->mfd_uobj);
	*uobjp = fp->f_memfd->mfd_uobj;

	*maxprotp = prot;
	*advicep = UVM_ADV_RANDOM;

leave:
	mutex_exit(&fp->f_lock);

	return error;
}

static int
memfd_seek(file_t *fp, off_t delta, int whence, off_t *newoffp,
    int flags)
{
	off_t newoff;
	int error = 0;

	mutex_enter(&fp->f_lock);

	switch (whence) {
	case SEEK_CUR:
		newoff = fp->f_offset + delta;
		break;

	case SEEK_END:
		newoff = fp->f_memfd->mfd_size + delta;
		break;

	case SEEK_SET:
		newoff = delta;
		break;

	default:
		error = EINVAL;
		goto leave;
	}

	if (newoffp)
		*newoffp = newoff;
	if (flags & FOF_UPDATE_OFFSET)
		fp->f_offset = newoff;

leave:
	mutex_exit(&fp->f_lock);

	return error;
}

static int
memfd_truncate_locked(file_t *fp, off_t length)
{
	struct memfd *mfd = fp->f_memfd;
	voff_t start, end;
	int error = 0;

	KASSERT(mutex_owned(&fp->f_lock));

	if (length < 0)
		return EINVAL;
	if (length == mfd->mfd_size)
		return 0;

	if ((mfd->mfd_seals & F_SEAL_SHRINK) && length < mfd->mfd_size)
		return EPERM;
	if ((mfd->mfd_seals & F_SEAL_GROW) && length > mfd->mfd_size)
		return EPERM;

	if (length > mfd->mfd_size)
		ubc_zerorange(mfd->mfd_uobj, mfd->mfd_size,
		    length - mfd->mfd_size, 0);
	else {
		/* length < mfd->mfd_size, so try to get rid of excess pages */
		start = round_page(length);
		end = round_page(mfd->mfd_size);

		if (start < end) { /* we actually have pages to remove */
			rw_enter(mfd->mfd_uobj->vmobjlock, RW_WRITER);
			error = (*mfd->mfd_uobj->pgops->pgo_put)(mfd->mfd_uobj,
			    start, end, PGO_FREE);
			/* pgo_put drops vmobjlock */
		}
	}

	getnanotime(&mfd->mfd_mtime);
	mfd->mfd_size = length;

	return error;
}

static int
memfd_truncate(file_t *fp, off_t length)
{
	int error;

	mutex_enter(&fp->f_lock);
	error = memfd_truncate_locked(fp, length);
	mutex_exit(&fp->f_lock);
	return error;
}
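
/*
 * Illustrative userland sketch (a hypothetical helper, not part of this
 * file): resizing through ftruncate(2), which reaches memfd_truncate()
 * via fo_truncate.  Growing zero-fills the new range (ubc_zerorange
 * above); shrinking frees whole pages past the new end, and lseek(2)
 * with SEEK_END reflects the new size.
 *
 *	#include <err.h>
 *	#include <unistd.h>
 *
 *	void
 *	resize_example(int fd)
 *	{
 *		if (ftruncate(fd, 8192) == -1)
 *			err(1, "ftruncate grow");
 *		if (ftruncate(fd, 4096) == -1)
 *			err(1, "ftruncate shrink");
 *		if (lseek(fd, 0, SEEK_END) != 4096)
 *			errx(1, "unexpected size");
 *	}
 */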