/*
 * Copyright (c) 2004 The DragonFly Project. All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/vfs_jops.c,v 1.7 2005/02/28 17:41:00 dillon Exp $
 */
/*
 * Each mount point may have zero or more independently configured journals
 * attached to it.
 * Each journal is represented by a memory FIFO and worker
 * thread. Journal events are streamed through the FIFO to the thread,
 * batched up (typically on one-second intervals), and written out by the
 * thread.
 *
 * Journal vnode ops are executed instead of mnt_vn_norm_ops when one or
 * more journals have been installed on a mount point. It becomes the
 * responsibility of the journal op to call the underlying normal op as
 * appropriate.
 *
 * The journaling protocol is intended to evolve into a two-way stream
 * whereby transaction IDs can be acknowledged by the journaling target
 * when the data has been committed to hard storage. Both implicit and
 * explicit acknowledgement schemes will be supported, depending on the
 * sophistication of the journaling stream, plus resynchronization and
 * restart when a journaling stream is interrupted. This information will
 * also be made available to journaling-aware filesystems to allow better
 * management of their own physical storage synchronization mechanisms as
 * well as to allow such filesystems to take direct advantage of the kernel's
 * journaling layer so they don't have to roll their own.
 *
 * In addition, the worker thread will have access to much larger
 * spooling areas than the memory buffer is able to provide by e.g.
 * reserving swap space, in order to absorb potentially long interruptions
 * of off-site journaling streams, and to prevent 'slow' off-site linkages
 * from radically slowing down local filesystem operations.
 *
 * Because of the non-trivial algorithms the journaling system will be
 * required to support, use of a worker thread is mandatory.
Efficiencies 67 * are maintained by utilitizing the memory FIFO to batch transactions when 68 * possible, reducing the number of gratuitous thread switches and taking 69 * advantage of cpu caches through the use of shorter batched code paths 70 * rather then trying to do everything in the context of the process 71 * originating the filesystem op. In the future the memory FIFO can be 72 * made per-cpu to remove BGL or other locking requirements. 73 */ 74 #include <sys/param.h> 75 #include <sys/systm.h> 76 #include <sys/buf.h> 77 #include <sys/conf.h> 78 #include <sys/kernel.h> 79 #include <sys/queue.h> 80 #include <sys/lock.h> 81 #include <sys/malloc.h> 82 #include <sys/mount.h> 83 #include <sys/unistd.h> 84 #include <sys/vnode.h> 85 #include <sys/poll.h> 86 #include <sys/mountctl.h> 87 #include <sys/journal.h> 88 #include <sys/file.h> 89 #include <sys/proc.h> 90 91 #include <machine/limits.h> 92 93 #include <vm/vm.h> 94 #include <vm/vm_object.h> 95 #include <vm/vm_page.h> 96 #include <vm/vm_pager.h> 97 #include <vm/vnode_pager.h> 98 99 #include <sys/file2.h> 100 #include <sys/thread2.h> 101 102 static int journal_attach(struct mount *mp); 103 static void journal_detach(struct mount *mp); 104 static int journal_install_vfs_journal(struct mount *mp, struct file *fp, 105 const struct mountctl_install_journal *info); 106 static int journal_remove_vfs_journal(struct mount *mp, 107 const struct mountctl_remove_journal *info); 108 static int journal_resync_vfs_journal(struct mount *mp, const void *ctl); 109 static int journal_status_vfs_journal(struct mount *mp, 110 const struct mountctl_status_journal *info, 111 struct mountctl_journal_ret_status *rstat, 112 int buflen, int *res); 113 static void journal_thread(void *info); 114 115 static void *journal_reserve(struct journal *jo, 116 struct journal_rawrecbeg **rawpp, 117 int16_t streamid, int bytes); 118 static void *journal_extend(struct journal *jo, 119 struct journal_rawrecbeg **rawpp, 120 int truncbytes, int bytes, 
int *newstreamrecp); 121 static void journal_abort(struct journal *jo, 122 struct journal_rawrecbeg **rawpp); 123 static void journal_commit(struct journal *jo, 124 struct journal_rawrecbeg **rawpp, 125 int bytes, int closeout); 126 127 static void jrecord_init(struct journal *jo, 128 struct jrecord *jrec, int16_t streamid); 129 static struct journal_subrecord *jrecord_push( 130 struct jrecord *jrec, int16_t rectype); 131 static void jrecord_pop(struct jrecord *jrec, struct journal_subrecord *parent); 132 static struct journal_subrecord *jrecord_write(struct jrecord *jrec, 133 int16_t rectype, int bytes); 134 static void jrecord_data(struct jrecord *jrec, const void *buf, int bytes); 135 static void jrecord_done(struct jrecord *jrec, int abortit); 136 137 static int journal_setattr(struct vop_setattr_args *ap); 138 static int journal_write(struct vop_write_args *ap); 139 static int journal_fsync(struct vop_fsync_args *ap); 140 static int journal_putpages(struct vop_putpages_args *ap); 141 static int journal_setacl(struct vop_setacl_args *ap); 142 static int journal_setextattr(struct vop_setextattr_args *ap); 143 static int journal_ncreate(struct vop_ncreate_args *ap); 144 static int journal_nmknod(struct vop_nmknod_args *ap); 145 static int journal_nlink(struct vop_nlink_args *ap); 146 static int journal_nsymlink(struct vop_nsymlink_args *ap); 147 static int journal_nwhiteout(struct vop_nwhiteout_args *ap); 148 static int journal_nremove(struct vop_nremove_args *ap); 149 static int journal_nmkdir(struct vop_nmkdir_args *ap); 150 static int journal_nrmdir(struct vop_nrmdir_args *ap); 151 static int journal_nrename(struct vop_nrename_args *ap); 152 153 static struct vnodeopv_entry_desc journal_vnodeop_entries[] = { 154 { &vop_default_desc, vop_journal_operate_ap }, 155 { &vop_mountctl_desc, (void *)journal_mountctl }, 156 { &vop_setattr_desc, (void *)journal_setattr }, 157 { &vop_write_desc, (void *)journal_write }, 158 { &vop_fsync_desc, (void *)journal_fsync }, 159 
{ &vop_putpages_desc, (void *)journal_putpages }, 160 { &vop_setacl_desc, (void *)journal_setacl }, 161 { &vop_setextattr_desc, (void *)journal_setextattr }, 162 { &vop_ncreate_desc, (void *)journal_ncreate }, 163 { &vop_nmknod_desc, (void *)journal_nmknod }, 164 { &vop_nlink_desc, (void *)journal_nlink }, 165 { &vop_nsymlink_desc, (void *)journal_nsymlink }, 166 { &vop_nwhiteout_desc, (void *)journal_nwhiteout }, 167 { &vop_nremove_desc, (void *)journal_nremove }, 168 { &vop_nmkdir_desc, (void *)journal_nmkdir }, 169 { &vop_nrmdir_desc, (void *)journal_nrmdir }, 170 { &vop_nrename_desc, (void *)journal_nrename }, 171 { NULL, NULL } 172 }; 173 174 static MALLOC_DEFINE(M_JOURNAL, "journal", "Journaling structures"); 175 static MALLOC_DEFINE(M_JFIFO, "journal-fifo", "Journal FIFO"); 176 177 int 178 journal_mountctl(struct vop_mountctl_args *ap) 179 { 180 struct mount *mp; 181 int error = 0; 182 183 mp = ap->a_head.a_ops->vv_mount; 184 KKASSERT(mp); 185 186 if (mp->mnt_vn_journal_ops == NULL) { 187 switch(ap->a_op) { 188 case MOUNTCTL_INSTALL_VFS_JOURNAL: 189 error = journal_attach(mp); 190 if (error == 0 && ap->a_ctllen != sizeof(struct mountctl_install_journal)) 191 error = EINVAL; 192 if (error == 0 && ap->a_fp == NULL) 193 error = EBADF; 194 if (error == 0) 195 error = journal_install_vfs_journal(mp, ap->a_fp, ap->a_ctl); 196 if (TAILQ_EMPTY(&mp->mnt_jlist)) 197 journal_detach(mp); 198 break; 199 case MOUNTCTL_REMOVE_VFS_JOURNAL: 200 case MOUNTCTL_RESYNC_VFS_JOURNAL: 201 case MOUNTCTL_STATUS_VFS_JOURNAL: 202 error = ENOENT; 203 break; 204 default: 205 error = EOPNOTSUPP; 206 break; 207 } 208 } else { 209 switch(ap->a_op) { 210 case MOUNTCTL_INSTALL_VFS_JOURNAL: 211 if (ap->a_ctllen != sizeof(struct mountctl_install_journal)) 212 error = EINVAL; 213 if (error == 0 && ap->a_fp == NULL) 214 error = EBADF; 215 if (error == 0) 216 error = journal_install_vfs_journal(mp, ap->a_fp, ap->a_ctl); 217 break; 218 case MOUNTCTL_REMOVE_VFS_JOURNAL: 219 if (ap->a_ctllen != 
sizeof(struct mountctl_remove_journal)) 220 error = EINVAL; 221 if (error == 0) 222 error = journal_remove_vfs_journal(mp, ap->a_ctl); 223 if (TAILQ_EMPTY(&mp->mnt_jlist)) 224 journal_detach(mp); 225 break; 226 case MOUNTCTL_RESYNC_VFS_JOURNAL: 227 if (ap->a_ctllen != 0) 228 error = EINVAL; 229 error = journal_resync_vfs_journal(mp, ap->a_ctl); 230 break; 231 case MOUNTCTL_STATUS_VFS_JOURNAL: 232 if (ap->a_ctllen != sizeof(struct mountctl_status_journal)) 233 error = EINVAL; 234 if (error == 0) { 235 error = journal_status_vfs_journal(mp, ap->a_ctl, 236 ap->a_buf, ap->a_buflen, ap->a_res); 237 } 238 break; 239 default: 240 error = EOPNOTSUPP; 241 break; 242 } 243 } 244 return (error); 245 } 246 247 /* 248 * High level mount point setup. When a 249 */ 250 static int 251 journal_attach(struct mount *mp) 252 { 253 vfs_add_vnodeops(mp, &mp->mnt_vn_journal_ops, journal_vnodeop_entries); 254 return(0); 255 } 256 257 static void 258 journal_detach(struct mount *mp) 259 { 260 if (mp->mnt_vn_journal_ops) 261 vfs_rm_vnodeops(&mp->mnt_vn_journal_ops); 262 } 263 264 /* 265 * Install a journal on a mount point. Each journal has an associated worker 266 * thread which is responsible for buffering and spooling the data to the 267 * target. A mount point may have multiple journals attached to it. An 268 * initial start record is generated when the journal is associated. 
269 */ 270 static int 271 journal_install_vfs_journal(struct mount *mp, struct file *fp, 272 const struct mountctl_install_journal *info) 273 { 274 struct journal *jo; 275 struct jrecord jrec; 276 int error = 0; 277 int size; 278 279 jo = malloc(sizeof(struct journal), M_JOURNAL, M_WAITOK|M_ZERO); 280 bcopy(info->id, jo->id, sizeof(jo->id)); 281 jo->flags = info->flags & ~(MC_JOURNAL_ACTIVE | MC_JOURNAL_STOP_REQ); 282 283 /* 284 * Memory FIFO size, round to nearest power of 2 285 */ 286 if (info->membufsize) { 287 if (info->membufsize < 65536) 288 size = 65536; 289 else if (info->membufsize > 128 * 1024 * 1024) 290 size = 128 * 1024 * 1024; 291 else 292 size = (int)info->membufsize; 293 } else { 294 size = 1024 * 1024; 295 } 296 jo->fifo.size = 1; 297 while (jo->fifo.size < size) 298 jo->fifo.size <<= 1; 299 300 /* 301 * Other parameters. If not specified the starting transaction id 302 * will be the current date. 303 */ 304 if (info->transid) { 305 jo->transid = info->transid; 306 } else { 307 struct timespec ts; 308 getnanotime(&ts); 309 jo->transid = ((int64_t)ts.tv_sec << 30) | ts.tv_nsec; 310 } 311 312 jo->fp = fp; 313 314 /* 315 * Allocate the memory FIFO 316 */ 317 jo->fifo.mask = jo->fifo.size - 1; 318 jo->fifo.membase = malloc(jo->fifo.size, M_JFIFO, M_WAITOK|M_ZERO|M_NULLOK); 319 if (jo->fifo.membase == NULL) 320 error = ENOMEM; 321 322 /* 323 * Create the worker thread and generate the association record. 
324 */ 325 if (error) { 326 free(jo, M_JOURNAL); 327 } else { 328 fhold(fp); 329 jo->flags |= MC_JOURNAL_ACTIVE; 330 lwkt_create(journal_thread, jo, NULL, &jo->thread, 331 TDF_STOPREQ, -1, "journal %.*s", JIDMAX, jo->id); 332 lwkt_setpri(&jo->thread, TDPRI_KERN_DAEMON); 333 lwkt_schedule(&jo->thread); 334 335 jrecord_init(jo, &jrec, JREC_STREAMID_DISCONT); 336 jrecord_write(&jrec, JTYPE_ASSOCIATE, 0); 337 jrecord_done(&jrec, 0); 338 TAILQ_INSERT_TAIL(&mp->mnt_jlist, jo, jentry); 339 } 340 return(error); 341 } 342 343 /* 344 * Disassociate a journal from a mount point and terminate its worker thread. 345 * A final termination record is written out before the file pointer is 346 * dropped. 347 */ 348 static int 349 journal_remove_vfs_journal(struct mount *mp, 350 const struct mountctl_remove_journal *info) 351 { 352 struct journal *jo; 353 struct jrecord jrec; 354 int error; 355 356 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 357 if (bcmp(jo->id, info->id, sizeof(jo->id)) == 0) 358 break; 359 } 360 if (jo) { 361 error = 0; 362 TAILQ_REMOVE(&mp->mnt_jlist, jo, jentry); 363 364 jrecord_init(jo, &jrec, JREC_STREAMID_DISCONT); 365 jrecord_write(&jrec, JTYPE_DISASSOCIATE, 0); 366 jrecord_done(&jrec, 0); 367 368 jo->flags |= MC_JOURNAL_STOP_REQ | (info->flags & MC_JOURNAL_STOP_IMM); 369 wakeup(&jo->fifo); 370 while (jo->flags & MC_JOURNAL_ACTIVE) { 371 tsleep(jo, 0, "jwait", 0); 372 } 373 lwkt_free_thread(&jo->thread); /* XXX SMP */ 374 if (jo->fp) 375 fdrop(jo->fp, curthread); 376 if (jo->fifo.membase) 377 free(jo->fifo.membase, M_JFIFO); 378 free(jo, M_JOURNAL); 379 } else { 380 error = EINVAL; 381 } 382 return (error); 383 } 384 385 static int 386 journal_resync_vfs_journal(struct mount *mp, const void *ctl) 387 { 388 return(EINVAL); 389 } 390 391 static int 392 journal_status_vfs_journal(struct mount *mp, 393 const struct mountctl_status_journal *info, 394 struct mountctl_journal_ret_status *rstat, 395 int buflen, int *res) 396 { 397 struct journal *jo; 398 int error = 
0; 399 int index; 400 401 index = 0; 402 *res = 0; 403 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 404 if (info->index == MC_JOURNAL_INDEX_ID) { 405 if (bcmp(jo->id, info->id, sizeof(jo->id)) != 0) 406 continue; 407 } else if (info->index >= 0) { 408 if (info->index < index) 409 continue; 410 } else if (info->index != MC_JOURNAL_INDEX_ALL) { 411 continue; 412 } 413 if (buflen < sizeof(*rstat)) { 414 if (*res) 415 rstat[-1].flags |= MC_JOURNAL_STATUS_MORETOCOME; 416 else 417 error = EINVAL; 418 break; 419 } 420 bzero(rstat, sizeof(*rstat)); 421 rstat->recsize = sizeof(*rstat); 422 bcopy(jo->id, rstat->id, sizeof(jo->id)); 423 rstat->index = index; 424 rstat->membufsize = jo->fifo.size; 425 rstat->membufused = jo->fifo.xindex - jo->fifo.rindex; 426 rstat->membufiopend = jo->fifo.windex - jo->fifo.rindex; 427 rstat->bytessent = jo->total_acked; 428 ++rstat; 429 ++index; 430 *res += sizeof(*rstat); 431 buflen -= sizeof(*rstat); 432 } 433 return(error); 434 } 435 /* 436 * The per-journal worker thread is responsible for writing out the 437 * journal's FIFO to the target stream. 438 */ 439 static void 440 journal_thread(void *info) 441 { 442 struct journal *jo = info; 443 struct journal_rawrecbeg *rawp; 444 int bytes; 445 int error; 446 int avail; 447 int res; 448 449 for (;;) { 450 /* 451 * Calculate the number of bytes available to write. This buffer 452 * area may contain reserved records so we can't just write it out 453 * without further checks. 454 */ 455 bytes = jo->fifo.windex - jo->fifo.rindex; 456 457 /* 458 * sleep if no bytes are available or if an incomplete record is 459 * encountered (it needs to be filled in before we can write it 460 * out), and skip any pad records that we encounter. 
461 */ 462 if (bytes == 0) { 463 if (jo->flags & MC_JOURNAL_STOP_REQ) 464 break; 465 tsleep(&jo->fifo, 0, "jfifo", hz); 466 continue; 467 } 468 rawp = (void *)(jo->fifo.membase + (jo->fifo.rindex & jo->fifo.mask)); 469 if (rawp->begmagic == JREC_INCOMPLETEMAGIC) { 470 tsleep(&jo->fifo, 0, "jpad", hz); 471 continue; 472 } 473 if (rawp->streamid == JREC_STREAMID_PAD) { 474 jo->fifo.rindex += (rawp->recsize + 15) & ~15; 475 KKASSERT(jo->fifo.windex - jo->fifo.rindex > 0); 476 continue; 477 } 478 479 /* 480 * Figure out how much we can write out, beware the buffer wrap 481 * case. 482 */ 483 res = 0; 484 avail = jo->fifo.size - (jo->fifo.rindex & jo->fifo.mask); 485 while (res < bytes && rawp->begmagic == JREC_BEGMAGIC) { 486 res += (rawp->recsize + 15) & ~15; 487 if (res >= avail) { 488 KKASSERT(res == avail); 489 break; 490 } 491 } 492 493 /* 494 * Issue the write and deal with any errors or other conditions. 495 * For now assume blocking I/O. Since we are record-aware the 496 * code cannot yet handle partial writes. 497 * 498 * XXX EWOULDBLOCK/NBIO 499 * XXX notification on failure 500 * XXX two-way acknowledgement stream in the return direction / xindex 501 */ 502 printf("write @%d,%d\n", jo->fifo.rindex & jo->fifo.mask, bytes); 503 bytes = res; 504 error = fp_write(jo->fp, 505 jo->fifo.membase + (jo->fifo.rindex & jo->fifo.mask), 506 bytes, &res); 507 if (error) { 508 printf("journal_thread(%s) write, error %d\n", jo->id, error); 509 /* XXX */ 510 } else { 511 KKASSERT(res == bytes); 512 printf("journal_thread(%s) write %d\n", jo->id, res); 513 } 514 515 /* 516 * Advance rindex. XXX for now also advance xindex, which will 517 * eventually be advanced when the target acknowledges the sequence 518 * space. 
519 */ 520 jo->fifo.rindex += bytes; 521 jo->fifo.xindex += bytes; 522 jo->total_acked += bytes; 523 if (jo->flags & MC_JOURNAL_WWAIT) { 524 jo->flags &= ~MC_JOURNAL_WWAIT; /* XXX hysteresis */ 525 wakeup(&jo->fifo.windex); 526 } 527 } 528 jo->flags &= ~MC_JOURNAL_ACTIVE; 529 wakeup(jo); 530 wakeup(&jo->fifo.windex); 531 } 532 533 static __inline 534 void 535 journal_build_pad(struct journal_rawrecbeg *rawp, int recsize) 536 { 537 struct journal_rawrecend *rendp; 538 539 KKASSERT((recsize & 15) == 0 && recsize >= 16); 540 541 rawp->begmagic = JREC_BEGMAGIC; 542 rawp->streamid = JREC_STREAMID_PAD; 543 rawp->recsize = recsize; /* must be 16-byte aligned */ 544 rawp->seqno = 0; 545 /* 546 * WARNING, rendp may overlap rawp->seqno. This is necessary to 547 * allow PAD records to fit in 16 bytes. Use cpu_mb1() to 548 * hopefully cause the compiler to not make any assumptions. 549 */ 550 cpu_mb1(); 551 rendp = (void *)((char *)rawp + rawp->recsize - sizeof(*rendp)); 552 rendp->endmagic = JREC_ENDMAGIC; 553 rendp->check = 0; 554 rendp->recsize = rawp->recsize; 555 } 556 557 /* 558 * Wake up the worker thread if the FIFO is more then half full or if 559 * someone is waiting for space to be freed up. Otherwise let the 560 * heartbeat deal with it. Being able to avoid waking up the worker 561 * is the key to the journal's cpu efficiency. 562 */ 563 static __inline 564 void 565 journal_commit_wakeup(struct journal *jo) 566 { 567 int avail; 568 569 avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex); 570 KKASSERT(avail >= 0); 571 if ((avail < (jo->fifo.size >> 1)) || (jo->flags & MC_JOURNAL_WWAIT)) 572 wakeup(&jo->fifo); 573 } 574 575 /* 576 * Create a new BEGIN stream record with the specified streamid and the 577 * specified amount of payload space. *rawpp will be set to point to the 578 * base of the new stream record and a pointer to the base of the payload 579 * space will be returned. *rawpp does not need to be pre-NULLd prior to 580 * making this call. 
581 * 582 * A stream can be extended, aborted, or committed by other API calls 583 * below. This may result in a sequence of potentially disconnected 584 * stream records to be output to the journaling target. The first record 585 * (the one created by this function) will be marked JREC_STREAMCTL_BEGIN, 586 * while the last record on commit or abort will be marked JREC_STREAMCTL_END 587 * (and possibly also JREC_STREAMCTL_ABORTED). The last record could wind 588 * up being the same as the first, in which case the bits are all set in 589 * the first record. 590 * 591 * The stream record is created in an incomplete state by setting the begin 592 * magic to JREC_INCOMPLETEMAGIC. This prevents the worker thread from 593 * flushing the fifo past our record until we have finished populating it. 594 * Other threads can reserve and operate on their own space without stalling 595 * but the stream output will stall until we have completed operations. The 596 * memory FIFO is intended to be large enough to absorb such situations 597 * without stalling out other threads. 598 */ 599 static 600 void * 601 journal_reserve(struct journal *jo, struct journal_rawrecbeg **rawpp, 602 int16_t streamid, int bytes) 603 { 604 struct journal_rawrecbeg *rawp; 605 int avail; 606 int availtoend; 607 int req; 608 609 /* 610 * Add header and trailer overheads to the passed payload. Note that 611 * the passed payload size need not be aligned in any way. 612 */ 613 bytes += sizeof(struct journal_rawrecbeg); 614 bytes += sizeof(struct journal_rawrecend); 615 616 for (;;) { 617 /* 618 * First, check boundary conditions. If the request would wrap around 619 * we have to skip past the ending block and return to the beginning 620 * of the FIFO's buffer. Calculate 'req' which is the actual number 621 * of bytes being reserved, including wrap-around dead space. 
622 * 623 * Note that availtoend is not truncated to avail and so cannot be 624 * used to determine whether the reservation is possible by itself. 625 * Also, since all fifo ops are 16-byte aligned, we can check 626 * the size before calculating the aligned size. 627 */ 628 availtoend = jo->fifo.size - (jo->fifo.windex & jo->fifo.mask); 629 if (bytes > availtoend) 630 req = bytes + availtoend; /* add pad to end */ 631 else 632 req = bytes; 633 634 /* 635 * Next calculate the total available space and see if it is 636 * sufficient. We cannot overwrite previously buffered data 637 * past xindex because otherwise we would not be able to restart 638 * a broken link at the target's last point of commit. 639 */ 640 avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex); 641 KKASSERT(avail >= 0 && (avail & 15) == 0); 642 643 if (avail < req) { 644 /* XXX MC_JOURNAL_STOP_IMM */ 645 jo->flags |= MC_JOURNAL_WWAIT; 646 tsleep(&jo->fifo.windex, 0, "jwrite", 0); 647 continue; 648 } 649 650 /* 651 * Create a pad record for any dead space and create an incomplete 652 * record for the live space, then return a pointer to the 653 * contiguous buffer space that was requested. 654 * 655 * NOTE: The worker thread will not flush past an incomplete 656 * record, so the reserved space can be filled in at-will. The 657 * journaling code must also be aware the reserved sections occuring 658 * after this one will also not be written out even if completed 659 * until this one is completed. 
660 */ 661 rawp = (void *)(jo->fifo.membase + (jo->fifo.windex & jo->fifo.mask)); 662 if (req != bytes) { 663 journal_build_pad(rawp, req - bytes); 664 rawp = (void *)jo->fifo.membase; 665 } 666 rawp->begmagic = JREC_INCOMPLETEMAGIC; /* updated by abort/commit */ 667 rawp->recsize = bytes; /* (unaligned size) */ 668 rawp->streamid = streamid | JREC_STREAMCTL_BEGIN; 669 rawp->seqno = 0; /* set by caller */ 670 671 /* 672 * Issue a memory barrier to guarentee that the record data has been 673 * properly initialized before we advance the write index and return 674 * a pointer to the reserved record. Otherwise the worker thread 675 * could accidently run past us. 676 * 677 * Note that stream records are always 16-byte aligned. 678 */ 679 cpu_mb1(); 680 jo->fifo.windex += (req + 15) & ~15; 681 *rawpp = rawp; 682 return(rawp + 1); 683 } 684 /* not reached */ 685 *rawpp = NULL; 686 return(NULL); 687 } 688 689 /* 690 * Extend a previous reservation by the specified number of payload bytes. 691 * If it is not possible to extend the existing reservation due to either 692 * another thread having reserved space after us or due to a boundary 693 * condition, the current reservation will be committed and possibly 694 * truncated and a new reservation with the specified payload size will 695 * be created. *rawpp is set to the new reservation in this case but the 696 * caller cannot depend on a comparison with the old rawp to determine if 697 * this case occurs because we could end up using the same memory FIFO 698 * offset for the new stream record. 699 * 700 * In either case this function will return a pointer to the base of the 701 * extended payload space. 702 * 703 * If a new stream block is created the caller needs to recalculate payload 704 * byte counts, if the same stream block is used the caller needs to extend 705 * its current notion of the payload byte count. 
706 */ 707 static void * 708 journal_extend(struct journal *jo, struct journal_rawrecbeg **rawpp, 709 int truncbytes, int bytes, int *newstreamrecp) 710 { 711 struct journal_rawrecbeg *rawp; 712 int16_t streamid; 713 int availtoend; 714 int avail; 715 int osize; 716 int nsize; 717 int wbase; 718 void *rptr; 719 720 *newstreamrecp = 0; 721 rawp = *rawpp; 722 osize = (rawp->recsize + 15) & ~15; 723 nsize = (rawp->recsize + bytes + 15) & ~15; 724 wbase = (char *)rawp - jo->fifo.membase; 725 726 /* 727 * If the aligned record size does not change we can trivially extend 728 * the record. 729 */ 730 if (nsize == osize) { 731 rawp->recsize += bytes; 732 return((char *)rawp + rawp->recsize - bytes); 733 } 734 735 /* 736 * If the fifo's write index hasn't been modified since we made the 737 * reservation and we do not hit any boundary conditions, we can 738 * trivially extend the record. 739 */ 740 if ((jo->fifo.windex & jo->fifo.mask) == wbase + osize) { 741 availtoend = jo->fifo.size - wbase; 742 avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex) + osize; 743 KKASSERT((availtoend & 15) == 0); 744 KKASSERT((avail & 15) == 0); 745 if (nsize <= avail && nsize <= availtoend) { 746 jo->fifo.windex += nsize - osize; 747 rawp->recsize += bytes; 748 return((char *)rawp + rawp->recsize - bytes); 749 } 750 } 751 752 /* 753 * It was not possible to extend the buffer. Commit the current 754 * buffer and create a new one. We manually clear the BEGIN mark that 755 * journal_reserve() creates (because this is a continuing record, not 756 * the start of a new stream). 757 */ 758 streamid = rawp->streamid & JREC_STREAMID_MASK; 759 journal_commit(jo, rawpp, truncbytes, 0); 760 rptr = journal_reserve(jo, rawpp, streamid, bytes); 761 rawp = *rawpp; 762 rawp->streamid &= ~JREC_STREAMCTL_BEGIN; 763 *newstreamrecp = 1; 764 return(rptr); 765 } 766 767 /* 768 * Abort a journal record. 
If the transaction record represents a stream 769 * BEGIN and we can reverse the fifo's write index we can simply reverse 770 * index the entire record, as if it were never reserved in the first place. 771 * 772 * Otherwise we set the JREC_STREAMCTL_ABORTED bit and commit the record 773 * with the payload truncated to 0 bytes. 774 */ 775 static void 776 journal_abort(struct journal *jo, struct journal_rawrecbeg **rawpp) 777 { 778 struct journal_rawrecbeg *rawp; 779 int osize; 780 781 rawp = *rawpp; 782 osize = (rawp->recsize + 15) & ~15; 783 784 if ((rawp->streamid & JREC_STREAMCTL_BEGIN) && 785 (jo->fifo.windex & jo->fifo.mask) == 786 (char *)rawp - jo->fifo.membase + osize) 787 { 788 jo->fifo.windex -= osize; 789 *rawpp = NULL; 790 } else { 791 rawp->streamid |= JREC_STREAMCTL_ABORTED; 792 journal_commit(jo, rawpp, 0, 1); 793 } 794 } 795 796 /* 797 * Commit a journal record and potentially truncate it to the specified 798 * number of payload bytes. If you do not want to truncate the record, 799 * simply pass -1 for the bytes parameter. Do not pass rawp->recsize, that 800 * field includes header and trailer and will not be correct. Note that 801 * passing 0 will truncate the entire data payload of the record. 802 * 803 * The logical stream is terminated by this function. 804 * 805 * If truncation occurs, and it is not possible to physically optimize the 806 * memory FIFO due to other threads having reserved space after ours, 807 * the remaining reserved space will be covered by a pad record. 
808 */ 809 static void 810 journal_commit(struct journal *jo, struct journal_rawrecbeg **rawpp, 811 int bytes, int closeout) 812 { 813 struct journal_rawrecbeg *rawp; 814 struct journal_rawrecend *rendp; 815 int osize; 816 int nsize; 817 818 rawp = *rawpp; 819 *rawpp = NULL; 820 821 KKASSERT((char *)rawp >= jo->fifo.membase && 822 (char *)rawp + rawp->recsize <= jo->fifo.membase + jo->fifo.size); 823 KKASSERT(((intptr_t)rawp & 15) == 0); 824 825 /* 826 * Truncate the record if requested. If the FIFO write index as still 827 * at the end of our record we can optimally backindex it. Otherwise 828 * we have to insert a pad record. 829 * 830 * We calculate osize which is the 16-byte-aligned original recsize. 831 * We calculate nsize which is the 16-byte-aligned new recsize. 832 * 833 * Due to alignment issues or in case the passed truncation bytes is 834 * the same as the original payload, windex will be equal to nindex. 835 */ 836 if (bytes >= 0) { 837 KKASSERT(bytes >= 0 && bytes <= rawp->recsize - sizeof(struct journal_rawrecbeg) - sizeof(struct journal_rawrecend)); 838 osize = (rawp->recsize + 15) & ~15; 839 rawp->recsize = bytes + sizeof(struct journal_rawrecbeg) + 840 sizeof(struct journal_rawrecend); 841 nsize = (rawp->recsize + 15) & ~15; 842 if (osize == nsize) { 843 /* do nothing */ 844 } else if ((jo->fifo.windex & jo->fifo.mask) == (char *)rawp - jo->fifo.membase + osize) { 845 /* we are able to backindex the fifo */ 846 jo->fifo.windex -= osize - nsize; 847 } else { 848 /* we cannot backindex the fifo, emplace a pad in the dead space */ 849 journal_build_pad((void *)((char *)rawp + osize), osize - nsize); 850 } 851 } 852 853 /* 854 * Fill in the trailer. Note that unlike pad records, the trailer will 855 * never overlap the header. 
856 */ 857 rendp = (void *)((char *)rawp + 858 ((rawp->recsize + 15) & ~15) - sizeof(*rendp)); 859 rendp->endmagic = JREC_ENDMAGIC; 860 rendp->recsize = rawp->recsize; 861 rendp->check = 0; /* XXX check word, disabled for now */ 862 863 /* 864 * Fill in begmagic last. This will allow the worker thread to proceed. 865 * Use a memory barrier to guarentee write ordering. Mark the stream 866 * as terminated if closeout is set. This is the typical case. 867 */ 868 if (closeout) 869 rawp->streamid |= JREC_STREAMCTL_END; 870 cpu_mb1(); /* memory barrier */ 871 rawp->begmagic = JREC_BEGMAGIC; 872 873 journal_commit_wakeup(jo); 874 } 875 876 /************************************************************************ 877 * TRANSACTION SUPPORT ROUTINES * 878 ************************************************************************ 879 * 880 * JRECORD_*() - routines to create subrecord transactions and embed them 881 * in the logical streams managed by the journal_*() routines. 882 */ 883 884 static int16_t sid = JREC_STREAMID_JMIN; 885 886 /* 887 * Initialize the passed jrecord structure and start a new stream transaction 888 * by reserving an initial build space in the journal's memory FIFO. 889 */ 890 static void 891 jrecord_init(struct journal *jo, struct jrecord *jrec, int16_t streamid) 892 { 893 bzero(jrec, sizeof(*jrec)); 894 jrec->jo = jo; 895 if (streamid < 0) { 896 streamid = sid++; /* XXX need to track stream ids! */ 897 if (sid == JREC_STREAMID_JMAX) 898 sid = JREC_STREAMID_JMIN; 899 } 900 jrec->streamid = streamid; 901 jrec->stream_residual = JREC_DEFAULTSIZE; 902 jrec->stream_reserved = jrec->stream_residual; 903 jrec->stream_ptr = 904 journal_reserve(jo, &jrec->rawp, streamid, jrec->stream_reserved); 905 } 906 907 /* 908 * Push a recursive record type. All pushes should have matching pops. 909 * The old parent is returned and the newly pushed record becomes the 910 * new parent. 
Note that the old parent's pointer may already be invalid 911 * or may become invalid if jrecord_write() had to build a new stream 912 * record, so the caller should not mess with the returned pointer in 913 * any way other then to save it. 914 */ 915 static 916 struct journal_subrecord * 917 jrecord_push(struct jrecord *jrec, int16_t rectype) 918 { 919 struct journal_subrecord *save; 920 921 save = jrec->parent; 922 jrec->parent = jrecord_write(jrec, rectype|JMASK_NESTED, 0); 923 jrec->last = NULL; 924 KKASSERT(jrec->parent != NULL); 925 ++jrec->pushcount; 926 ++jrec->pushptrgood; /* cleared on flush */ 927 return(save); 928 } 929 930 /* 931 * Pop a previously pushed sub-transaction. We must set JMASK_LAST 932 * on the last record written within the subtransaction. If the last 933 * record written is not accessible or if the subtransaction is empty, 934 * we must write out a pad record with JMASK_LAST set before popping. 935 * 936 * When popping a subtransaction the parent record's recsize field 937 * will be properly set. If the parent pointer is no longer valid 938 * (which can occur if the data has already been flushed out to the 939 * stream), the protocol spec allows us to leave it 0. 940 * 941 * The saved parent pointer which we restore may or may not be valid, 942 * and if not valid may or may not be NULL, depending on the value 943 * of pushptrgood. 944 */ 945 static void 946 jrecord_pop(struct jrecord *jrec, struct journal_subrecord *save) 947 { 948 struct journal_subrecord *last; 949 950 KKASSERT(jrec->pushcount > 0); 951 KKASSERT(jrec->residual == 0); 952 953 /* 954 * Set JMASK_LAST on the last record we wrote at the current 955 * level. If last is NULL we either no longer have access to the 956 * record or the subtransaction was empty and we must write out a pad 957 * record. 
958 */ 959 if ((last = jrec->last) == NULL) { 960 jrecord_write(jrec, JLEAF_PAD|JMASK_LAST, 0); 961 last = jrec->last; /* reload after possible flush */ 962 } else { 963 last->rectype |= JMASK_LAST; 964 } 965 966 /* 967 * pushptrgood tells us how many levels of parent record pointers 968 * are valid. The jrec only stores the current parent record pointer 969 * (and it is only valid if pushptrgood != 0). The higher level parent 970 * record pointers are saved by the routines calling jrecord_push() and 971 * jrecord_pop(). These pointers may become stale and we determine 972 * that fact by tracking the count of valid parent pointers with 973 * pushptrgood. Pointers become invalid when their related stream 974 * record gets pushed out. 975 * 976 * If no pointer is available (the data has already been pushed out), 977 * then no fixup of e.g. the length field is possible for non-leaf 978 * nodes. The protocol allows for this situation by placing a larger 979 * burden on the program scanning the stream on the other end. 980 * 981 * [parentA] 982 * [node X] 983 * [parentB] 984 * [node Y] 985 * [node Z] 986 * (pop B) see NOTE B 987 * (pop A) see NOTE A 988 * 989 * NOTE B: This pop sets LAST in node Z if the node is still accessible, 990 * else a PAD record is appended and LAST is set in that. 991 * 992 * This pop sets the record size in parentB if parentB is still 993 * accessible, else the record size is left 0 (the scanner must 994 * deal with that). 995 * 996 * This pop sets the new 'last' record to parentB, the pointer 997 * to which may or may not still be accessible. 998 * 999 * NOTE A: This pop sets LAST in parentB if the node is still accessible, 1000 * else a PAD record is appended and LAST is set in that. 1001 * 1002 * This pop sets the record size in parentA if parentA is still 1003 * accessible, else the record size is left 0 (the scanner must 1004 * deal with that). 
1005 * 1006 * This pop sets the new 'last' record to parentA, the pointer 1007 * to which may or may not still be accessible. 1008 * 1009 * Also note that the last record in the stream transaction, which in 1010 * the above example is parentA, does not currently have the LAST bit 1011 * set. 1012 * 1013 * The current parent becomes the last record relative to the 1014 * saved parent passed into us. It's validity is based on 1015 * whether pushptrgood is non-zero prior to decrementing. The saved 1016 * parent becomes the new parent, and its validity is based on whether 1017 * pushptrgood is non-zero after decrementing. 1018 * 1019 * The old jrec->parent may be NULL if it is no longer accessible. 1020 * If pushptrgood is non-zero, however, it is guarenteed to not 1021 * be NULL (since no flush occured). 1022 */ 1023 jrec->last = jrec->parent; 1024 --jrec->pushcount; 1025 if (jrec->pushptrgood) { 1026 KKASSERT(jrec->last != NULL && last != NULL); 1027 if (--jrec->pushptrgood == 0) { 1028 jrec->parent = NULL; /* 'save' contains garbage or NULL */ 1029 } else { 1030 KKASSERT(save != NULL); 1031 jrec->parent = save; /* 'save' must not be NULL */ 1032 } 1033 1034 /* 1035 * Set the record size in the old parent. 'last' still points to 1036 * the original last record in the subtransaction being popped, 1037 * jrec->last points to the old parent (which became the last 1038 * record relative to the new parent being popped into). 1039 */ 1040 jrec->last->recsize = (char *)last + last->recsize - (char *)jrec->last; 1041 } else { 1042 jrec->parent = NULL; 1043 KKASSERT(jrec->last == NULL); 1044 } 1045 } 1046 1047 /* 1048 * Write out a leaf record, including associated data. 1049 */ 1050 static 1051 void 1052 jrecord_leaf(struct jrecord *jrec, int16_t rectype, void *ptr, int bytes) 1053 { 1054 jrecord_write(jrec, rectype, bytes); 1055 jrecord_data(jrec, ptr, bytes); 1056 jrecord_done(jrec, 0); 1057 } 1058 1059 /* 1060 * Write a leaf record out and return a pointer to its base. 
The leaf 1061 * record may contain potentially megabytes of data which is supplied 1062 * in jrecord_data() calls. The exact amount must be specified in this 1063 * call. 1064 * 1065 * THE RETURNED SUBRECORD POINTER IS ONLY VALID IMMEDIATELY AFTER THE 1066 * CALL AND MAY BECOME INVALID AT ANY TIME. ONLY THE PUSH/POP CODE SHOULD 1067 * USE THE RETURN VALUE. 1068 */ 1069 static 1070 struct journal_subrecord * 1071 jrecord_write(struct jrecord *jrec, int16_t rectype, int bytes) 1072 { 1073 struct journal_subrecord *last; 1074 int pusheditout; 1075 1076 /* 1077 * Try to catch some obvious errors. Nesting records must specify a 1078 * size of 0, and there should be no left-overs from previous operations 1079 * (such as incomplete data writeouts). 1080 */ 1081 KKASSERT(bytes == 0 || (rectype & JMASK_NESTED) == 0); 1082 KKASSERT(jrec->residual == 0); 1083 1084 /* 1085 * Check to see if the current stream record has enough room for 1086 * the new subrecord header. If it doesn't we extend the current 1087 * stream record. 1088 * 1089 * This may have the side effect of pushing out the current stream record 1090 * and creating a new one. We must adjust our stream tracking fields 1091 * accordingly. 
1092 */ 1093 if (jrec->stream_residual < sizeof(struct journal_subrecord)) { 1094 jrec->stream_ptr = journal_extend(jrec->jo, &jrec->rawp, 1095 jrec->stream_reserved - jrec->stream_residual, 1096 JREC_DEFAULTSIZE, &pusheditout); 1097 if (pusheditout) { 1098 jrec->stream_reserved = JREC_DEFAULTSIZE; 1099 jrec->stream_residual = JREC_DEFAULTSIZE; 1100 jrec->parent = NULL; /* no longer accessible */ 1101 jrec->pushptrgood = 0; /* restored parents in pops no good */ 1102 } else { 1103 jrec->stream_reserved += JREC_DEFAULTSIZE; 1104 jrec->stream_residual += JREC_DEFAULTSIZE; 1105 } 1106 } 1107 last = (void *)jrec->stream_ptr; 1108 last->rectype = rectype; 1109 last->reserved = 0; 1110 last->recsize = sizeof(struct journal_subrecord) + bytes; 1111 jrec->last = last; 1112 jrec->residual = bytes; /* remaining data to be posted */ 1113 jrec->residual_align = -bytes & 7; /* post-data alignment required */ 1114 return(last); 1115 } 1116 1117 /* 1118 * Write out the data associated with a leaf record. Any number of calls 1119 * to this routine may be made as long as the byte count adds up to the 1120 * amount originally specified in jrecord_write(). 1121 * 1122 * The act of writing out the leaf data may result in numerous stream records 1123 * being pushed out. Callers should be aware that even the associated 1124 * subrecord header may become inaccessible due to stream record pushouts. 1125 */ 1126 static void 1127 jrecord_data(struct jrecord *jrec, const void *buf, int bytes) 1128 { 1129 int pusheditout; 1130 int extsize; 1131 1132 KKASSERT(bytes >= 0 && bytes <= jrec->residual); 1133 1134 /* 1135 * Push out stream records as long as there is insufficient room to hold 1136 * the remaining data. 1137 */ 1138 while (jrec->stream_residual < bytes) { 1139 /* 1140 * Fill in any remaining space in the current stream record. 
1141 */ 1142 bcopy(buf, jrec->stream_ptr, jrec->stream_residual); 1143 buf = (const char *)buf + jrec->stream_residual; 1144 bytes -= jrec->stream_residual; 1145 /*jrec->stream_ptr += jrec->stream_residual;*/ 1146 jrec->stream_residual = 0; 1147 jrec->residual -= jrec->stream_residual; 1148 1149 /* 1150 * Try to extend the current stream record, but no more then 1/4 1151 * the size of the FIFO. 1152 */ 1153 extsize = jrec->jo->fifo.size >> 2; 1154 if (extsize > bytes) 1155 extsize = (bytes + 15) & ~15; 1156 1157 jrec->stream_ptr = journal_extend(jrec->jo, &jrec->rawp, 1158 jrec->stream_reserved - jrec->stream_residual, 1159 extsize, &pusheditout); 1160 if (pusheditout) { 1161 jrec->stream_reserved = extsize; 1162 jrec->stream_residual = extsize; 1163 jrec->parent = NULL; /* no longer accessible */ 1164 jrec->last = NULL; /* no longer accessible */ 1165 jrec->pushptrgood = 0; /* restored parents in pops no good */ 1166 } else { 1167 jrec->stream_reserved += extsize; 1168 jrec->stream_residual += extsize; 1169 } 1170 } 1171 1172 /* 1173 * Push out any remaining bytes into the current stream record. 1174 */ 1175 if (bytes) { 1176 bcopy(buf, jrec->stream_ptr, bytes); 1177 jrec->stream_ptr += bytes; 1178 jrec->stream_residual -= bytes; 1179 jrec->residual -= bytes; 1180 } 1181 1182 /* 1183 * Handle data alignment requirements for the subrecord. Because the 1184 * stream record's data space is more strictly aligned, it must already 1185 * have sufficient space to hold any subrecord alignment slop. 1186 */ 1187 if (jrec->residual == 0 && jrec->residual_align) { 1188 KKASSERT(jrec->residual_align <= jrec->stream_residual); 1189 bzero(jrec->stream_ptr, jrec->residual_align); 1190 jrec->stream_ptr += jrec->residual_align; 1191 jrec->stream_residual -= jrec->residual_align; 1192 jrec->residual_align = 0; 1193 } 1194 } 1195 1196 /* 1197 * We are finished with a transaction. 
If abortit is not set then we must 1198 * be at the top level with no residual subrecord data left to output. 1199 * If abortit is set then we can be in any state. 1200 * 1201 * The stream record will be committed or aborted as specified and jrecord 1202 * resources will be cleaned up. 1203 */ 1204 static void 1205 jrecord_done(struct jrecord *jrec, int abortit) 1206 { 1207 KKASSERT(jrec->rawp != NULL); 1208 1209 if (abortit) { 1210 journal_abort(jrec->jo, &jrec->rawp); 1211 } else { 1212 KKASSERT(jrec->pushcount == 0 && jrec->residual == 0); 1213 journal_commit(jrec->jo, &jrec->rawp, 1214 jrec->stream_reserved - jrec->stream_residual, 1); 1215 } 1216 1217 /* 1218 * jrec should not be used beyond this point without another init, 1219 * but clean up some fields to ensure that we panic if it is. 1220 * 1221 * Note that jrec->rawp is NULLd out by journal_abort/journal_commit. 1222 */ 1223 jrec->jo = NULL; 1224 jrec->stream_ptr = NULL; 1225 } 1226 1227 /************************************************************************ 1228 * LOW LEVEL RECORD SUPPORT ROUTINES * 1229 ************************************************************************ 1230 * 1231 * These routine create low level recursive and leaf subrecords representing 1232 * common filesystem structures. 1233 */ 1234 1235 /* 1236 * Write out a filename path relative to the base of the mount point. 1237 * rectype is typically JLEAF_PATH{1,2,3,4}. 1238 */ 1239 static void 1240 jrecord_write_path(struct jrecord *jrec, int16_t rectype, struct namecache *ncp) 1241 { 1242 char buf[64]; /* local buffer if it fits, else malloced */ 1243 char *base; 1244 int pathlen; 1245 int index; 1246 struct namecache *scan; 1247 1248 /* 1249 * Pass 1 - figure out the number of bytes required. Include terminating 1250 * \0 on last element and '/' separator on other elements. 
1251 */ 1252 again: 1253 pathlen = 0; 1254 for (scan = ncp; 1255 scan && (scan->nc_flag & NCF_MOUNTPT) == 0; 1256 scan = scan->nc_parent 1257 ) { 1258 pathlen += scan->nc_nlen + 1; 1259 } 1260 1261 if (pathlen <= sizeof(buf)) 1262 base = buf; 1263 else 1264 base = malloc(pathlen, M_TEMP, M_INTWAIT); 1265 1266 /* 1267 * Pass 2 - generate the path buffer 1268 */ 1269 index = pathlen; 1270 for (scan = ncp; 1271 scan && (scan->nc_flag & NCF_MOUNTPT) == 0; 1272 scan = scan->nc_parent 1273 ) { 1274 if (scan->nc_nlen >= index) { 1275 if (base != buf) 1276 free(base, M_TEMP); 1277 goto again; 1278 } 1279 if (index == pathlen) 1280 base[--index] = 0; 1281 else 1282 base[--index] = '/'; 1283 index -= scan->nc_nlen; 1284 bcopy(scan->nc_name, base + index, scan->nc_nlen); 1285 } 1286 jrecord_leaf(jrec, rectype, base + index, pathlen - index); 1287 if (base != buf) 1288 free(base, M_TEMP); 1289 } 1290 1291 /* 1292 * Write out a file attribute structure. While somewhat inefficient, using 1293 * a recursive data structure is the most portable and extensible way. 
1294 */ 1295 static void 1296 jrecord_write_vattr(struct jrecord *jrec, struct vattr *vat) 1297 { 1298 void *save; 1299 1300 save = jrecord_push(jrec, JTYPE_VATTR); 1301 if (vat->va_type != VNON) 1302 jrecord_leaf(jrec, JLEAF_UID, &vat->va_type, sizeof(vat->va_type)); 1303 if (vat->va_uid != VNOVAL) 1304 jrecord_leaf(jrec, JLEAF_UID, &vat->va_mode, sizeof(vat->va_mode)); 1305 if (vat->va_nlink != VNOVAL) 1306 jrecord_leaf(jrec, JLEAF_NLINK, &vat->va_nlink, sizeof(vat->va_nlink)); 1307 if (vat->va_uid != VNOVAL) 1308 jrecord_leaf(jrec, JLEAF_UID, &vat->va_uid, sizeof(vat->va_uid)); 1309 if (vat->va_gid != VNOVAL) 1310 jrecord_leaf(jrec, JLEAF_GID, &vat->va_gid, sizeof(vat->va_gid)); 1311 if (vat->va_fsid != VNOVAL) 1312 jrecord_leaf(jrec, JLEAF_FSID, &vat->va_fsid, sizeof(vat->va_fsid)); 1313 if (vat->va_fileid != VNOVAL) 1314 jrecord_leaf(jrec, JLEAF_INUM, &vat->va_fileid, sizeof(vat->va_fileid)); 1315 if (vat->va_size != VNOVAL) 1316 jrecord_leaf(jrec, JLEAF_SIZE, &vat->va_size, sizeof(vat->va_size)); 1317 if (vat->va_atime.tv_sec != VNOVAL) 1318 jrecord_leaf(jrec, JLEAF_ATIME, &vat->va_atime, sizeof(vat->va_atime)); 1319 if (vat->va_mtime.tv_sec != VNOVAL) 1320 jrecord_leaf(jrec, JLEAF_MTIME, &vat->va_mtime, sizeof(vat->va_mtime)); 1321 if (vat->va_ctime.tv_sec != VNOVAL) 1322 jrecord_leaf(jrec, JLEAF_CTIME, &vat->va_ctime, sizeof(vat->va_ctime)); 1323 if (vat->va_gen != VNOVAL) 1324 jrecord_leaf(jrec, JLEAF_GEN, &vat->va_gen, sizeof(vat->va_gen)); 1325 if (vat->va_flags != VNOVAL) 1326 jrecord_leaf(jrec, JLEAF_FLAGS, &vat->va_flags, sizeof(vat->va_flags)); 1327 if (vat->va_rdev != VNOVAL) 1328 jrecord_leaf(jrec, JLEAF_UDEV, &vat->va_rdev, sizeof(vat->va_rdev)); 1329 #if 0 1330 if (vat->va_filerev != VNOVAL) 1331 jrecord_leaf(jrec, JLEAF_FILEREV, &vat->va_filerev, sizeof(vat->va_filerev)); 1332 #endif 1333 jrecord_pop(jrec, save); 1334 jrecord_done(jrec, 0); 1335 } 1336 1337 /* 1338 * Write out the creds used to issue a file operation. 
If a process is 1339 * available write out additional tracking information related to the 1340 * process. 1341 * 1342 * XXX additional tracking info 1343 * XXX tty line info 1344 */ 1345 static void 1346 jrecord_write_cred(struct jrecord *jrec, struct thread *td, struct ucred *cred) 1347 { 1348 void *save; 1349 struct proc *p; 1350 1351 save = jrecord_push(jrec, JTYPE_CRED); 1352 jrecord_leaf(jrec, JLEAF_UID, &cred->cr_uid, sizeof(cred->cr_uid)); 1353 jrecord_leaf(jrec, JLEAF_GID, &cred->cr_gid, sizeof(cred->cr_gid)); 1354 if (td && (p = td->td_proc) != NULL) { 1355 jrecord_leaf(jrec, JLEAF_PID, &p->p_pid, sizeof(p->p_pid)); 1356 jrecord_leaf(jrec, JLEAF_COMM, p->p_comm, sizeof(p->p_comm)); 1357 } 1358 jrecord_pop(jrec, save); 1359 jrecord_done(jrec, 0); 1360 } 1361 1362 /* 1363 * Write out information required to identify a vnode 1364 */ 1365 static void 1366 jrecord_write_vnode_ref(struct jrecord *jrec, struct vnode *vp) 1367 { 1368 /* XXX */ 1369 } 1370 1371 /* 1372 * Write out the data associated with a UIO 1373 */ 1374 static void 1375 jrecord_write_uio(struct jrecord *jrec, int16_t rectype, struct uio *uio) 1376 { 1377 /* XXX */ 1378 } 1379 1380 /************************************************************************ 1381 * JOURNAL VNOPS * 1382 ************************************************************************ 1383 * 1384 * These are function shims replacing the normal filesystem ops. We become 1385 * responsible for calling the underlying filesystem ops. We have the choice 1386 * of executing the underlying op first and then generating the journal entry, 1387 * or starting the journal entry, executing the underlying op, and then 1388 * either completing or aborting it. 1389 * 1390 * The journal is supposed to be a high-level entity, which generally means 1391 * identifying files by name rather then by inode. Supplying both allows 1392 * the journal to be used both for inode-number-compatible 'mirrors' and 1393 * for simple filesystem replication. 
1394 * 1395 * Writes are particularly difficult to deal with because a single write may 1396 * represent a hundred megabyte buffer or more, and both writes and truncations 1397 * require the 'old' data to be written out as well as the new data if the 1398 * log is reversable. Other issues: 1399 * 1400 * - How to deal with operations on unlinked files (no path available), 1401 * but which may still be filesystem visible due to hard links. 1402 * 1403 * - How to deal with modifications made via a memory map. 1404 * 1405 * - Future cache coherency support will require cache coherency API calls 1406 * both prior to and after the call to the underlying VFS. 1407 * 1408 * ALSO NOTE: We do not have to shim compatibility VOPs like MKDIR which have 1409 * new VFS equivalents (NMKDIR). 1410 */ 1411 1412 /* 1413 * Journal vop_settattr { a_vp, a_vap, a_cred, a_td } 1414 */ 1415 static 1416 int 1417 journal_setattr(struct vop_setattr_args *ap) 1418 { 1419 struct mount *mp; 1420 struct journal *jo; 1421 struct jrecord jrec; 1422 void *save; /* warning, save pointers do not always remain valid */ 1423 int error; 1424 1425 error = vop_journal_operate_ap(&ap->a_head); 1426 mp = ap->a_head.a_ops->vv_mount; 1427 if (error == 0) { 1428 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1429 jrecord_init(jo, &jrec, -1); 1430 save = jrecord_push(&jrec, JTYPE_SETATTR); 1431 jrecord_write_cred(&jrec, ap->a_td, ap->a_cred); 1432 jrecord_write_vnode_ref(&jrec, ap->a_vp); 1433 jrecord_write_vattr(&jrec, ap->a_vap); 1434 jrecord_pop(&jrec, save); 1435 jrecord_done(&jrec, 0); 1436 } 1437 } 1438 return (error); 1439 } 1440 1441 /* 1442 * Journal vop_write { a_vp, a_uio, a_ioflag, a_cred } 1443 */ 1444 static 1445 int 1446 journal_write(struct vop_write_args *ap) 1447 { 1448 struct mount *mp; 1449 struct journal *jo; 1450 struct jrecord jrec; 1451 void *save; /* warning, save pointers do not always remain valid */ 1452 int error; 1453 1454 error = vop_journal_operate_ap(&ap->a_head); 1455 mp = 
ap->a_head.a_ops->vv_mount; 1456 if (error == 0) { 1457 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1458 jrecord_init(jo, &jrec, -1); 1459 save = jrecord_push(&jrec, JTYPE_WRITE); 1460 jrecord_write_cred(&jrec, NULL, ap->a_cred); 1461 jrecord_write_vnode_ref(&jrec, ap->a_vp); 1462 jrecord_write_uio(&jrec, JLEAF_FILEDATA, ap->a_uio); 1463 jrecord_pop(&jrec, save); 1464 jrecord_done(&jrec, 0); 1465 } 1466 } 1467 return (error); 1468 } 1469 1470 /* 1471 * Journal vop_fsync { a_vp, a_waitfor, a_td } 1472 */ 1473 static 1474 int 1475 journal_fsync(struct vop_fsync_args *ap) 1476 { 1477 struct mount *mp; 1478 struct journal *jo; 1479 int error; 1480 1481 error = vop_journal_operate_ap(&ap->a_head); 1482 mp = ap->a_head.a_ops->vv_mount; 1483 if (error == 0) { 1484 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1485 /* XXX synchronize pending journal records */ 1486 } 1487 } 1488 return (error); 1489 } 1490 1491 /* 1492 * Journal vop_putpages { a_vp, a_m, a_count, a_sync, a_rtvals, a_offset } 1493 */ 1494 static 1495 int 1496 journal_putpages(struct vop_putpages_args *ap) 1497 { 1498 struct mount *mp; 1499 struct journal *jo; 1500 struct jrecord jrec; 1501 void *save; /* warning, save pointers do not always remain valid */ 1502 int error; 1503 1504 error = vop_journal_operate_ap(&ap->a_head); 1505 mp = ap->a_head.a_ops->vv_mount; 1506 if (error == 0) { 1507 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1508 jrecord_init(jo, &jrec, -1); 1509 save = jrecord_push(&jrec, JTYPE_PUTPAGES); 1510 jrecord_write_vnode_ref(&jrec, ap->a_vp); 1511 /* XXX pagelist */ 1512 jrecord_pop(&jrec, save); 1513 jrecord_done(&jrec, 0); 1514 } 1515 } 1516 return (error); 1517 } 1518 1519 /* 1520 * Journal vop_setacl { a_vp, a_type, a_aclp, a_cred, a_td } 1521 */ 1522 static 1523 int 1524 journal_setacl(struct vop_setacl_args *ap) 1525 { 1526 struct mount *mp; 1527 struct journal *jo; 1528 struct jrecord jrec; 1529 void *save; /* warning, save pointers do not always remain valid */ 1530 int error; 1531 
1532 error = vop_journal_operate_ap(&ap->a_head); 1533 mp = ap->a_head.a_ops->vv_mount; 1534 if (error == 0) { 1535 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1536 jrecord_init(jo, &jrec, -1); 1537 save = jrecord_push(&jrec, JTYPE_SETACL); 1538 jrecord_write_cred(&jrec, ap->a_td, ap->a_cred); 1539 jrecord_write_vnode_ref(&jrec, ap->a_vp); 1540 /* XXX type, aclp */ 1541 jrecord_pop(&jrec, save); 1542 jrecord_done(&jrec, 0); 1543 } 1544 } 1545 return (error); 1546 } 1547 1548 /* 1549 * Journal vop_setextattr { a_vp, a_name, a_uio, a_cred, a_td } 1550 */ 1551 static 1552 int 1553 journal_setextattr(struct vop_setextattr_args *ap) 1554 { 1555 struct mount *mp; 1556 struct journal *jo; 1557 struct jrecord jrec; 1558 void *save; /* warning, save pointers do not always remain valid */ 1559 int error; 1560 1561 error = vop_journal_operate_ap(&ap->a_head); 1562 mp = ap->a_head.a_ops->vv_mount; 1563 if (error == 0) { 1564 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1565 jrecord_init(jo, &jrec, -1); 1566 save = jrecord_push(&jrec, JTYPE_SETEXTATTR); 1567 jrecord_write_cred(&jrec, ap->a_td, ap->a_cred); 1568 jrecord_write_vnode_ref(&jrec, ap->a_vp); 1569 jrecord_leaf(&jrec, JLEAF_ATTRNAME, ap->a_name, strlen(ap->a_name)); 1570 jrecord_write_uio(&jrec, JLEAF_FILEDATA, ap->a_uio); 1571 jrecord_pop(&jrec, save); 1572 jrecord_done(&jrec, 0); 1573 } 1574 } 1575 return (error); 1576 } 1577 1578 /* 1579 * Journal vop_ncreate { a_ncp, a_vpp, a_cred, a_vap } 1580 */ 1581 static 1582 int 1583 journal_ncreate(struct vop_ncreate_args *ap) 1584 { 1585 struct mount *mp; 1586 struct journal *jo; 1587 struct jrecord jrec; 1588 void *save; /* warning, save pointers do not always remain valid */ 1589 int error; 1590 1591 error = vop_journal_operate_ap(&ap->a_head); 1592 mp = ap->a_head.a_ops->vv_mount; 1593 if (error == 0) { 1594 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1595 jrecord_init(jo, &jrec, -1); 1596 save = jrecord_push(&jrec, JTYPE_CREATE); 1597 jrecord_write_cred(&jrec, NULL, 
ap->a_cred); 1598 jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp); 1599 if (*ap->a_vpp) 1600 jrecord_write_vnode_ref(&jrec, *ap->a_vpp); 1601 jrecord_pop(&jrec, save); 1602 jrecord_done(&jrec, 0); 1603 } 1604 } 1605 return (error); 1606 } 1607 1608 /* 1609 * Journal vop_nmknod { a_ncp, a_vpp, a_cred, a_vap } 1610 */ 1611 static 1612 int 1613 journal_nmknod(struct vop_nmknod_args *ap) 1614 { 1615 struct mount *mp; 1616 struct journal *jo; 1617 struct jrecord jrec; 1618 void *save; /* warning, save pointers do not always remain valid */ 1619 int error; 1620 1621 error = vop_journal_operate_ap(&ap->a_head); 1622 mp = ap->a_head.a_ops->vv_mount; 1623 if (error == 0) { 1624 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1625 jrecord_init(jo, &jrec, -1); 1626 save = jrecord_push(&jrec, JTYPE_MKNOD); 1627 jrecord_write_cred(&jrec, NULL, ap->a_cred); 1628 jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp); 1629 jrecord_write_vattr(&jrec, ap->a_vap); 1630 if (*ap->a_vpp) 1631 jrecord_write_vnode_ref(&jrec, *ap->a_vpp); 1632 jrecord_pop(&jrec, save); 1633 jrecord_done(&jrec, 0); 1634 } 1635 } 1636 return (error); 1637 } 1638 1639 /* 1640 * Journal vop_nlink { a_ncp, a_vp, a_cred } 1641 */ 1642 static 1643 int 1644 journal_nlink(struct vop_nlink_args *ap) 1645 { 1646 struct mount *mp; 1647 struct journal *jo; 1648 struct jrecord jrec; 1649 void *save; /* warning, save pointers do not always remain valid */ 1650 int error; 1651 1652 error = vop_journal_operate_ap(&ap->a_head); 1653 mp = ap->a_head.a_ops->vv_mount; 1654 if (error == 0) { 1655 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1656 jrecord_init(jo, &jrec, -1); 1657 save = jrecord_push(&jrec, JTYPE_LINK); 1658 jrecord_write_cred(&jrec, NULL, ap->a_cred); 1659 jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp); 1660 jrecord_write_vnode_ref(&jrec, ap->a_vp); 1661 /* XXX PATH to VP and inode number */ 1662 jrecord_pop(&jrec, save); 1663 jrecord_done(&jrec, 0); 1664 } 1665 } 1666 return (error); 1667 } 1668 1669 /* 1670 * 
Journal vop_symlink { a_ncp, a_vpp, a_cred, a_vap, a_target } 1671 */ 1672 static 1673 int 1674 journal_nsymlink(struct vop_nsymlink_args *ap) 1675 { 1676 struct mount *mp; 1677 struct journal *jo; 1678 struct jrecord jrec; 1679 void *save; /* warning, save pointers do not always remain valid */ 1680 int error; 1681 1682 error = vop_journal_operate_ap(&ap->a_head); 1683 mp = ap->a_head.a_ops->vv_mount; 1684 if (error == 0) { 1685 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1686 jrecord_init(jo, &jrec, -1); 1687 save = jrecord_push(&jrec, JTYPE_SYMLINK); 1688 jrecord_write_cred(&jrec, NULL, ap->a_cred); 1689 jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp); 1690 jrecord_leaf(&jrec, JLEAF_SYMLINKDATA, 1691 ap->a_target, strlen(ap->a_target)); 1692 if (*ap->a_vpp) 1693 jrecord_write_vnode_ref(&jrec, *ap->a_vpp); 1694 jrecord_pop(&jrec, save); 1695 jrecord_done(&jrec, 0); 1696 } 1697 } 1698 return (error); 1699 } 1700 1701 /* 1702 * Journal vop_nwhiteout { a_ncp, a_cred, a_flags } 1703 */ 1704 static 1705 int 1706 journal_nwhiteout(struct vop_nwhiteout_args *ap) 1707 { 1708 struct mount *mp; 1709 struct journal *jo; 1710 struct jrecord jrec; 1711 void *save; /* warning, save pointers do not always remain valid */ 1712 int error; 1713 1714 error = vop_journal_operate_ap(&ap->a_head); 1715 mp = ap->a_head.a_ops->vv_mount; 1716 if (error == 0) { 1717 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1718 jrecord_init(jo, &jrec, -1); 1719 save = jrecord_push(&jrec, JTYPE_WHITEOUT); 1720 jrecord_write_cred(&jrec, NULL, ap->a_cred); 1721 jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp); 1722 jrecord_pop(&jrec, save); 1723 jrecord_done(&jrec, 0); 1724 } 1725 } 1726 return (error); 1727 } 1728 1729 /* 1730 * Journal vop_nremove { a_ncp, a_cred } 1731 */ 1732 static 1733 int 1734 journal_nremove(struct vop_nremove_args *ap) 1735 { 1736 struct mount *mp; 1737 struct journal *jo; 1738 struct jrecord jrec; 1739 void *save; /* warning, save pointers do not always remain valid */ 1740 
int error; 1741 1742 error = vop_journal_operate_ap(&ap->a_head); 1743 mp = ap->a_head.a_ops->vv_mount; 1744 if (error == 0) { 1745 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1746 jrecord_init(jo, &jrec, -1); 1747 save = jrecord_push(&jrec, JTYPE_REMOVE); 1748 jrecord_write_cred(&jrec, NULL, ap->a_cred); 1749 jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp); 1750 jrecord_pop(&jrec, save); 1751 jrecord_done(&jrec, 0); 1752 } 1753 } 1754 return (error); 1755 } 1756 1757 /* 1758 * Journal vop_nmkdir { a_ncp, a_vpp, a_cred, a_vap } 1759 */ 1760 static 1761 int 1762 journal_nmkdir(struct vop_nmkdir_args *ap) 1763 { 1764 struct mount *mp; 1765 struct journal *jo; 1766 struct jrecord jrec; 1767 void *save; /* warning, save pointers do not always remain valid */ 1768 int error; 1769 1770 error = vop_journal_operate_ap(&ap->a_head); 1771 mp = ap->a_head.a_ops->vv_mount; 1772 if (error == 0) { 1773 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1774 jrecord_init(jo, &jrec, -1); 1775 if (jo->flags & MC_JOURNAL_WANT_REVERSABLE) { 1776 save = jrecord_push(&jrec, JTYPE_UNDO); 1777 /* XXX undo operations */ 1778 jrecord_pop(&jrec, save); 1779 } 1780 #if 0 1781 if (jo->flags & MC_JOURNAL_WANT_AUDIT) { 1782 jrecord_write_audit(&jrec); 1783 } 1784 #endif 1785 save = jrecord_push(&jrec, JTYPE_MKDIR); 1786 jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp); 1787 jrecord_write_cred(&jrec, NULL, ap->a_cred); 1788 jrecord_write_vattr(&jrec, ap->a_vap); 1789 jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp); 1790 if (*ap->a_vpp) 1791 jrecord_write_vnode_ref(&jrec, *ap->a_vpp); 1792 jrecord_pop(&jrec, save); 1793 jrecord_done(&jrec, 0); 1794 } 1795 } 1796 return (error); 1797 } 1798 1799 /* 1800 * Journal vop_nrmdir { a_ncp, a_cred } 1801 */ 1802 static 1803 int 1804 journal_nrmdir(struct vop_nrmdir_args *ap) 1805 { 1806 struct mount *mp; 1807 struct journal *jo; 1808 struct jrecord jrec; 1809 void *save; /* warning, save pointers do not always remain valid */ 1810 int error; 1811 1812 error 
= vop_journal_operate_ap(&ap->a_head); 1813 mp = ap->a_head.a_ops->vv_mount; 1814 if (error == 0) { 1815 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1816 jrecord_init(jo, &jrec, -1); 1817 save = jrecord_push(&jrec, JTYPE_RMDIR); 1818 jrecord_write_cred(&jrec, NULL, ap->a_cred); 1819 jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp); 1820 jrecord_pop(&jrec, save); 1821 jrecord_done(&jrec, 0); 1822 } 1823 } 1824 return (error); 1825 } 1826 1827 /* 1828 * Journal vop_nrename { a_fncp, a_tncp, a_cred } 1829 */ 1830 static 1831 int 1832 journal_nrename(struct vop_nrename_args *ap) 1833 { 1834 struct mount *mp; 1835 struct journal *jo; 1836 struct jrecord jrec; 1837 void *save; /* warning, save pointers do not always remain valid */ 1838 int error; 1839 1840 error = vop_journal_operate_ap(&ap->a_head); 1841 mp = ap->a_head.a_ops->vv_mount; 1842 if (error == 0) { 1843 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { 1844 jrecord_init(jo, &jrec, -1); 1845 save = jrecord_push(&jrec, JTYPE_RENAME); 1846 jrecord_write_cred(&jrec, NULL, ap->a_cred); 1847 jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_fncp); 1848 jrecord_write_path(&jrec, JLEAF_PATH2, ap->a_tncp); 1849 jrecord_pop(&jrec, save); 1850 jrecord_done(&jrec, 0); 1851 } 1852 } 1853 return (error); 1854 } 1855 1856