1 /* 2 * Copyright (c) 2004-2006 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * $DragonFly: src/sys/kern/vfs_journal.c,v 1.33 2007/05/09 00:53:34 dillon Exp $ 35 */ 36 /* 37 * The journaling protocol is intended to evolve into a two-way stream 38 * whereby transaction IDs can be acknowledged by the journaling target 39 * when the data has been committed to hard storage. Both implicit and 40 * explicit acknowledgement schemes will be supported, depending on the 41 * sophistication of the journaling stream, plus resynchronization and 42 * restart when a journaling stream is interrupted. This information will 43 * also be made available to journaling-aware filesystems to allow better 44 * management of their own physical storage synchronization mechanisms as 45 * well as to allow such filesystems to take direct advantage of the kernel's 46 * journaling layer so they don't have to roll their own. 47 * 48 * In addition, the worker thread will have access to much larger 49 * spooling areas than the memory buffer is able to provide by e.g. 50 * reserving swap space, in order to absorb potentially long interruptions 51 * of off-site journaling streams, and to prevent 'slow' off-site linkages 52 * from radically slowing down local filesystem operations. 53 * 54 * Because of the non-trivial algorithms the journaling system will be 55 * required to support, use of a worker thread is mandatory. Efficiencies 56 * are maintained by utilizing the memory FIFO to batch transactions when 57 * possible, reducing the number of gratuitous thread switches and taking 58 * advantage of cpu caches through the use of shorter batched code paths 59 * rather than trying to do everything in the context of the process 60 * originating the filesystem op. In the future the memory FIFO can be 61 * made per-cpu to remove BGL or other locking requirements. 
62 */ 63 #include <sys/param.h> 64 #include <sys/systm.h> 65 #include <sys/buf.h> 66 #include <sys/conf.h> 67 #include <sys/kernel.h> 68 #include <sys/queue.h> 69 #include <sys/lock.h> 70 #include <sys/malloc.h> 71 #include <sys/mount.h> 72 #include <sys/unistd.h> 73 #include <sys/vnode.h> 74 #include <sys/poll.h> 75 #include <sys/mountctl.h> 76 #include <sys/journal.h> 77 #include <sys/file.h> 78 #include <sys/proc.h> 79 #include <sys/xio.h> 80 #include <sys/socket.h> 81 #include <sys/socketvar.h> 82 83 #include <machine/limits.h> 84 85 #include <vm/vm.h> 86 #include <vm/vm_object.h> 87 #include <vm/vm_page.h> 88 #include <vm/vm_pager.h> 89 #include <vm/vnode_pager.h> 90 91 #include <sys/file2.h> 92 #include <sys/thread2.h> 93 #include <sys/spinlock2.h> 94 95 static void journal_wthread(void *info); 96 static void journal_rthread(void *info); 97 98 static void *journal_reserve(struct journal *jo, 99 struct journal_rawrecbeg **rawpp, 100 int16_t streamid, int bytes); 101 static void *journal_extend(struct journal *jo, 102 struct journal_rawrecbeg **rawpp, 103 int truncbytes, int bytes, int *newstreamrecp); 104 static void journal_abort(struct journal *jo, 105 struct journal_rawrecbeg **rawpp); 106 static void journal_commit(struct journal *jo, 107 struct journal_rawrecbeg **rawpp, 108 int bytes, int closeout); 109 static void jrecord_data(struct jrecord *jrec, 110 void *buf, int bytes, int dtype); 111 112 113 MALLOC_DEFINE(M_JOURNAL, "journal", "Journaling structures"); 114 MALLOC_DEFINE(M_JFIFO, "journal-fifo", "Journal FIFO"); 115 116 void 117 journal_create_threads(struct journal *jo) 118 { 119 jo->flags &= ~(MC_JOURNAL_STOP_REQ | MC_JOURNAL_STOP_IMM); 120 jo->flags |= MC_JOURNAL_WACTIVE; 121 lwkt_create(journal_wthread, jo, NULL, &jo->wthread, 122 TDF_STOPREQ, -1, "journal w:%.*s", JIDMAX, jo->id); 123 lwkt_setpri(&jo->wthread, TDPRI_KERN_DAEMON); 124 lwkt_schedule(&jo->wthread); 125 126 if (jo->flags & MC_JOURNAL_WANT_FULLDUPLEX) { 127 jo->flags |= 
MC_JOURNAL_RACTIVE; 128 lwkt_create(journal_rthread, jo, NULL, &jo->rthread, 129 TDF_STOPREQ, -1, "journal r:%.*s", JIDMAX, jo->id); 130 lwkt_setpri(&jo->rthread, TDPRI_KERN_DAEMON); 131 lwkt_schedule(&jo->rthread); 132 } 133 } 134 135 void 136 journal_destroy_threads(struct journal *jo, int flags) 137 { 138 int wcount; 139 140 jo->flags |= MC_JOURNAL_STOP_REQ | (flags & MC_JOURNAL_STOP_IMM); 141 wakeup(&jo->fifo); 142 wcount = 0; 143 while (jo->flags & (MC_JOURNAL_WACTIVE | MC_JOURNAL_RACTIVE)) { 144 tsleep(jo, 0, "jwait", hz); 145 if (++wcount % 10 == 0) { 146 kprintf("Warning: journal %s waiting for descriptors to close\n", 147 jo->id); 148 } 149 } 150 151 /* 152 * XXX SMP - threads should move to cpu requesting the restart or 153 * termination before finishing up to properly interlock. 154 */ 155 tsleep(jo, 0, "jwait", hz); 156 lwkt_free_thread(&jo->wthread); 157 if (jo->flags & MC_JOURNAL_WANT_FULLDUPLEX) 158 lwkt_free_thread(&jo->rthread); 159 } 160 161 /* 162 * The per-journal worker thread is responsible for writing out the 163 * journal's FIFO to the target stream. 164 */ 165 static void 166 journal_wthread(void *info) 167 { 168 struct journal *jo = info; 169 struct journal_rawrecbeg *rawp; 170 int error; 171 size_t avail; 172 size_t bytes; 173 size_t res; 174 175 for (;;) { 176 /* 177 * Calculate the number of bytes available to write. This buffer 178 * area may contain reserved records so we can't just write it out 179 * without further checks. 180 */ 181 bytes = jo->fifo.windex - jo->fifo.rindex; 182 183 /* 184 * sleep if no bytes are available or if an incomplete record is 185 * encountered (it needs to be filled in before we can write it 186 * out), and skip any pad records that we encounter. 187 */ 188 if (bytes == 0) { 189 if (jo->flags & MC_JOURNAL_STOP_REQ) 190 break; 191 tsleep(&jo->fifo, 0, "jfifo", hz); 192 continue; 193 } 194 195 /* 196 * Sleep if we can not go any further due to hitting an incomplete 197 * record. 
This case should occur rarely but may have to be better 198 * optimized XXX. 199 */ 200 rawp = (void *)(jo->fifo.membase + (jo->fifo.rindex & jo->fifo.mask)); 201 if (rawp->begmagic == JREC_INCOMPLETEMAGIC) { 202 tsleep(&jo->fifo, 0, "jpad", hz); 203 continue; 204 } 205 206 /* 207 * Skip any pad records. We do not write out pad records if we can 208 * help it. 209 */ 210 if (rawp->streamid == JREC_STREAMID_PAD) { 211 if ((jo->flags & MC_JOURNAL_WANT_FULLDUPLEX) == 0) { 212 if (jo->fifo.rindex == jo->fifo.xindex) { 213 jo->fifo.xindex += (rawp->recsize + 15) & ~15; 214 jo->total_acked += (rawp->recsize + 15) & ~15; 215 } 216 } 217 jo->fifo.rindex += (rawp->recsize + 15) & ~15; 218 jo->total_acked += bytes; 219 KKASSERT(jo->fifo.windex - jo->fifo.rindex >= 0); 220 continue; 221 } 222 223 /* 224 * 'bytes' is the amount of data that can potentially be written out. 225 * Calculate 'res', the amount of data that can actually be written 226 * out. res is bounded either by hitting the end of the physical 227 * memory buffer or by hitting an incomplete record. Incomplete 228 * records often occur due to the way the space reservation model 229 * works. 230 */ 231 res = 0; 232 avail = jo->fifo.size - (jo->fifo.rindex & jo->fifo.mask); 233 while (res < bytes && rawp->begmagic == JREC_BEGMAGIC) { 234 res += (rawp->recsize + 15) & ~15; 235 if (res >= avail) { 236 KKASSERT(res == avail); 237 break; 238 } 239 rawp = (void *)((char *)rawp + ((rawp->recsize + 15) & ~15)); 240 } 241 242 /* 243 * Issue the write and deal with any errors or other conditions. 244 * For now assume blocking I/O. Since we are record-aware the 245 * code cannot yet handle partial writes. 246 * 247 * We bump rindex prior to issuing the write to avoid racing 248 * the acknowledgement coming back (which could prevent the ack 249 * from bumping xindex). Restarts are always based on xindex so 250 * we do not try to undo the rindex if an error occurs. 
251 * 252 * XXX EWOULDBLOCK/NBIO 253 * XXX notification on failure 254 * XXX permanent verses temporary failures 255 * XXX two-way acknowledgement stream in the return direction / xindex 256 */ 257 bytes = res; 258 jo->fifo.rindex += bytes; 259 error = fp_write(jo->fp, 260 jo->fifo.membase + 261 ((jo->fifo.rindex - bytes) & jo->fifo.mask), 262 bytes, &res, UIO_SYSSPACE); 263 if (error) { 264 kprintf("journal_thread(%s) write, error %d\n", jo->id, error); 265 /* XXX */ 266 } else { 267 KKASSERT(res == bytes); 268 } 269 270 /* 271 * Advance rindex. If the journal stream is not full duplex we also 272 * advance xindex, otherwise the rjournal thread is responsible for 273 * advancing xindex. 274 */ 275 if ((jo->flags & MC_JOURNAL_WANT_FULLDUPLEX) == 0) { 276 jo->fifo.xindex += bytes; 277 jo->total_acked += bytes; 278 } 279 KKASSERT(jo->fifo.windex - jo->fifo.rindex >= 0); 280 if ((jo->flags & MC_JOURNAL_WANT_FULLDUPLEX) == 0) { 281 if (jo->flags & MC_JOURNAL_WWAIT) { 282 jo->flags &= ~MC_JOURNAL_WWAIT; /* XXX hysteresis */ 283 wakeup(&jo->fifo.windex); 284 } 285 } 286 } 287 fp_shutdown(jo->fp, SHUT_WR); 288 jo->flags &= ~MC_JOURNAL_WACTIVE; 289 wakeup(jo); 290 wakeup(&jo->fifo.windex); 291 } 292 293 /* 294 * A second per-journal worker thread is created for two-way journaling 295 * streams to deal with the return acknowledgement stream. 296 */ 297 static void 298 journal_rthread(void *info) 299 { 300 struct journal_rawrecbeg *rawp; 301 struct journal_ackrecord ack; 302 struct journal *jo = info; 303 int64_t transid; 304 int error; 305 size_t count; 306 size_t bytes; 307 308 transid = 0; 309 error = 0; 310 311 for (;;) { 312 /* 313 * We have been asked to stop 314 */ 315 if (jo->flags & MC_JOURNAL_STOP_REQ) 316 break; 317 318 /* 319 * If we have no active transaction id, get one from the return 320 * stream. 
321 */ 322 if (transid == 0) { 323 error = fp_read(jo->fp, &ack, sizeof(ack), &count, 324 1, UIO_SYSSPACE); 325 #if 0 326 kprintf("fp_read ack error %d count %d\n", error, count); 327 #endif 328 if (error || count != sizeof(ack)) 329 break; 330 if (error) { 331 kprintf("read error %d on receive stream\n", error); 332 break; 333 } 334 if (ack.rbeg.begmagic != JREC_BEGMAGIC || 335 ack.rend.endmagic != JREC_ENDMAGIC 336 ) { 337 kprintf("bad begmagic or endmagic on receive stream\n"); 338 break; 339 } 340 transid = ack.rbeg.transid; 341 } 342 343 /* 344 * Calculate the number of unacknowledged bytes. If there are no 345 * unacknowledged bytes then unsent data was acknowledged, report, 346 * sleep a bit, and loop in that case. This should not happen 347 * normally. The ack record is thrown away. 348 */ 349 bytes = jo->fifo.rindex - jo->fifo.xindex; 350 351 if (bytes == 0) { 352 kprintf("warning: unsent data acknowledged transid %08llx\n", 353 (long long)transid); 354 tsleep(&jo->fifo.xindex, 0, "jrseq", hz); 355 transid = 0; 356 continue; 357 } 358 359 /* 360 * Since rindex has advanced, the record pointed to by xindex 361 * must be a valid record. 362 */ 363 rawp = (void *)(jo->fifo.membase + (jo->fifo.xindex & jo->fifo.mask)); 364 KKASSERT(rawp->begmagic == JREC_BEGMAGIC); 365 KKASSERT(rawp->recsize <= bytes); 366 367 /* 368 * The target can acknowledge several records at once. 
369 */ 370 if (rawp->transid < transid) { 371 #if 1 372 kprintf("ackskip %08llx/%08llx\n", 373 (long long)rawp->transid, 374 (long long)transid); 375 #endif 376 jo->fifo.xindex += (rawp->recsize + 15) & ~15; 377 jo->total_acked += (rawp->recsize + 15) & ~15; 378 if (jo->flags & MC_JOURNAL_WWAIT) { 379 jo->flags &= ~MC_JOURNAL_WWAIT; /* XXX hysteresis */ 380 wakeup(&jo->fifo.windex); 381 } 382 continue; 383 } 384 if (rawp->transid == transid) { 385 #if 1 386 kprintf("ackskip %08llx/%08llx\n", 387 (long long)rawp->transid, 388 (long long)transid); 389 #endif 390 jo->fifo.xindex += (rawp->recsize + 15) & ~15; 391 jo->total_acked += (rawp->recsize + 15) & ~15; 392 if (jo->flags & MC_JOURNAL_WWAIT) { 393 jo->flags &= ~MC_JOURNAL_WWAIT; /* XXX hysteresis */ 394 wakeup(&jo->fifo.windex); 395 } 396 transid = 0; 397 continue; 398 } 399 kprintf("warning: unsent data(2) acknowledged transid %08llx\n", 400 (long long)transid); 401 transid = 0; 402 } 403 jo->flags &= ~MC_JOURNAL_RACTIVE; 404 wakeup(jo); 405 wakeup(&jo->fifo.windex); 406 } 407 408 /* 409 * This builds a pad record which the journaling thread will skip over. Pad 410 * records are required when we are unable to reserve sufficient stream space 411 * due to insufficient space at the end of the physical memory fifo. 412 * 413 * Even though the record is not transmitted, a normal transid must be 414 * assigned to it so link recovery operations after a failure work properly. 415 */ 416 static 417 void 418 journal_build_pad(struct journal_rawrecbeg *rawp, int recsize, int64_t transid) 419 { 420 struct journal_rawrecend *rendp; 421 422 KKASSERT((recsize & 15) == 0 && recsize >= 16); 423 424 rawp->streamid = JREC_STREAMID_PAD; 425 rawp->recsize = recsize; /* must be 16-byte aligned */ 426 rawp->transid = transid; 427 /* 428 * WARNING, rendp may overlap rawp->transid. This is necessary to 429 * allow PAD records to fit in 16 bytes. Use cpu_ccfence() to 430 * hopefully cause the compiler to not make any assumptions. 
431 */ 432 rendp = (void *)((char *)rawp + rawp->recsize - sizeof(*rendp)); 433 rendp->endmagic = JREC_ENDMAGIC; 434 rendp->check = 0; 435 rendp->recsize = rawp->recsize; 436 437 /* 438 * Set the begin magic last. This is what will allow the journal 439 * thread to write the record out. Use a store fence to prevent 440 * compiler and cpu reordering of the writes. 441 */ 442 cpu_sfence(); 443 rawp->begmagic = JREC_BEGMAGIC; 444 } 445 446 /* 447 * Wake up the worker thread if the FIFO is more then half full or if 448 * someone is waiting for space to be freed up. Otherwise let the 449 * heartbeat deal with it. Being able to avoid waking up the worker 450 * is the key to the journal's cpu performance. 451 */ 452 static __inline 453 void 454 journal_commit_wakeup(struct journal *jo) 455 { 456 int avail; 457 458 avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex); 459 KKASSERT(avail >= 0); 460 if ((avail < (jo->fifo.size >> 1)) || (jo->flags & MC_JOURNAL_WWAIT)) 461 wakeup(&jo->fifo); 462 } 463 464 /* 465 * Create a new BEGIN stream record with the specified streamid and the 466 * specified amount of payload space. *rawpp will be set to point to the 467 * base of the new stream record and a pointer to the base of the payload 468 * space will be returned. *rawpp does not need to be pre-NULLd prior to 469 * making this call. The raw record header will be partially initialized. 470 * 471 * A stream can be extended, aborted, or committed by other API calls 472 * below. This may result in a sequence of potentially disconnected 473 * stream records to be output to the journaling target. The first record 474 * (the one created by this function) will be marked JREC_STREAMCTL_BEGIN, 475 * while the last record on commit or abort will be marked JREC_STREAMCTL_END 476 * (and possibly also JREC_STREAMCTL_ABORTED). The last record could wind 477 * up being the same as the first, in which case the bits are all set in 478 * the first record. 
479 * 480 * The stream record is created in an incomplete state by setting the begin 481 * magic to JREC_INCOMPLETEMAGIC. This prevents the worker thread from 482 * flushing the fifo past our record until we have finished populating it. 483 * Other threads can reserve and operate on their own space without stalling 484 * but the stream output will stall until we have completed operations. The 485 * memory FIFO is intended to be large enough to absorb such situations 486 * without stalling out other threads. 487 */ 488 static 489 void * 490 journal_reserve(struct journal *jo, struct journal_rawrecbeg **rawpp, 491 int16_t streamid, int bytes) 492 { 493 struct journal_rawrecbeg *rawp; 494 int avail; 495 int availtoend; 496 int req; 497 498 /* 499 * Add header and trailer overheads to the passed payload. Note that 500 * the passed payload size need not be aligned in any way. 501 */ 502 bytes += sizeof(struct journal_rawrecbeg); 503 bytes += sizeof(struct journal_rawrecend); 504 505 for (;;) { 506 /* 507 * First, check boundary conditions. If the request would wrap around 508 * we have to skip past the ending block and return to the beginning 509 * of the FIFO's buffer. Calculate 'req' which is the actual number 510 * of bytes being reserved, including wrap-around dead space. 511 * 512 * Neither 'bytes' or 'req' are aligned. 513 * 514 * Note that availtoend is not truncated to avail and so cannot be 515 * used to determine whether the reservation is possible by itself. 516 * Also, since all fifo ops are 16-byte aligned, we can check 517 * the size before calculating the aligned size. 518 */ 519 availtoend = jo->fifo.size - (jo->fifo.windex & jo->fifo.mask); 520 KKASSERT((availtoend & 15) == 0); 521 if (bytes > availtoend) 522 req = bytes + availtoend; /* add pad to end */ 523 else 524 req = bytes; 525 526 /* 527 * Next calculate the total available space and see if it is 528 * sufficient. 
We cannot overwrite previously buffered data 529 * past xindex because otherwise we would not be able to restart 530 * a broken link at the target's last point of commit. 531 */ 532 avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex); 533 KKASSERT(avail >= 0 && (avail & 15) == 0); 534 535 if (avail < req) { 536 /* XXX MC_JOURNAL_STOP_IMM */ 537 jo->flags |= MC_JOURNAL_WWAIT; 538 ++jo->fifostalls; 539 tsleep(&jo->fifo.windex, 0, "jwrite", 0); 540 continue; 541 } 542 543 /* 544 * Create a pad record for any dead space and create an incomplete 545 * record for the live space, then return a pointer to the 546 * contiguous buffer space that was requested. 547 * 548 * NOTE: The worker thread will not flush past an incomplete 549 * record, so the reserved space can be filled in at-will. The 550 * journaling code must also be aware the reserved sections occuring 551 * after this one will also not be written out even if completed 552 * until this one is completed. 553 * 554 * The transaction id must accomodate real and potential pad creation. 555 */ 556 rawp = (void *)(jo->fifo.membase + (jo->fifo.windex & jo->fifo.mask)); 557 if (req != bytes) { 558 journal_build_pad(rawp, availtoend, jo->transid); 559 ++jo->transid; 560 rawp = (void *)jo->fifo.membase; 561 } 562 rawp->begmagic = JREC_INCOMPLETEMAGIC; /* updated by abort/commit */ 563 rawp->recsize = bytes; /* (unaligned size) */ 564 rawp->streamid = streamid | JREC_STREAMCTL_BEGIN; 565 rawp->transid = jo->transid; 566 jo->transid += 2; 567 568 /* 569 * Issue a memory barrier to guarentee that the record data has been 570 * properly initialized before we advance the write index and return 571 * a pointer to the reserved record. Otherwise the worker thread 572 * could accidently run past us. 573 * 574 * Note that stream records are always 16-byte aligned. 
575 */ 576 cpu_sfence(); 577 jo->fifo.windex += (req + 15) & ~15; 578 *rawpp = rawp; 579 return(rawp + 1); 580 } 581 /* not reached */ 582 *rawpp = NULL; 583 return(NULL); 584 } 585 586 /* 587 * Attempt to extend the stream record by <bytes> worth of payload space. 588 * 589 * If it is possible to extend the existing stream record no truncation 590 * occurs and the record is extended as specified. A pointer to the 591 * truncation offset within the payload space is returned. 592 * 593 * If it is not possible to do this the existing stream record is truncated 594 * and committed, and a new stream record of size <bytes> is created. A 595 * pointer to the base of the new stream record's payload space is returned. 596 * 597 * *rawpp is set to the new reservation in the case of a new record but 598 * the caller cannot depend on a comparison with the old rawp to determine if 599 * this case occurs because we could end up using the same memory FIFO 600 * offset for the new stream record. Use *newstreamrecp instead. 601 */ 602 static void * 603 journal_extend(struct journal *jo, struct journal_rawrecbeg **rawpp, 604 int truncbytes, int bytes, int *newstreamrecp) 605 { 606 struct journal_rawrecbeg *rawp; 607 int16_t streamid; 608 int availtoend; 609 int avail; 610 int osize; 611 int nsize; 612 int wbase; 613 void *rptr; 614 615 *newstreamrecp = 0; 616 rawp = *rawpp; 617 osize = (rawp->recsize + 15) & ~15; 618 nsize = (rawp->recsize + bytes + 15) & ~15; 619 wbase = (char *)rawp - jo->fifo.membase; 620 621 /* 622 * If the aligned record size does not change we can trivially adjust 623 * the record size. 624 */ 625 if (nsize == osize) { 626 rawp->recsize += bytes; 627 return((char *)(rawp + 1) + truncbytes); 628 } 629 630 /* 631 * If the fifo's write index hasn't been modified since we made the 632 * reservation and we do not hit any boundary conditions, we can 633 * trivially make the record smaller or larger. 
634 */ 635 if ((jo->fifo.windex & jo->fifo.mask) == wbase + osize) { 636 availtoend = jo->fifo.size - wbase; 637 avail = jo->fifo.size - (jo->fifo.windex - jo->fifo.xindex) + osize; 638 KKASSERT((availtoend & 15) == 0); 639 KKASSERT((avail & 15) == 0); 640 if (nsize <= avail && nsize <= availtoend) { 641 jo->fifo.windex += nsize - osize; 642 rawp->recsize += bytes; 643 return((char *)(rawp + 1) + truncbytes); 644 } 645 } 646 647 /* 648 * It was not possible to extend the buffer. Commit the current 649 * buffer and create a new one. We manually clear the BEGIN mark that 650 * journal_reserve() creates (because this is a continuing record, not 651 * the start of a new stream). 652 */ 653 streamid = rawp->streamid & JREC_STREAMID_MASK; 654 journal_commit(jo, rawpp, truncbytes, 0); 655 rptr = journal_reserve(jo, rawpp, streamid, bytes); 656 rawp = *rawpp; 657 rawp->streamid &= ~JREC_STREAMCTL_BEGIN; 658 *newstreamrecp = 1; 659 return(rptr); 660 } 661 662 /* 663 * Abort a journal record. If the transaction record represents a stream 664 * BEGIN and we can reverse the fifo's write index we can simply reverse 665 * index the entire record, as if it were never reserved in the first place. 666 * 667 * Otherwise we set the JREC_STREAMCTL_ABORTED bit and commit the record 668 * with the payload truncated to 0 bytes. 669 */ 670 static void 671 journal_abort(struct journal *jo, struct journal_rawrecbeg **rawpp) 672 { 673 struct journal_rawrecbeg *rawp; 674 int osize; 675 676 rawp = *rawpp; 677 osize = (rawp->recsize + 15) & ~15; 678 679 if ((rawp->streamid & JREC_STREAMCTL_BEGIN) && 680 (jo->fifo.windex & jo->fifo.mask) == 681 (char *)rawp - jo->fifo.membase + osize) 682 { 683 jo->fifo.windex -= osize; 684 *rawpp = NULL; 685 } else { 686 rawp->streamid |= JREC_STREAMCTL_ABORTED; 687 journal_commit(jo, rawpp, 0, 1); 688 } 689 } 690 691 /* 692 * Commit a journal record and potentially truncate it to the specified 693 * number of payload bytes. 
If you do not want to truncate the record, 694 * simply pass -1 for the bytes parameter. Do not pass rawp->recsize, that 695 * field includes header and trailer and will not be correct. Note that 696 * passing 0 will truncate the entire data payload of the record. 697 * 698 * The logical stream is terminated by this function. 699 * 700 * If truncation occurs, and it is not possible to physically optimize the 701 * memory FIFO due to other threads having reserved space after ours, 702 * the remaining reserved space will be covered by a pad record. 703 */ 704 static void 705 journal_commit(struct journal *jo, struct journal_rawrecbeg **rawpp, 706 int bytes, int closeout) 707 { 708 struct journal_rawrecbeg *rawp; 709 struct journal_rawrecend *rendp; 710 int osize; 711 int nsize; 712 713 rawp = *rawpp; 714 *rawpp = NULL; 715 716 KKASSERT((char *)rawp >= jo->fifo.membase && 717 (char *)rawp + rawp->recsize <= jo->fifo.membase + jo->fifo.size); 718 KKASSERT(((intptr_t)rawp & 15) == 0); 719 720 /* 721 * Truncate the record if necessary. If the FIFO write index as still 722 * at the end of our record we can optimally backindex it. Otherwise 723 * we have to insert a pad record to cover the dead space. 724 * 725 * We calculate osize which is the 16-byte-aligned original recsize. 726 * We calculate nsize which is the 16-byte-aligned new recsize. 727 * 728 * Due to alignment issues or in case the passed truncation bytes is 729 * the same as the original payload, nsize may be equal to osize even 730 * if the committed bytes is less then the originally reserved bytes. 
731 */ 732 if (bytes >= 0) { 733 KKASSERT(bytes >= 0 && bytes <= rawp->recsize - sizeof(struct journal_rawrecbeg) - sizeof(struct journal_rawrecend)); 734 osize = (rawp->recsize + 15) & ~15; 735 rawp->recsize = bytes + sizeof(struct journal_rawrecbeg) + 736 sizeof(struct journal_rawrecend); 737 nsize = (rawp->recsize + 15) & ~15; 738 KKASSERT(nsize <= osize); 739 if (osize == nsize) { 740 /* do nothing */ 741 } else if ((jo->fifo.windex & jo->fifo.mask) == (char *)rawp - jo->fifo.membase + osize) { 742 /* we are able to backindex the fifo */ 743 jo->fifo.windex -= osize - nsize; 744 } else { 745 /* we cannot backindex the fifo, emplace a pad in the dead space */ 746 journal_build_pad((void *)((char *)rawp + nsize), osize - nsize, 747 rawp->transid + 1); 748 } 749 } 750 751 /* 752 * Fill in the trailer. Note that unlike pad records, the trailer will 753 * never overlap the header. 754 */ 755 rendp = (void *)((char *)rawp + 756 ((rawp->recsize + 15) & ~15) - sizeof(*rendp)); 757 rendp->endmagic = JREC_ENDMAGIC; 758 rendp->recsize = rawp->recsize; 759 rendp->check = 0; /* XXX check word, disabled for now */ 760 761 /* 762 * Fill in begmagic last. This will allow the worker thread to proceed. 763 * Use a memory barrier to guarentee write ordering. Mark the stream 764 * as terminated if closeout is set. This is the typical case. 765 */ 766 if (closeout) 767 rawp->streamid |= JREC_STREAMCTL_END; 768 cpu_sfence(); /* memory and compiler barrier */ 769 rawp->begmagic = JREC_BEGMAGIC; 770 771 journal_commit_wakeup(jo); 772 } 773 774 /************************************************************************ 775 * TRANSACTION SUPPORT ROUTINES * 776 ************************************************************************ 777 * 778 * JRECORD_*() - routines to create subrecord transactions and embed them 779 * in the logical streams managed by the journal_*() routines. 
780 */ 781 782 /* 783 * Initialize the passed jrecord structure and start a new stream transaction 784 * by reserving an initial build space in the journal's memory FIFO. 785 */ 786 void 787 jrecord_init(struct journal *jo, struct jrecord *jrec, int16_t streamid) 788 { 789 bzero(jrec, sizeof(*jrec)); 790 jrec->jo = jo; 791 jrec->streamid = streamid; 792 jrec->stream_residual = JREC_DEFAULTSIZE; 793 jrec->stream_reserved = jrec->stream_residual; 794 jrec->stream_ptr = 795 journal_reserve(jo, &jrec->rawp, streamid, jrec->stream_reserved); 796 } 797 798 /* 799 * Push a recursive record type. All pushes should have matching pops. 800 * The old parent is returned and the newly pushed record becomes the 801 * new parent. Note that the old parent's pointer may already be invalid 802 * or may become invalid if jrecord_write() had to build a new stream 803 * record, so the caller should not mess with the returned pointer in 804 * any way other then to save it. 805 */ 806 struct journal_subrecord * 807 jrecord_push(struct jrecord *jrec, int16_t rectype) 808 { 809 struct journal_subrecord *save; 810 811 save = jrec->parent; 812 jrec->parent = jrecord_write(jrec, rectype|JMASK_NESTED, 0); 813 jrec->last = NULL; 814 KKASSERT(jrec->parent != NULL); 815 ++jrec->pushcount; 816 ++jrec->pushptrgood; /* cleared on flush */ 817 return(save); 818 } 819 820 /* 821 * Pop a previously pushed sub-transaction. We must set JMASK_LAST 822 * on the last record written within the subtransaction. If the last 823 * record written is not accessible or if the subtransaction is empty, 824 * we must write out a pad record with JMASK_LAST set before popping. 825 * 826 * When popping a subtransaction the parent record's recsize field 827 * will be properly set. If the parent pointer is no longer valid 828 * (which can occur if the data has already been flushed out to the 829 * stream), the protocol spec allows us to leave it 0. 
830 * 831 * The saved parent pointer which we restore may or may not be valid, 832 * and if not valid may or may not be NULL, depending on the value 833 * of pushptrgood. 834 */ 835 void 836 jrecord_pop(struct jrecord *jrec, struct journal_subrecord *save) 837 { 838 struct journal_subrecord *last; 839 840 KKASSERT(jrec->pushcount > 0); 841 KKASSERT(jrec->residual == 0); 842 843 /* 844 * Set JMASK_LAST on the last record we wrote at the current 845 * level. If last is NULL we either no longer have access to the 846 * record or the subtransaction was empty and we must write out a pad 847 * record. 848 */ 849 if ((last = jrec->last) == NULL) { 850 jrecord_write(jrec, JLEAF_PAD|JMASK_LAST, 0); 851 last = jrec->last; /* reload after possible flush */ 852 } else { 853 last->rectype |= JMASK_LAST; 854 } 855 856 /* 857 * pushptrgood tells us how many levels of parent record pointers 858 * are valid. The jrec only stores the current parent record pointer 859 * (and it is only valid if pushptrgood != 0). The higher level parent 860 * record pointers are saved by the routines calling jrecord_push() and 861 * jrecord_pop(). These pointers may become stale and we determine 862 * that fact by tracking the count of valid parent pointers with 863 * pushptrgood. Pointers become invalid when their related stream 864 * record gets pushed out. 865 * 866 * If no pointer is available (the data has already been pushed out), 867 * then no fixup of e.g. the length field is possible for non-leaf 868 * nodes. The protocol allows for this situation by placing a larger 869 * burden on the program scanning the stream on the other end. 870 * 871 * [parentA] 872 * [node X] 873 * [parentB] 874 * [node Y] 875 * [node Z] 876 * (pop B) see NOTE B 877 * (pop A) see NOTE A 878 * 879 * NOTE B: This pop sets LAST in node Z if the node is still accessible, 880 * else a PAD record is appended and LAST is set in that. 
881 * 882 * This pop sets the record size in parentB if parentB is still 883 * accessible, else the record size is left 0 (the scanner must 884 * deal with that). 885 * 886 * This pop sets the new 'last' record to parentB, the pointer 887 * to which may or may not still be accessible. 888 * 889 * NOTE A: This pop sets LAST in parentB if the node is still accessible, 890 * else a PAD record is appended and LAST is set in that. 891 * 892 * This pop sets the record size in parentA if parentA is still 893 * accessible, else the record size is left 0 (the scanner must 894 * deal with that). 895 * 896 * This pop sets the new 'last' record to parentA, the pointer 897 * to which may or may not still be accessible. 898 * 899 * Also note that the last record in the stream transaction, which in 900 * the above example is parentA, does not currently have the LAST bit 901 * set. 902 * 903 * The current parent becomes the last record relative to the 904 * saved parent passed into us. It's validity is based on 905 * whether pushptrgood is non-zero prior to decrementing. The saved 906 * parent becomes the new parent, and its validity is based on whether 907 * pushptrgood is non-zero after decrementing. 908 * 909 * The old jrec->parent may be NULL if it is no longer accessible. 910 * If pushptrgood is non-zero, however, it is guarenteed to not 911 * be NULL (since no flush occured). 912 */ 913 jrec->last = jrec->parent; 914 --jrec->pushcount; 915 if (jrec->pushptrgood) { 916 KKASSERT(jrec->last != NULL && last != NULL); 917 if (--jrec->pushptrgood == 0) { 918 jrec->parent = NULL; /* 'save' contains garbage or NULL */ 919 } else { 920 KKASSERT(save != NULL); 921 jrec->parent = save; /* 'save' must not be NULL */ 922 } 923 924 /* 925 * Set the record size in the old parent. 
'last' still points to 926 * the original last record in the subtransaction being popped, 927 * jrec->last points to the old parent (which became the last 928 * record relative to the new parent being popped into). 929 */ 930 jrec->last->recsize = (char *)last + last->recsize - (char *)jrec->last; 931 } else { 932 jrec->parent = NULL; 933 KKASSERT(jrec->last == NULL); 934 } 935 } 936 937 /* 938 * Write out a leaf record, including associated data. 939 */ 940 void 941 jrecord_leaf(struct jrecord *jrec, int16_t rectype, void *ptr, int bytes) 942 { 943 jrecord_write(jrec, rectype, bytes); 944 jrecord_data(jrec, ptr, bytes, JDATA_KERN); 945 } 946 947 void 948 jrecord_leaf_uio(struct jrecord *jrec, int16_t rectype, 949 struct uio *uio) 950 { 951 struct iovec *iov; 952 int i; 953 954 for (i = 0; i < uio->uio_iovcnt; ++i) { 955 iov = &uio->uio_iov[i]; 956 if (iov->iov_len == 0) 957 continue; 958 if (uio->uio_segflg == UIO_SYSSPACE) { 959 jrecord_write(jrec, rectype, iov->iov_len); 960 jrecord_data(jrec, iov->iov_base, iov->iov_len, JDATA_KERN); 961 } else { /* UIO_USERSPACE */ 962 jrecord_write(jrec, rectype, iov->iov_len); 963 jrecord_data(jrec, iov->iov_base, iov->iov_len, JDATA_USER); 964 } 965 } 966 } 967 968 void 969 jrecord_leaf_xio(struct jrecord *jrec, int16_t rectype, xio_t xio) 970 { 971 int bytes = xio->xio_npages * PAGE_SIZE; 972 973 jrecord_write(jrec, rectype, bytes); 974 jrecord_data(jrec, xio, bytes, JDATA_XIO); 975 } 976 977 /* 978 * Write a leaf record out and return a pointer to its base. The leaf 979 * record may contain potentially megabytes of data which is supplied 980 * in jrecord_data() calls. The exact amount must be specified in this 981 * call. 982 * 983 * THE RETURNED SUBRECORD POINTER IS ONLY VALID IMMEDIATELY AFTER THE 984 * CALL AND MAY BECOME INVALID AT ANY TIME. ONLY THE PUSH/POP CODE SHOULD 985 * USE THE RETURN VALUE. 
986 */ 987 struct journal_subrecord * 988 jrecord_write(struct jrecord *jrec, int16_t rectype, int bytes) 989 { 990 struct journal_subrecord *last; 991 int pusheditout; 992 993 /* 994 * Try to catch some obvious errors. Nesting records must specify a 995 * size of 0, and there should be no left-overs from previous operations 996 * (such as incomplete data writeouts). 997 */ 998 KKASSERT(bytes == 0 || (rectype & JMASK_NESTED) == 0); 999 KKASSERT(jrec->residual == 0); 1000 1001 /* 1002 * Check to see if the current stream record has enough room for 1003 * the new subrecord header. If it doesn't we extend the current 1004 * stream record. 1005 * 1006 * This may have the side effect of pushing out the current stream record 1007 * and creating a new one. We must adjust our stream tracking fields 1008 * accordingly. 1009 */ 1010 if (jrec->stream_residual < sizeof(struct journal_subrecord)) { 1011 jrec->stream_ptr = journal_extend(jrec->jo, &jrec->rawp, 1012 jrec->stream_reserved - jrec->stream_residual, 1013 JREC_DEFAULTSIZE, &pusheditout); 1014 if (pusheditout) { 1015 /* 1016 * If a pushout occured, the pushed out stream record was 1017 * truncated as specified and the new record is exactly the 1018 * extension size specified. 1019 */ 1020 jrec->stream_reserved = JREC_DEFAULTSIZE; 1021 jrec->stream_residual = JREC_DEFAULTSIZE; 1022 jrec->parent = NULL; /* no longer accessible */ 1023 jrec->pushptrgood = 0; /* restored parents in pops no good */ 1024 } else { 1025 /* 1026 * If no pushout occured the stream record is NOT truncated and 1027 * IS extended. 1028 */ 1029 jrec->stream_reserved += JREC_DEFAULTSIZE; 1030 jrec->stream_residual += JREC_DEFAULTSIZE; 1031 } 1032 } 1033 last = (void *)jrec->stream_ptr; 1034 last->rectype = rectype; 1035 last->reserved = 0; 1036 1037 /* 1038 * We may not know the record size for recursive records and the 1039 * header may become unavailable due to limited FIFO space. Write 1040 * -1 to indicate this special case. 
1041 */ 1042 if ((rectype & JMASK_NESTED) && bytes == 0) 1043 last->recsize = -1; 1044 else 1045 last->recsize = sizeof(struct journal_subrecord) + bytes; 1046 jrec->last = last; 1047 jrec->residual = bytes; /* remaining data to be posted */ 1048 jrec->residual_align = -bytes & 7; /* post-data alignment required */ 1049 jrec->stream_ptr += sizeof(*last); /* current write pointer */ 1050 jrec->stream_residual -= sizeof(*last); /* space remaining in stream */ 1051 return(last); 1052 } 1053 1054 /* 1055 * Write out the data associated with a leaf record. Any number of calls 1056 * to this routine may be made as long as the byte count adds up to the 1057 * amount originally specified in jrecord_write(). 1058 * 1059 * The act of writing out the leaf data may result in numerous stream records 1060 * being pushed out. Callers should be aware that even the associated 1061 * subrecord header may become inaccessible due to stream record pushouts. 1062 */ 1063 static void 1064 jrecord_data(struct jrecord *jrec, void *buf, int bytes, int dtype) 1065 { 1066 int pusheditout; 1067 int extsize; 1068 int xio_offset = 0; 1069 1070 KKASSERT(bytes >= 0 && bytes <= jrec->residual); 1071 1072 /* 1073 * Push out stream records as long as there is insufficient room to hold 1074 * the remaining data. 1075 */ 1076 while (jrec->stream_residual < bytes) { 1077 /* 1078 * Fill in any remaining space in the current stream record. 
1079 */ 1080 switch (dtype) { 1081 case JDATA_KERN: 1082 bcopy(buf, jrec->stream_ptr, jrec->stream_residual); 1083 break; 1084 case JDATA_USER: 1085 copyin(buf, jrec->stream_ptr, jrec->stream_residual); 1086 break; 1087 case JDATA_XIO: 1088 xio_copy_xtok((xio_t)buf, xio_offset, jrec->stream_ptr, 1089 jrec->stream_residual); 1090 xio_offset += jrec->stream_residual; 1091 break; 1092 } 1093 if (dtype != JDATA_XIO) 1094 buf = (char *)buf + jrec->stream_residual; 1095 bytes -= jrec->stream_residual; 1096 /*jrec->stream_ptr += jrec->stream_residual;*/ 1097 jrec->residual -= jrec->stream_residual; 1098 jrec->stream_residual = 0; 1099 1100 /* 1101 * Try to extend the current stream record, but no more then 1/4 1102 * the size of the FIFO. 1103 */ 1104 extsize = jrec->jo->fifo.size >> 2; 1105 if (extsize > bytes) 1106 extsize = (bytes + 15) & ~15; 1107 1108 jrec->stream_ptr = journal_extend(jrec->jo, &jrec->rawp, 1109 jrec->stream_reserved - jrec->stream_residual, 1110 extsize, &pusheditout); 1111 if (pusheditout) { 1112 jrec->stream_reserved = extsize; 1113 jrec->stream_residual = extsize; 1114 jrec->parent = NULL; /* no longer accessible */ 1115 jrec->last = NULL; /* no longer accessible */ 1116 jrec->pushptrgood = 0; /* restored parents in pops no good */ 1117 } else { 1118 jrec->stream_reserved += extsize; 1119 jrec->stream_residual += extsize; 1120 } 1121 } 1122 1123 /* 1124 * Push out any remaining bytes into the current stream record. 1125 */ 1126 if (bytes) { 1127 switch (dtype) { 1128 case JDATA_KERN: 1129 bcopy(buf, jrec->stream_ptr, bytes); 1130 break; 1131 case JDATA_USER: 1132 copyin(buf, jrec->stream_ptr, bytes); 1133 break; 1134 case JDATA_XIO: 1135 xio_copy_xtok((xio_t)buf, xio_offset, jrec->stream_ptr, bytes); 1136 break; 1137 } 1138 jrec->stream_ptr += bytes; 1139 jrec->stream_residual -= bytes; 1140 jrec->residual -= bytes; 1141 } 1142 1143 /* 1144 * Handle data alignment requirements for the subrecord. 
Because the 1145 * stream record's data space is more strictly aligned, it must already 1146 * have sufficient space to hold any subrecord alignment slop. 1147 */ 1148 if (jrec->residual == 0 && jrec->residual_align) { 1149 KKASSERT(jrec->residual_align <= jrec->stream_residual); 1150 bzero(jrec->stream_ptr, jrec->residual_align); 1151 jrec->stream_ptr += jrec->residual_align; 1152 jrec->stream_residual -= jrec->residual_align; 1153 jrec->residual_align = 0; 1154 } 1155 } 1156 1157 /* 1158 * We are finished with the transaction. This closes the transaction created 1159 * by jrecord_init(). 1160 * 1161 * NOTE: If abortit is not set then we must be at the top level with no 1162 * residual subrecord data left to output. 1163 * 1164 * If abortit is set then we can be in any state, all pushes will be 1165 * popped and it is ok for there to be residual data. This works 1166 * because the virtual stream itself is truncated. Scanners must deal 1167 * with this situation. 1168 * 1169 * The stream record will be committed or aborted as specified and jrecord 1170 * resources will be cleaned up. 1171 */ 1172 void 1173 jrecord_done(struct jrecord *jrec, int abortit) 1174 { 1175 KKASSERT(jrec->rawp != NULL); 1176 1177 if (abortit) { 1178 journal_abort(jrec->jo, &jrec->rawp); 1179 } else { 1180 KKASSERT(jrec->pushcount == 0 && jrec->residual == 0); 1181 journal_commit(jrec->jo, &jrec->rawp, 1182 jrec->stream_reserved - jrec->stream_residual, 1); 1183 } 1184 1185 /* 1186 * jrec should not be used beyond this point without another init, 1187 * but clean up some fields to ensure that we panic if it is. 1188 * 1189 * Note that jrec->rawp is NULLd out by journal_abort/journal_commit. 
1190 */ 1191 jrec->jo = NULL; 1192 jrec->stream_ptr = NULL; 1193 } 1194 1195 /************************************************************************ 1196 * LOW LEVEL RECORD SUPPORT ROUTINES * 1197 ************************************************************************ 1198 * 1199 * These routine create low level recursive and leaf subrecords representing 1200 * common filesystem structures. 1201 */ 1202 1203 /* 1204 * Write out a filename path relative to the base of the mount point. 1205 * rectype is typically JLEAF_PATH{1,2,3,4}. 1206 */ 1207 void 1208 jrecord_write_path(struct jrecord *jrec, int16_t rectype, struct namecache *ncp) 1209 { 1210 char buf[64]; /* local buffer if it fits, else malloced */ 1211 char *base; 1212 int pathlen; 1213 int index; 1214 struct namecache *scan; 1215 1216 /* 1217 * Pass 1 - figure out the number of bytes required. Include terminating 1218 * \0 on last element and '/' separator on other elements. 1219 * 1220 * The namecache topology terminates at the root of the filesystem 1221 * (the normal lookup code would then continue by using the mount 1222 * structure to figure out what it was mounted on). 
1223 */ 1224 again: 1225 pathlen = 0; 1226 for (scan = ncp; scan; scan = scan->nc_parent) { 1227 if (scan->nc_nlen > 0) 1228 pathlen += scan->nc_nlen + 1; 1229 } 1230 1231 if (pathlen <= sizeof(buf)) 1232 base = buf; 1233 else 1234 base = kmalloc(pathlen, M_TEMP, M_INTWAIT); 1235 1236 /* 1237 * Pass 2 - generate the path buffer 1238 */ 1239 index = pathlen; 1240 for (scan = ncp; scan; scan = scan->nc_parent) { 1241 if (scan->nc_nlen == 0) 1242 continue; 1243 if (scan->nc_nlen >= index) { 1244 if (base != buf) 1245 kfree(base, M_TEMP); 1246 goto again; 1247 } 1248 if (index == pathlen) 1249 base[--index] = 0; 1250 else 1251 base[--index] = '/'; 1252 index -= scan->nc_nlen; 1253 bcopy(scan->nc_name, base + index, scan->nc_nlen); 1254 } 1255 jrecord_leaf(jrec, rectype, base + index, pathlen - index); 1256 if (base != buf) 1257 kfree(base, M_TEMP); 1258 } 1259 1260 /* 1261 * Write out a file attribute structure. While somewhat inefficient, using 1262 * a recursive data structure is the most portable and extensible way. 
1263 */ 1264 void 1265 jrecord_write_vattr(struct jrecord *jrec, struct vattr *vat) 1266 { 1267 void *save; 1268 1269 save = jrecord_push(jrec, JTYPE_VATTR); 1270 if (vat->va_type != VNON) 1271 jrecord_leaf(jrec, JLEAF_VTYPE, &vat->va_type, sizeof(vat->va_type)); 1272 if (vat->va_mode != (mode_t)VNOVAL) 1273 jrecord_leaf(jrec, JLEAF_MODES, &vat->va_mode, sizeof(vat->va_mode)); 1274 if (vat->va_nlink != VNOVAL) 1275 jrecord_leaf(jrec, JLEAF_NLINK, &vat->va_nlink, sizeof(vat->va_nlink)); 1276 if (vat->va_uid != VNOVAL) 1277 jrecord_leaf(jrec, JLEAF_UID, &vat->va_uid, sizeof(vat->va_uid)); 1278 if (vat->va_gid != VNOVAL) 1279 jrecord_leaf(jrec, JLEAF_GID, &vat->va_gid, sizeof(vat->va_gid)); 1280 if (vat->va_fsid != VNOVAL) 1281 jrecord_leaf(jrec, JLEAF_FSID, &vat->va_fsid, sizeof(vat->va_fsid)); 1282 if (vat->va_fileid != VNOVAL) 1283 jrecord_leaf(jrec, JLEAF_INUM, &vat->va_fileid, sizeof(vat->va_fileid)); 1284 if (vat->va_size != VNOVAL) 1285 jrecord_leaf(jrec, JLEAF_SIZE, &vat->va_size, sizeof(vat->va_size)); 1286 if (vat->va_atime.tv_sec != VNOVAL) 1287 jrecord_leaf(jrec, JLEAF_ATIME, &vat->va_atime, sizeof(vat->va_atime)); 1288 if (vat->va_mtime.tv_sec != VNOVAL) 1289 jrecord_leaf(jrec, JLEAF_MTIME, &vat->va_mtime, sizeof(vat->va_mtime)); 1290 if (vat->va_ctime.tv_sec != VNOVAL) 1291 jrecord_leaf(jrec, JLEAF_CTIME, &vat->va_ctime, sizeof(vat->va_ctime)); 1292 if (vat->va_gen != VNOVAL) 1293 jrecord_leaf(jrec, JLEAF_GEN, &vat->va_gen, sizeof(vat->va_gen)); 1294 if (vat->va_flags != VNOVAL) 1295 jrecord_leaf(jrec, JLEAF_FLAGS, &vat->va_flags, sizeof(vat->va_flags)); 1296 if (vat->va_rmajor != VNOVAL) { 1297 udev_t rdev = makeudev(vat->va_rmajor, vat->va_rminor); 1298 jrecord_leaf(jrec, JLEAF_UDEV, &rdev, sizeof(rdev)); 1299 jrecord_leaf(jrec, JLEAF_UMAJOR, &vat->va_rmajor, sizeof(vat->va_rmajor)); 1300 jrecord_leaf(jrec, JLEAF_UMINOR, &vat->va_rminor, sizeof(vat->va_rminor)); 1301 } 1302 #if 0 1303 if (vat->va_filerev != VNOVAL) 1304 jrecord_leaf(jrec, 
JLEAF_FILEREV, &vat->va_filerev, sizeof(vat->va_filerev)); 1305 #endif 1306 jrecord_pop(jrec, save); 1307 } 1308 1309 /* 1310 * Write out the creds used to issue a file operation. If a process is 1311 * available write out additional tracking information related to the 1312 * process. 1313 * 1314 * XXX additional tracking info 1315 * XXX tty line info 1316 */ 1317 void 1318 jrecord_write_cred(struct jrecord *jrec, struct thread *td, struct ucred *cred) 1319 { 1320 void *save; 1321 struct proc *p; 1322 1323 save = jrecord_push(jrec, JTYPE_CRED); 1324 jrecord_leaf(jrec, JLEAF_UID, &cred->cr_uid, sizeof(cred->cr_uid)); 1325 jrecord_leaf(jrec, JLEAF_GID, &cred->cr_gid, sizeof(cred->cr_gid)); 1326 if (td && (p = td->td_proc) != NULL) { 1327 jrecord_leaf(jrec, JLEAF_PID, &p->p_pid, sizeof(p->p_pid)); 1328 jrecord_leaf(jrec, JLEAF_COMM, p->p_comm, sizeof(p->p_comm)); 1329 } 1330 jrecord_pop(jrec, save); 1331 } 1332 1333 /* 1334 * Write out information required to identify a vnode 1335 * 1336 * XXX this needs work. We should write out the inode number as well, 1337 * and in fact avoid writing out the file path for seqential writes 1338 * occuring within e.g. a certain period of time. 
1339 */ 1340 void 1341 jrecord_write_vnode_ref(struct jrecord *jrec, struct vnode *vp) 1342 { 1343 struct nchandle nch; 1344 1345 nch.mount = vp->v_mount; 1346 spin_lock_wr(&vp->v_spinlock); 1347 TAILQ_FOREACH(nch.ncp, &vp->v_namecache, nc_vnode) { 1348 if ((nch.ncp->nc_flag & (NCF_UNRESOLVED|NCF_DESTROYED)) == 0) 1349 break; 1350 } 1351 if (nch.ncp) { 1352 cache_hold(&nch); 1353 spin_unlock_wr(&vp->v_spinlock); 1354 jrecord_write_path(jrec, JLEAF_PATH_REF, nch.ncp); 1355 cache_drop(&nch); 1356 } else { 1357 spin_unlock_wr(&vp->v_spinlock); 1358 } 1359 } 1360 1361 void 1362 jrecord_write_vnode_link(struct jrecord *jrec, struct vnode *vp, 1363 struct namecache *notncp) 1364 { 1365 struct nchandle nch; 1366 1367 nch.mount = vp->v_mount; 1368 spin_lock_wr(&vp->v_spinlock); 1369 TAILQ_FOREACH(nch.ncp, &vp->v_namecache, nc_vnode) { 1370 if (nch.ncp == notncp) 1371 continue; 1372 if ((nch.ncp->nc_flag & (NCF_UNRESOLVED|NCF_DESTROYED)) == 0) 1373 break; 1374 } 1375 if (nch.ncp) { 1376 cache_hold(&nch); 1377 spin_unlock_wr(&vp->v_spinlock); 1378 jrecord_write_path(jrec, JLEAF_PATH_REF, nch.ncp); 1379 cache_drop(&nch); 1380 } else { 1381 spin_unlock_wr(&vp->v_spinlock); 1382 } 1383 } 1384 1385 /* 1386 * Write out the data represented by a pagelist 1387 */ 1388 void 1389 jrecord_write_pagelist(struct jrecord *jrec, int16_t rectype, 1390 struct vm_page **pglist, int *rtvals, int pgcount, 1391 off_t offset) 1392 { 1393 struct xio xio; 1394 int error; 1395 int b; 1396 int i; 1397 1398 i = 0; 1399 xio_init(&xio); 1400 while (i < pgcount) { 1401 /* 1402 * Find the next valid section. Skip any invalid elements 1403 */ 1404 if (rtvals[i] != VM_PAGER_OK) { 1405 ++i; 1406 offset += PAGE_SIZE; 1407 continue; 1408 } 1409 1410 /* 1411 * Figure out how big the valid section is, capping I/O at what the 1412 * MSFBUF can represent. 
1413 */ 1414 b = i; 1415 while (i < pgcount && i - b != XIO_INTERNAL_PAGES && 1416 rtvals[i] == VM_PAGER_OK 1417 ) { 1418 ++i; 1419 } 1420 1421 /* 1422 * And write it out. 1423 */ 1424 if (i - b) { 1425 error = xio_init_pages(&xio, pglist + b, i - b, XIOF_READ); 1426 if (error == 0) { 1427 jrecord_leaf(jrec, JLEAF_SEEKPOS, &offset, sizeof(offset)); 1428 jrecord_leaf_xio(jrec, rectype, &xio); 1429 } else { 1430 kprintf("jrecord_write_pagelist: xio init failure\n"); 1431 } 1432 xio_release(&xio); 1433 offset += (off_t)(i - b) << PAGE_SHIFT; 1434 } 1435 } 1436 } 1437 1438 /* 1439 * Write out the data represented by a UIO. 1440 */ 1441 void 1442 jrecord_write_uio(struct jrecord *jrec, int16_t rectype, struct uio *uio) 1443 { 1444 if (uio->uio_segflg != UIO_NOCOPY) { 1445 jrecord_leaf(jrec, JLEAF_SEEKPOS, &uio->uio_offset, 1446 sizeof(uio->uio_offset)); 1447 jrecord_leaf_uio(jrec, rectype, uio); 1448 } 1449 } 1450 1451 void 1452 jrecord_file_data(struct jrecord *jrec, struct vnode *vp, 1453 off_t off, off_t bytes) 1454 { 1455 const int bufsize = 8192; 1456 char *buf; 1457 int error; 1458 int n; 1459 1460 buf = kmalloc(bufsize, M_JOURNAL, M_WAITOK); 1461 jrecord_leaf(jrec, JLEAF_SEEKPOS, &off, sizeof(off)); 1462 while (bytes) { 1463 n = (bytes > bufsize) ? bufsize : (int)bytes; 1464 error = vn_rdwr(UIO_READ, vp, buf, n, off, UIO_SYSSPACE, IO_NODELOCKED, 1465 proc0.p_ucred, NULL); 1466 if (error) { 1467 jrecord_leaf(jrec, JLEAF_ERROR, &error, sizeof(error)); 1468 break; 1469 } 1470 jrecord_leaf(jrec, JLEAF_FILEDATA, buf, n); 1471 bytes -= n; 1472 off += n; 1473 } 1474 kfree(buf, M_JOURNAL); 1475 } 1476 1477