1 /* $NetBSD: offtab.c,v 1.13 2014/01/25 16:38:15 riastradh Exp $ */ 2 3 /*- 4 * Copyright (c) 2014 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Taylor R. Campbell. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 __RCSID("$NetBSD: offtab.c,v 1.13 2014/01/25 16:38:15 riastradh Exp $"); 34 35 #include <sys/types.h> 36 #include <sys/endian.h> 37 38 #include <assert.h> 39 #include <err.h> 40 #include <errno.h> 41 #include <inttypes.h> 42 #include <limits.h> 43 #include <stdbool.h> 44 #include <stdlib.h> 45 #include <unistd.h> 46 47 #include "common.h" 48 #include "utils.h" 49 50 #include "offtab.h" 51 52 static void __printflike(1,2) __dead 53 offtab_bug(const char *fmt, ...) 54 { 55 56 errx(1, "bug in offtab, please report"); 57 } 58 59 static void __printflike(1,2) __dead 60 offtab_bugx(const char *fmt, ...) 61 { 62 63 errx(1, "bug in offtab, please report"); 64 } 65 66 static uint32_t 67 offtab_compute_window_size(struct offtab *offtab, uint32_t start) 68 { 69 70 assert(start < offtab->ot_n_offsets); 71 return MIN(offtab->ot_window_size, (offtab->ot_n_offsets - start)); 72 } 73 74 static uint32_t 75 offtab_current_window_size(struct offtab *offtab) 76 { 77 78 return offtab_compute_window_size(offtab, offtab->ot_window_start); 79 } 80 81 static uint32_t 82 offtab_current_window_end(struct offtab *offtab) 83 { 84 85 assert(offtab->ot_window_start < offtab->ot_n_offsets); 86 assert(offtab_current_window_size(offtab) <= 87 (offtab->ot_n_offsets - offtab->ot_window_start)); 88 return (offtab->ot_window_start + offtab_current_window_size(offtab)); 89 } 90 91 static void 92 offtab_compute_window_position(struct offtab *offtab, uint32_t window_start, 93 size_t *bytes, off_t *pos) 94 { 95 const uint32_t window_size = offtab_compute_window_size(offtab, 96 window_start); 97 98 __CTASSERT(MAX_WINDOW_SIZE <= (SIZE_MAX / sizeof(uint64_t))); 99 *bytes = (window_size * sizeof(uint64_t)); 100 101 assert(window_start <= offtab->ot_n_offsets); 102 __CTASSERT(MAX_N_OFFSETS <= (OFF_MAX / sizeof(uint64_t))); 103 const off_t window_offset = ((off_t)window_start * 104 (off_t)sizeof(uint64_t)); 105 106 /* XXX This assertion is not justified. */ 107 assert(offtab->ot_fdpos <= (OFF_MAX - window_offset)); 108 *pos = (offtab->ot_fdpos + window_offset); 109 } 110 111 #define OFFTAB_READ_SEEK 0x01 112 #define OFFTAB_READ_NOSEEK 0x00 113 114 static bool 115 offtab_read_window(struct offtab *offtab, uint32_t blkno, int read_flags) 116 { 117 const uint32_t window_start = rounddown(blkno, offtab->ot_window_size); 118 size_t window_bytes; 119 off_t window_pos; 120 121 assert(offtab->ot_mode == OFFTAB_MODE_READ); 122 assert(ISSET(read_flags, OFFTAB_READ_SEEK) || 123 (lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos) || 124 ((lseek(offtab->ot_fd, 0, SEEK_CUR) == -1) && (errno == ESPIPE))); 125 126 offtab_compute_window_position(offtab, window_start, 127 &window_bytes, &window_pos); 128 const ssize_t n_read = (ISSET(read_flags, OFFTAB_READ_SEEK) 129 ? pread_block(offtab->ot_fd, offtab->ot_window, window_bytes, 130 window_pos) 131 : read_block(offtab->ot_fd, offtab->ot_window, window_bytes)); 132 if (n_read == -1) { 133 (*offtab->ot_report)("read offset table at %"PRIuMAX, 134 (uintmax_t)window_pos); 135 return false; 136 } 137 assert(n_read >= 0); 138 if ((size_t)n_read != window_bytes) { 139 (*offtab->ot_reportx)("partial read of offset table" 140 " at %"PRIuMAX": %zu != %zu", 141 (uintmax_t)window_pos, (size_t)n_read, window_bytes); 142 return false; 143 } 144 145 offtab->ot_window_start = window_start; 146 147 return true; 148 } 149 150 static bool 151 offtab_maybe_read_window(struct offtab *offtab, uint32_t blkno, int read_flags) 152 { 153 154 /* Don't bother if blkno is already in the window. */ 155 if ((offtab->ot_window_start <= blkno) && 156 (blkno < offtab_current_window_end(offtab))) 157 return true; 158 159 if (!offtab_read_window(offtab, blkno, read_flags)) 160 return false; 161 162 return true; 163 } 164 165 static void 166 offtab_write_window(struct offtab *offtab) 167 { 168 size_t window_bytes; 169 off_t window_pos; 170 171 assert(offtab->ot_mode == OFFTAB_MODE_WRITE); 172 173 offtab_compute_window_position(offtab, offtab->ot_window_start, 174 &window_bytes, &window_pos); 175 const ssize_t n_written = pwrite(offtab->ot_fd, offtab->ot_window, 176 window_bytes, window_pos); 177 if (n_written == -1) 178 err_ss(1, "write initial offset table"); 179 assert(n_written >= 0); 180 if ((size_t)n_written != window_bytes) 181 errx_ss(1, "partial write of initial offset bytes: %zu <= %zu", 182 (size_t)n_written, 183 window_bytes); 184 } 185 186 static void 187 offtab_maybe_write_window(struct offtab *offtab, uint32_t start, uint32_t end) 188 { 189 190 /* Don't bother if [start, end) does not cover our window. */ 191 if (end <= offtab->ot_window_start) 192 return; 193 if (offtab_current_window_end(offtab) < start) 194 return; 195 196 offtab_write_window(offtab); 197 } 198 199 /* 200 * Initialize an offtab to support the specified number of offsets read 201 * to or written from fd at byte position fdpos. 202 */ 203 void 204 offtab_init(struct offtab *offtab, uint32_t n_offsets, uint32_t window_size, 205 int fd, off_t fdpos) 206 { 207 208 assert(offtab != NULL); 209 assert(0 < n_offsets); 210 assert(0 <= fd); 211 assert(0 <= fdpos); 212 213 offtab->ot_n_offsets = n_offsets; 214 if ((window_size == 0) || (n_offsets < window_size)) 215 offtab->ot_window_size = n_offsets; 216 else 217 offtab->ot_window_size = window_size; 218 assert(offtab->ot_window_size <= offtab->ot_n_offsets); 219 offtab->ot_window_start = (uint32_t)-1; 220 __CTASSERT(MAX_WINDOW_SIZE <= (SIZE_MAX / sizeof(uint64_t))); 221 offtab->ot_window = malloc(offtab->ot_window_size * sizeof(uint64_t)); 222 if (offtab->ot_window == NULL) 223 err(1, "malloc offset table"); 224 offtab->ot_blkno = (uint32_t)-1; 225 offtab->ot_fd = fd; 226 offtab->ot_fdpos = fdpos; 227 offtab->ot_report = &offtab_bug; 228 offtab->ot_reportx = &offtab_bugx; 229 offtab->ot_mode = OFFTAB_MODE_NONE; 230 } 231 232 /* 233 * Destroy an offtab. 234 */ 235 void 236 offtab_destroy(struct offtab *offtab) 237 { 238 239 free(offtab->ot_window); 240 } 241 242 /* 243 * For an offtab that has been used to read data from disk, convert it 244 * to an offtab that can be used to write subsequent data to disk. 245 * blkno is the last valid blkno read from disk. 246 */ 247 bool 248 offtab_transmogrify_read_to_write(struct offtab *offtab, uint32_t blkno) 249 { 250 251 assert(offtab->ot_mode == OFFTAB_MODE_READ); 252 assert(0 < blkno); 253 254 if (!offtab_maybe_read_window(offtab, blkno, OFFTAB_READ_SEEK)) 255 return false; 256 257 offtab->ot_mode = OFFTAB_MODE_WRITE; 258 offtab->ot_blkno = blkno; 259 260 return true; 261 } 262 263 /* 264 * Reset an offtab for reading an offset table from the beginning. 265 * Initializes in-memory state and may read data from offtab->ot_fd, 266 * which must currently be at byte position offtab->ot_fdpos. Failure 267 * will be reported by the report/reportx routines, which are called 268 * like warn/warnx. May fail; returns true on success, false on 269 * failure. 270 * 271 * This almost has copypasta of offtab_prepare_get, but this uses read, 272 * rather than pread, so that it will work on nonseekable input if the 273 * window is the whole offset table. 274 */ 275 bool 276 offtab_reset_read(struct offtab *offtab, 277 void (*report)(const char *, ...) __printflike(1,2), 278 void (*reportx)(const char *, ...) __printflike(1,2)) 279 { 280 281 assert((lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos) || 282 ((lseek(offtab->ot_fd, 0, SEEK_CUR) == -1) && (errno == ESPIPE))); 283 284 offtab->ot_report = report; 285 offtab->ot_reportx = reportx; 286 offtab->ot_mode = OFFTAB_MODE_READ; 287 offtab->ot_blkno = (uint32_t)-1; 288 289 if (!offtab_read_window(offtab, 0, OFFTAB_READ_NOSEEK)) 290 return false; 291 292 if (offtab->ot_window_size < offtab->ot_n_offsets) { 293 __CTASSERT(MAX_N_OFFSETS <= (OFF_MAX / sizeof(uint64_t))); 294 const off_t offtab_bytes = ((off_t)offtab->ot_n_offsets * 295 (off_t)sizeof(uint64_t)); 296 assert(offtab->ot_fdpos <= (OFF_MAX - offtab_bytes)); 297 const off_t first_offset = (offtab->ot_fdpos + offtab_bytes); 298 if (lseek(offtab->ot_fd, first_offset, SEEK_SET) == -1) { 299 (*offtab->ot_report)("lseek to first offset 0x%"PRIx64, 300 first_offset); 301 return false; 302 } 303 } 304 305 return true; 306 } 307 308 /* 309 * Do any I/O or bookkeeping necessary to fetch the offset for blkno in 310 * preparation for a call to offtab_get. May fail; returns true on 311 * success, false on failure. 312 */ 313 bool 314 offtab_prepare_get(struct offtab *offtab, uint32_t blkno) 315 { 316 317 assert(offtab->ot_mode == OFFTAB_MODE_READ); 318 assert(blkno < offtab->ot_n_offsets); 319 320 if (!offtab_maybe_read_window(offtab, blkno, OFFTAB_READ_SEEK)) 321 return false; 322 323 assert(offtab->ot_window_start <= blkno); 324 assert(blkno < offtab_current_window_end(offtab)); 325 326 offtab->ot_blkno = blkno; 327 return true; 328 } 329 330 /* 331 * Return the offset for blkno. Caller must have called 332 * offtab_prepare_get beforehand. 333 */ 334 uint64_t 335 offtab_get(struct offtab *offtab, uint32_t blkno) 336 { 337 338 assert(offtab->ot_mode == OFFTAB_MODE_READ); 339 assert(blkno == offtab->ot_blkno); 340 assert(offtab->ot_window_start <= blkno); 341 assert(blkno < offtab_current_window_end(offtab)); 342 343 return be64toh(offtab->ot_window[blkno - offtab->ot_window_start]); 344 } 345 346 /* 347 * Reset offtab for writing a fresh offset table. Initializes 348 * in-memory state and writes an empty offset table to offtab->ot_fd, 349 * which must currently be at byte position offtab->ot_fdpos. May 350 * fail; returns on success, aborts with err(3) on failure. 351 */ 352 void 353 offtab_reset_write(struct offtab *offtab) 354 { 355 uint32_t i; 356 357 assert(lseek(offtab->ot_fd, 0, SEEK_CUR) == offtab->ot_fdpos); 358 359 offtab->ot_mode = OFFTAB_MODE_WRITE; 360 offtab->ot_blkno = (uint32_t)-1; 361 362 /* 363 * Initialize the offset table to all ones (except for the 364 * fixed first offset) so that we can easily detect where we 365 * were interrupted if we want to restart. 366 */ 367 __CTASSERT(MAX_N_OFFSETS <= UINT32_MAX); 368 assert(offtab->ot_n_offsets > 0); 369 370 for (i = 0; i < offtab->ot_window_size; i++) 371 offtab->ot_window[i] = ~(uint64_t)0; 372 373 const uint32_t n_windows = 374 howmany(offtab->ot_n_offsets, offtab->ot_window_size); 375 for (i = 1; i < n_windows; i++) { 376 /* Change the start but reuse the all-ones buffer. */ 377 offtab->ot_window_start = (i * offtab->ot_window_size); 378 offtab_write_window(offtab); 379 } 380 381 offtab->ot_window_start = 0; 382 __CTASSERT(MAX_N_OFFSETS <= 383 (MIN(OFF_MAX, UINT64_MAX) / sizeof(uint64_t))); 384 const off_t offtab_bytes = ((off_t)offtab->ot_n_offsets * 385 sizeof(uint64_t)); 386 assert(offtab->ot_fdpos <= 387 ((off_t)MIN(OFF_MAX, UINT64_MAX) - offtab_bytes)); 388 const off_t first_offset = (offtab->ot_fdpos + offtab_bytes); 389 assert(first_offset <= (off_t)MIN(OFF_MAX, UINT64_MAX)); 390 offtab->ot_window[0] = htobe64((uint64_t)first_offset); 391 offtab_write_window(offtab); 392 393 if (lseek(offtab->ot_fd, first_offset, SEEK_SET) == -1) 394 err(1, "lseek to first offset failed"); 395 } 396 397 /* 398 * Guarantee that the disk reflects block offsets [0, n_offsets). If 399 * OFFTAB_CHECKPOINT_SYNC is set in flags, will also fsync the entire 400 * offset table. May fail; returns on success, aborts with err(3) on 401 * failure. Fsync failure is considered success but is reported with a 402 * warning. 403 * 404 * This routine does not write state in memory, and does not read state 405 * that is not signal-safe. The only state read is offtab->ot_window, 406 * offtab->ot_window_start, and quantities that are static for the 407 * signal-interruptable existence of the offset table. 408 */ 409 void 410 offtab_checkpoint(struct offtab *offtab, uint32_t n_offsets, int flags) 411 { 412 413 assert(offtab->ot_mode == OFFTAB_MODE_WRITE); 414 assert(n_offsets <= offtab->ot_n_offsets); 415 416 /* 417 * Write the window unless we just did that and were 418 * interrupted before we could move the window. 419 */ 420 if (offtab->ot_window != NULL) 421 offtab_maybe_write_window(offtab, 0, n_offsets); 422 423 if (ISSET(flags, OFFTAB_CHECKPOINT_SYNC)) { 424 __CTASSERT(MAX_N_OFFSETS <= (OFF_MAX / sizeof(uint64_t))); 425 const off_t sync_bytes = ((off_t)n_offsets * 426 (off_t)sizeof(uint64_t)); 427 assert(offtab->ot_fdpos <= (OFF_MAX - sync_bytes)); 428 if (fsync_range(offtab->ot_fd, (FFILESYNC | FDISKSYNC), 429 offtab->ot_fdpos, (offtab->ot_fdpos + sync_bytes)) 430 == -1) 431 warn_ss("fsync of offset table failed"); 432 } 433 } 434 435 /* 436 * Do any I/O or bookkeeping necessary to set an offset for blkno. May 437 * fail; returns on success, aborts with err(3) on failure. 438 */ 439 void 440 offtab_prepare_put(struct offtab *offtab, uint32_t blkno) 441 { 442 uint32_t i; 443 444 assert(offtab->ot_mode == OFFTAB_MODE_WRITE); 445 assert(blkno < offtab->ot_n_offsets); 446 447 /* 448 * Assume, for convenience, that we write blocks in order. 449 * Thus we need not do another read -- we can just clear the 450 * window. 451 */ 452 assert((offtab->ot_blkno == (uint32_t)-1) || 453 ((offtab->ot_blkno + 1) == blkno)); 454 455 /* If it's already in our window, we're good to go. */ 456 if ((offtab->ot_window_start <= blkno) && 457 (blkno < offtab_current_window_end(offtab))) 458 goto win; 459 460 /* Otherwise, write out the current window and choose a new one. */ 461 offtab_write_window(offtab); 462 463 assert(offtab->ot_window_size <= blkno); 464 assert(offtab->ot_window_start == (blkno - offtab->ot_window_size)); 465 assert((offtab->ot_window_start + offtab->ot_window_size) == 466 rounddown(blkno, offtab->ot_window_size)); 467 468 { 469 uint64_t *window; 470 sigset_t sigmask; 471 472 /* 473 * Mark the window as being updated so nobody tries to write it 474 * (since we just wrote it) while we fill it with ones. 475 */ 476 block_signals(&sigmask); 477 window = offtab->ot_window; 478 offtab->ot_window = NULL; 479 restore_sigmask(&sigmask); 480 481 /* Fill the window with ones. */ 482 for (i = 0; i < offtab_current_window_size(offtab); i++) 483 window[i] = ~(uint64_t)0; 484 485 /* Restore the window as ready again. */ 486 block_signals(&sigmask); 487 offtab->ot_window = window; 488 offtab->ot_window_start = rounddown(blkno, offtab->ot_window_size); 489 restore_sigmask(&sigmask); 490 } 491 492 win: assert(offtab->ot_window_start <= blkno); 493 assert(blkno < offtab_current_window_end(offtab)); 494 495 offtab->ot_blkno = blkno; 496 } 497 498 /* 499 * Actually set the offset for blkno. 500 */ 501 void 502 offtab_put(struct offtab *offtab, uint32_t blkno, uint64_t offset) 503 { 504 505 assert(offtab->ot_mode == OFFTAB_MODE_WRITE); 506 assert(blkno == offtab->ot_blkno); 507 assert(offtab->ot_window_start <= blkno); 508 assert(blkno < offtab_current_window_end(offtab)); 509 510 offtab->ot_window[blkno - offtab->ot_window_start] = htobe64(offset); 511 } 512