1 /* $NetBSD: functions.c,v 1.3 2010/12/26 14:48:34 christos Exp $ */ 2 3 /* 4 * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. 5 * 6 * This copyrighted material is made available to anyone wishing to use, 7 * modify, copy, or redistribute it subject to the terms and conditions 8 * of the GNU Lesser General Public License v.2.1. 9 * 10 * You should have received a copy of the GNU Lesser General Public License 11 * along with this program; if not, write to the Free Software Foundation, 12 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 13 */ 14 #define _GNU_SOURCE 15 #define _FILE_OFFSET_BITS 64 16 17 #include <stdint.h> 18 #include <errno.h> 19 #include <string.h> 20 #include <sys/types.h> 21 #include <sys/stat.h> 22 #include <dirent.h> 23 #include <unistd.h> 24 #include <signal.h> 25 #include <linux/kdev_t.h> 26 //#define __USE_GNU /* for O_DIRECT */ 27 #include <fcntl.h> 28 #include <time.h> 29 #include "libdevmapper.h" 30 #include "dm-log-userspace.h" 31 #include "functions.h" 32 #include "common.h" 33 #include "cluster.h" 34 #include "logging.h" 35 36 #define BYTE_SHIFT 3 37 38 /* 39 * Magic for persistent mirrors: "MiRr" 40 * Following on-disk header information is stolen from 41 * drivers/md/dm-log.c 42 */ 43 #define MIRROR_MAGIC 0x4D695272 44 #define MIRROR_DISK_VERSION 2 45 #define LOG_OFFSET 2 46 47 #define RESYNC_HISTORY 50 48 //static char resync_history[RESYNC_HISTORY][128]; 49 //static int idx = 0; 50 #define LOG_SPRINT(_lc, f, arg...) do { \ 51 lc->idx++; \ 52 lc->idx = lc->idx % RESYNC_HISTORY; \ 53 sprintf(lc->resync_history[lc->idx], f, ## arg); \ 54 } while (0) 55 56 struct log_header { 57 uint32_t magic; 58 uint32_t version; 59 uint64_t nr_regions; 60 }; 61 62 struct log_c { 63 struct dm_list list; 64 65 char uuid[DM_UUID_LEN]; 66 uint64_t luid; 67 68 time_t delay; /* limits how fast a resume can happen after suspend */ 69 int touched; 70 uint32_t region_size; 71 uint32_t region_count; 72 uint64_t sync_count; 73 74 dm_bitset_t clean_bits; 75 dm_bitset_t sync_bits; 76 uint32_t recoverer; 77 uint64_t recovering_region; /* -1 means not recovering */ 78 uint64_t skip_bit_warning; /* used to warn if region skipped */ 79 int sync_search; 80 81 int resume_override; 82 83 uint32_t block_on_error; 84 enum sync { 85 DEFAULTSYNC, /* Synchronize if necessary */ 86 NOSYNC, /* Devices known to be already in sync */ 87 FORCESYNC, /* Force a sync to happen */ 88 } sync; 89 90 uint32_t state; /* current operational state of the log */ 91 92 struct dm_list mark_list; 93 94 uint32_t recovery_halted; 95 struct recovery_request *recovery_request_list; 96 97 int disk_fd; /* -1 means no disk log */ 98 int log_dev_failed; 99 uint64_t disk_nr_regions; 100 size_t disk_size; /* size of disk_buffer in bytes */ 101 void *disk_buffer; /* aligned memory for O_DIRECT */ 102 int idx; 103 char resync_history[RESYNC_HISTORY][128]; 104 }; 105 106 struct mark_entry { 107 struct dm_list list; 108 uint32_t nodeid; 109 uint64_t region; 110 }; 111 112 struct recovery_request { 113 uint64_t region; 114 struct recovery_request *next; 115 }; 116 117 static DM_LIST_INIT(log_list); 118 static DM_LIST_INIT(log_pending_list); 119 120 static int log_test_bit(dm_bitset_t bs, int bit) 121 { 122 return dm_bit(bs, bit); 123 } 124 125 static void log_set_bit(struct log_c *lc, dm_bitset_t bs, int bit) 126 { 127 dm_bit_set(bs, bit); 128 lc->touched = 1; 129 } 130 131 static void log_clear_bit(struct log_c *lc, dm_bitset_t bs, int bit) 132 { 133 dm_bit_clear(bs, bit); 134 lc->touched = 1; 135 } 136 137 static int find_next_zero_bit(dm_bitset_t bs, int start) 138 { 139 while (dm_bit(bs, start++)) 140 if (start >= (int)bs[0]) 141 return -1; 142 143 return start - 1; 144 } 145 146 static uint64_t count_bits32(dm_bitset_t bs) 147 { 148 int i, size = ((int)bs[0]/DM_BITS_PER_INT + 1); 149 unsigned count = 0; 150 151 for (i = 1; i <= size; i++) 152 count += hweight32(bs[i]); 153 154 return (uint64_t)count; 155 } 156 157 /* 158 * get_log 159 * 160 * Returns: log if found, NULL otherwise 161 */ 162 static struct log_c *get_log(const char *uuid, uint64_t luid) 163 { 164 struct log_c *lc; 165 166 dm_list_iterate_items(lc, &log_list) 167 if (!strcmp(lc->uuid, uuid) && 168 (!luid || (luid == lc->luid))) 169 return lc; 170 171 return NULL; 172 } 173 174 /* 175 * get_pending_log 176 * 177 * Pending logs are logs that have been 'clog_ctr'ed, but 178 * have not joined the CPG (via clog_resume). 179 * 180 * Returns: log if found, NULL otherwise 181 */ 182 static struct log_c *get_pending_log(const char *uuid, uint64_t luid) 183 { 184 struct log_c *lc; 185 186 dm_list_iterate_items(lc, &log_pending_list) 187 if (!strcmp(lc->uuid, uuid) && 188 (!luid || (luid == lc->luid))) 189 return lc; 190 191 return NULL; 192 } 193 194 static void header_to_disk(struct log_header *mem, struct log_header *disk) 195 { 196 memcpy(disk, mem, sizeof(struct log_header)); 197 } 198 199 static void header_from_disk(struct log_header *mem, struct log_header *disk) 200 { 201 memcpy(mem, disk, sizeof(struct log_header)); 202 } 203 204 static int rw_log(struct log_c *lc, int do_write) 205 { 206 int r; 207 208 r = lseek(lc->disk_fd, 0, SEEK_SET); 209 if (r < 0) { 210 LOG_ERROR("[%s] rw_log: lseek failure: %s", 211 SHORT_UUID(lc->uuid), strerror(errno)); 212 return -errno; 213 } 214 215 if (do_write) { 216 r = write(lc->disk_fd, lc->disk_buffer, lc->disk_size); 217 if (r < 0) { 218 LOG_ERROR("[%s] rw_log: write failure: %s", 219 SHORT_UUID(lc->uuid), strerror(errno)); 220 return -EIO; /* Failed disk write */ 221 } 222 return 0; 223 } 224 225 /* Read */ 226 r = read(lc->disk_fd, lc->disk_buffer, lc->disk_size); 227 if (r < 0) 228 LOG_ERROR("[%s] rw_log: read failure: %s", 229 SHORT_UUID(lc->uuid), strerror(errno)); 230 if (r != lc->disk_size) 231 return -EIO; /* Failed disk read */ 232 return 0; 233 } 234 235 /* 236 * read_log 237 * @lc 238 * 239 * Valid return codes: 240 * -EINVAL: Invalid header, bits not copied 241 * -EIO: Unable to read disk log 242 * 0: Valid header, disk bit -> lc->clean_bits 243 * 244 * Returns: 0 on success, -EXXX on failure 245 */ 246 static int read_log(struct log_c *lc) 247 { 248 struct log_header lh; 249 size_t bitset_size; 250 251 memset(&lh, 0, sizeof(struct log_header)); 252 253 if (rw_log(lc, 0)) 254 return -EIO; /* Failed disk read */ 255 256 header_from_disk(&lh, lc->disk_buffer); 257 if (lh.magic != MIRROR_MAGIC) 258 return -EINVAL; 259 260 lc->disk_nr_regions = lh.nr_regions; 261 262 /* Read disk bits into sync_bits */ 263 bitset_size = lc->region_count / 8; 264 bitset_size += (lc->region_count % 8) ? 1 : 0; 265 memcpy(lc->clean_bits, lc->disk_buffer + 1024, bitset_size); 266 267 return 0; 268 } 269 270 /* 271 * write_log 272 * @lc 273 * 274 * Returns: 0 on success, -EIO on failure 275 */ 276 static int write_log(struct log_c *lc) 277 { 278 struct log_header lh; 279 size_t bitset_size; 280 281 lh.magic = MIRROR_MAGIC; 282 lh.version = MIRROR_DISK_VERSION; 283 lh.nr_regions = lc->region_count; 284 285 header_to_disk(&lh, lc->disk_buffer); 286 287 /* Write disk bits from clean_bits */ 288 bitset_size = lc->region_count / 8; 289 bitset_size += (lc->region_count % 8) ? 1 : 0; 290 memcpy(lc->disk_buffer + 1024, lc->clean_bits, bitset_size); 291 292 if (rw_log(lc, 1)) { 293 lc->log_dev_failed = 1; 294 return -EIO; /* Failed disk write */ 295 } 296 return 0; 297 } 298 299 static int find_disk_path(char *major_minor_str, char *path_rtn, int *unlink_path) 300 { 301 int r; 302 DIR *dp; 303 struct dirent *dep; 304 struct stat statbuf; 305 int major, minor; 306 mode_t old_umask; 307 308 if (!strstr(major_minor_str, ":")) { 309 r = stat(major_minor_str, &statbuf); 310 if (r) 311 return -errno; 312 if (!S_ISBLK(statbuf.st_mode)) 313 return -EINVAL; 314 sprintf(path_rtn, "%s", major_minor_str); 315 return 0; 316 } 317 318 r = sscanf(major_minor_str, "%d:%d", &major, &minor); 319 if (r != 2) 320 return -EINVAL; 321 322 LOG_DBG("Checking /dev/mapper for device %d:%d", major, minor); 323 /* Check /dev/mapper dir */ 324 dp = opendir("/dev/mapper"); 325 if (!dp) 326 return -ENOENT; 327 328 while ((dep = readdir(dp)) != NULL) { 329 /* 330 * FIXME: This is racy. By the time the path is used, 331 * it may point to something else. 'fstat' will be 332 * required upon opening to ensure we got what we 333 * wanted. 334 */ 335 336 sprintf(path_rtn, "/dev/mapper/%s", dep->d_name); 337 stat(path_rtn, &statbuf); 338 if (S_ISBLK(statbuf.st_mode) && 339 (major(statbuf.st_rdev) == major) && 340 (minor(statbuf.st_rdev) == minor)) { 341 LOG_DBG(" %s: YES", dep->d_name); 342 closedir(dp); 343 return 0; 344 } else { 345 LOG_DBG(" %s: NO", dep->d_name); 346 } 347 } 348 349 closedir(dp); 350 351 LOG_DBG("Path not found for %d/%d", major, minor); 352 LOG_DBG("Creating /dev/mapper/%d-%d", major, minor); 353 sprintf(path_rtn, "/dev/mapper/%d-%d", major, minor); 354 old_umask = umask(0); 355 r = mknod(path_rtn, S_IFBLK | DM_DEVICE_MODE, MKDEV(major, minor)); 356 umask(old_umask); 357 358 if (r != -1) 359 r = chown(path_rtn, DM_DEVICE_UID, DM_DEVICE_GID); 360 361 /* 362 * If we have to make the path, we unlink it after we open it 363 */ 364 *unlink_path = 1; 365 366 return r ? -errno : 0; 367 } 368 369 static int _clog_ctr(char *uuid, uint64_t luid, 370 int argc, char **argv, uint64_t device_size) 371 { 372 int i; 373 int r = 0; 374 char *p; 375 uint64_t region_size; 376 uint64_t region_count; 377 struct log_c *lc = NULL; 378 struct log_c *duplicate; 379 enum sync sync = DEFAULTSYNC; 380 uint32_t block_on_error = 0; 381 382 int disk_log = 0; 383 char disk_path[128]; 384 int unlink_path = 0; 385 size_t page_size; 386 int pages; 387 388 /* If core log request, then argv[0] will be region_size */ 389 if (!strtoll(argv[0], &p, 0) || *p) { 390 disk_log = 1; 391 392 if ((argc < 2) || (argc > 4)) { 393 LOG_ERROR("Too %s arguments to clustered_disk log type", 394 (argc < 3) ? "few" : "many"); 395 r = -EINVAL; 396 goto fail; 397 } 398 399 r = find_disk_path(argv[0], disk_path, &unlink_path); 400 if (r) { 401 LOG_ERROR("Unable to find path to device %s", argv[0]); 402 goto fail; 403 } 404 LOG_DBG("Clustered log disk is %s", disk_path); 405 } else { 406 disk_log = 0; 407 408 if ((argc < 1) || (argc > 3)) { 409 LOG_ERROR("Too %s arguments to clustered_core log type", 410 (argc < 2) ? "few" : "many"); 411 r = -EINVAL; 412 goto fail; 413 } 414 } 415 416 if (!(region_size = strtoll(argv[disk_log], &p, 0)) || *p) { 417 LOG_ERROR("Invalid region_size argument to clustered_%s log type", 418 (disk_log) ? "disk" : "core"); 419 r = -EINVAL; 420 goto fail; 421 } 422 423 region_count = device_size / region_size; 424 if (device_size % region_size) { 425 /* 426 * I can't remember if device_size must be a multiple 427 * of region_size, so check it anyway. 428 */ 429 region_count++; 430 } 431 432 for (i = 0; i < argc; i++) { 433 if (!strcmp(argv[i], "sync")) 434 sync = FORCESYNC; 435 else if (!strcmp(argv[i], "nosync")) 436 sync = NOSYNC; 437 else if (!strcmp(argv[i], "block_on_error")) 438 block_on_error = 1; 439 } 440 441 lc = malloc(sizeof(*lc)); 442 if (!lc) { 443 LOG_ERROR("Unable to allocate cluster log context"); 444 r = -ENOMEM; 445 goto fail; 446 } 447 memset(lc, 0, sizeof(*lc)); 448 449 lc->region_size = region_size; 450 lc->region_count = region_count; 451 lc->sync = sync; 452 lc->block_on_error = block_on_error; 453 lc->sync_search = 0; 454 lc->recovering_region = (uint64_t)-1; 455 lc->skip_bit_warning = region_count; 456 lc->disk_fd = -1; 457 lc->log_dev_failed = 0; 458 strncpy(lc->uuid, uuid, DM_UUID_LEN); 459 lc->luid = luid; 460 461 if ((duplicate = get_log(lc->uuid, lc->luid)) || 462 (duplicate = get_pending_log(lc->uuid, lc->luid))) { 463 LOG_ERROR("[%s/%llu] Log already exists, unable to create.", 464 SHORT_UUID(lc->uuid), lc->luid); 465 free(lc); 466 return -EINVAL; 467 } 468 469 dm_list_init(&lc->mark_list); 470 471 lc->clean_bits = dm_bitset_create(NULL, region_count); 472 if (!lc->clean_bits) { 473 LOG_ERROR("Unable to allocate clean bitset"); 474 r = -ENOMEM; 475 goto fail; 476 } 477 478 lc->sync_bits = dm_bitset_create(NULL, region_count); 479 if (!lc->sync_bits) { 480 LOG_ERROR("Unable to allocate sync bitset"); 481 r = -ENOMEM; 482 goto fail; 483 } 484 if (sync == NOSYNC) 485 dm_bit_set_all(lc->sync_bits); 486 487 lc->sync_count = (sync == NOSYNC) ? region_count : 0; 488 if (disk_log) { 489 page_size = sysconf(_SC_PAGESIZE); 490 pages = ((int)lc->clean_bits[0])/page_size; 491 pages += ((int)lc->clean_bits[0])%page_size ? 1 : 0; 492 pages += 1; /* for header */ 493 494 r = open(disk_path, O_RDWR | O_DIRECT); 495 if (r < 0) { 496 LOG_ERROR("Unable to open log device, %s: %s", 497 disk_path, strerror(errno)); 498 r = errno; 499 goto fail; 500 } 501 if (unlink_path) 502 unlink(disk_path); 503 504 lc->disk_fd = r; 505 lc->disk_size = pages * page_size; 506 507 r = posix_memalign(&(lc->disk_buffer), page_size, 508 lc->disk_size); 509 if (r) { 510 LOG_ERROR("Unable to allocate memory for disk_buffer"); 511 goto fail; 512 } 513 memset(lc->disk_buffer, 0, lc->disk_size); 514 LOG_DBG("Disk log ready"); 515 } 516 517 dm_list_add(&log_pending_list, &lc->list); 518 519 return 0; 520 fail: 521 if (lc) { 522 if (lc->clean_bits) 523 free(lc->clean_bits); 524 if (lc->sync_bits) 525 free(lc->sync_bits); 526 if (lc->disk_buffer) 527 free(lc->disk_buffer); 528 if (lc->disk_fd >= 0) 529 close(lc->disk_fd); 530 free(lc); 531 } 532 return r; 533 } 534 535 /* 536 * clog_ctr 537 * @rq 538 * 539 * rq->data should contain constructor string as follows: 540 * <log_type> [disk] <region_size> [[no]sync] <device_len> 541 * The kernel is responsible for adding the <dev_len> argument 542 * to the end; otherwise, we cannot compute the region_count. 543 * 544 * FIXME: Currently relies on caller to fill in rq->error 545 */ 546 static int clog_dtr(struct dm_ulog_request *rq); 547 static int clog_ctr(struct dm_ulog_request *rq) 548 { 549 int argc, i, r = 0; 550 char *p, **argv = NULL; 551 char *dev_size_str; 552 uint64_t device_size; 553 554 /* Sanity checks */ 555 if (!rq->data_size) { 556 LOG_ERROR("Received constructor request with no data"); 557 return -EINVAL; 558 } 559 560 if (strlen(rq->data) > rq->data_size) { 561 LOG_ERROR("Received constructor request with bad data"); 562 LOG_ERROR("strlen(rq->data)[%d] != rq->data_size[%llu]", 563 (int)strlen(rq->data), 564 (unsigned long long)rq->data_size); 565 LOG_ERROR("rq->data = '%s' [%d]", 566 rq->data, (int)strlen(rq->data)); 567 return -EINVAL; 568 } 569 570 /* Split up args */ 571 for (argc = 0, p = rq->data; (p = strstr(p, " ")); p++, argc++) 572 *p = '\0'; 573 574 argv = malloc(argc * sizeof(char *)); 575 if (!argv) 576 return -ENOMEM; 577 578 p = dev_size_str = rq->data; 579 p += strlen(p) + 1; 580 for (i = 0; i < argc; i++, p = p + strlen(p) + 1) 581 argv[i] = p; 582 583 if (strcmp(argv[0], "clustered_disk") && 584 strcmp(argv[0], "clustered_core")) { 585 LOG_ERROR("Unsupported userspace log type, \"%s\"", argv[0]); 586 free(argv); 587 return -EINVAL; 588 } 589 590 if (!(device_size = strtoll(dev_size_str, &p, 0)) || *p) { 591 LOG_ERROR("Invalid device size argument: %s", dev_size_str); 592 free(argv); 593 return -EINVAL; 594 } 595 596 r = _clog_ctr(rq->uuid, rq->luid, argc - 1, argv + 1, device_size); 597 598 /* We join the CPG when we resume */ 599 600 /* No returning data */ 601 rq->data_size = 0; 602 603 if (r) { 604 LOG_ERROR("Failed to create cluster log (%s)", rq->uuid); 605 for (i = 0; i < argc; i++) 606 LOG_ERROR("argv[%d] = %s", i, argv[i]); 607 } 608 else 609 LOG_DBG("[%s] Cluster log created", 610 SHORT_UUID(rq->uuid)); 611 612 free(argv); 613 return r; 614 } 615 616 /* 617 * clog_dtr 618 * @rq 619 * 620 */ 621 static int clog_dtr(struct dm_ulog_request *rq) 622 { 623 struct log_c *lc = get_log(rq->uuid, rq->luid); 624 625 if (lc) { 626 /* 627 * The log should not be on the official list. There 628 * should have been a suspend first. 629 */ 630 LOG_ERROR("[%s] DTR before SUS: leaving CPG", 631 SHORT_UUID(rq->uuid)); 632 destroy_cluster_cpg(rq->uuid); 633 } else if (!(lc = get_pending_log(rq->uuid, rq->luid))) { 634 LOG_ERROR("clog_dtr called on log that is not official or pending"); 635 return -EINVAL; 636 } 637 638 LOG_DBG("[%s] Cluster log removed", SHORT_UUID(lc->uuid)); 639 640 dm_list_del(&lc->list); 641 if (lc->disk_fd != -1) 642 close(lc->disk_fd); 643 if (lc->disk_buffer) 644 free(lc->disk_buffer); 645 free(lc->clean_bits); 646 free(lc->sync_bits); 647 free(lc); 648 649 return 0; 650 } 651 652 /* 653 * clog_presuspend 654 * @rq 655 * 656 */ 657 static int clog_presuspend(struct dm_ulog_request *rq) 658 { 659 struct log_c *lc = get_log(rq->uuid, rq->luid); 660 661 if (!lc) 662 return -EINVAL; 663 664 if (lc->touched) 665 LOG_DBG("WARNING: log still marked as 'touched' during suspend"); 666 667 lc->recovery_halted = 1; 668 669 return 0; 670 } 671 672 /* 673 * clog_postsuspend 674 * @rq 675 * 676 */ 677 static int clog_postsuspend(struct dm_ulog_request *rq) 678 { 679 struct log_c *lc = get_log(rq->uuid, rq->luid); 680 681 if (!lc) 682 return -EINVAL; 683 684 LOG_DBG("[%s] clog_postsuspend: leaving CPG", SHORT_UUID(lc->uuid)); 685 destroy_cluster_cpg(rq->uuid); 686 687 lc->state = LOG_SUSPENDED; 688 lc->recovering_region = (uint64_t)-1; 689 lc->recoverer = (uint32_t)-1; 690 lc->delay = time(NULL); 691 692 return 0; 693 } 694 695 /* 696 * cluster_postsuspend 697 * @rq 698 * 699 */ 700 int cluster_postsuspend(char *uuid, uint64_t luid) 701 { 702 struct log_c *lc = get_log(uuid, luid); 703 704 if (!lc) 705 return -EINVAL; 706 707 LOG_DBG("[%s] clog_postsuspend: finalizing", SHORT_UUID(lc->uuid)); 708 lc->resume_override = 0; 709 710 /* move log to pending list */ 711 dm_list_del(&lc->list); 712 dm_list_add(&log_pending_list, &lc->list); 713 714 return 0; 715 } 716 717 /* 718 * clog_resume 719 * @rq 720 * 721 * Does the main work of resuming. 722 */ 723 static int clog_resume(struct dm_ulog_request *rq) 724 { 725 uint32_t i; 726 int commit_log = 0; 727 struct log_c *lc = get_log(rq->uuid, rq->luid); 728 729 if (!lc) 730 return -EINVAL; 731 732 switch (lc->resume_override) { 733 case 1000: 734 LOG_ERROR("[%s] Additional resume issued before suspend", 735 SHORT_UUID(rq->uuid)); 736 #ifdef DEBUG 737 kill(getpid(), SIGUSR1); 738 #endif 739 return 0; 740 case 0: 741 lc->resume_override = 1000; 742 if (lc->disk_fd == -1) { 743 LOG_DBG("[%s] Master resume.", 744 SHORT_UUID(lc->uuid)); 745 goto no_disk; 746 } 747 748 LOG_DBG("[%s] Master resume: reading disk log", 749 SHORT_UUID(lc->uuid)); 750 commit_log = 1; 751 break; 752 case 1: 753 LOG_ERROR("Error:: partial bit loading (just sync_bits)"); 754 return -EINVAL; 755 case 2: 756 LOG_ERROR("Error:: partial bit loading (just clean_bits)"); 757 return -EINVAL; 758 case 3: 759 LOG_DBG("[%s] Non-master resume: bits pre-loaded", 760 SHORT_UUID(lc->uuid)); 761 lc->resume_override = 1000; 762 goto out; 763 default: 764 LOG_ERROR("Error:: multiple loading of bits (%d)", 765 lc->resume_override); 766 return -EINVAL; 767 } 768 769 if (lc->log_dev_failed) { 770 LOG_ERROR("Log device has failed, unable to read bits"); 771 rq->error = 0; /* We can handle this so far */ 772 lc->disk_nr_regions = 0; 773 } else 774 rq->error = read_log(lc); 775 776 switch (rq->error) { 777 case 0: 778 if (lc->disk_nr_regions < lc->region_count) 779 LOG_DBG("[%s] Mirror has grown, updating log bits", 780 SHORT_UUID(lc->uuid)); 781 else if (lc->disk_nr_regions > lc->region_count) 782 LOG_DBG("[%s] Mirror has shrunk, updating log bits", 783 SHORT_UUID(lc->uuid)); 784 break; 785 case -EINVAL: 786 LOG_DBG("[%s] (Re)initializing mirror log - resync issued.", 787 SHORT_UUID(lc->uuid)); 788 lc->disk_nr_regions = 0; 789 break; 790 default: 791 LOG_ERROR("Failed to read disk log"); 792 lc->disk_nr_regions = 0; 793 break; 794 } 795 796 no_disk: 797 /* If mirror has grown, set bits appropriately */ 798 if (lc->sync == NOSYNC) 799 for (i = lc->disk_nr_regions; i < lc->region_count; i++) 800 log_set_bit(lc, lc->clean_bits, i); 801 else 802 for (i = lc->disk_nr_regions; i < lc->region_count; i++) 803 log_clear_bit(lc, lc->clean_bits, i); 804 805 /* Clear any old bits if device has shrunk */ 806 for (i = lc->region_count; i % 32; i++) 807 log_clear_bit(lc, lc->clean_bits, i); 808 809 /* copy clean across to sync */ 810 dm_bit_copy(lc->sync_bits, lc->clean_bits); 811 812 if (commit_log && (lc->disk_fd >= 0)) { 813 rq->error = write_log(lc); 814 if (rq->error) 815 LOG_ERROR("Failed initial disk log write"); 816 else 817 LOG_DBG("Disk log initialized"); 818 lc->touched = 0; 819 } 820 out: 821 /* 822 * Clear any old bits if device has shrunk - necessary 823 * for non-master resume 824 */ 825 for (i = lc->region_count; i % 32; i++) { 826 log_clear_bit(lc, lc->clean_bits, i); 827 log_clear_bit(lc, lc->sync_bits, i); 828 } 829 830 lc->sync_count = count_bits32(lc->sync_bits); 831 832 LOG_SPRINT(lc, "[%s] Initial sync_count = %llu", 833 SHORT_UUID(lc->uuid), (unsigned long long)lc->sync_count); 834 lc->sync_search = 0; 835 lc->state = LOG_RESUMED; 836 lc->recovery_halted = 0; 837 838 return rq->error; 839 } 840 841 /* 842 * local_resume 843 * @rq 844 * 845 * If the log is pending, we must first join the cpg and 846 * put the log in the official list. 847 * 848 */ 849 int local_resume(struct dm_ulog_request *rq) 850 { 851 int r; 852 time_t t; 853 struct log_c *lc = get_log(rq->uuid, rq->luid); 854 855 if (!lc) { 856 /* Is the log in the pending list? */ 857 lc = get_pending_log(rq->uuid, rq->luid); 858 if (!lc) { 859 LOG_ERROR("clog_resume called on log that is not official or pending"); 860 return -EINVAL; 861 } 862 863 t = time(NULL); 864 t -= lc->delay; 865 /* 866 * This should be considered a temporary fix. It addresses 867 * a problem that exists when nodes suspend/resume in rapid 868 * succession. While the problem is very rare, it has been 869 * seen to happen in real-world-like testing. 870 * 871 * The problem: 872 * - Node A joins cluster 873 * - Node B joins cluster 874 * - Node A prepares checkpoint 875 * - Node A gets ready to write checkpoint 876 * - Node B leaves 877 * - Node B joins 878 * - Node A finishes write of checkpoint 879 * - Node B receives checkpoint meant for previous session 880 * -- Node B can now be non-coherent 881 * 882 * This timer will solve the problem for now, but could be 883 * replaced by a generation number sent with the resume 884 * command from the kernel. The generation number would 885 * be included in the name of the checkpoint to prevent 886 * reading stale data. 887 */ 888 if ((t < 3) && (t >= 0)) 889 sleep(3 - t); 890 891 /* Join the CPG */ 892 r = create_cluster_cpg(rq->uuid, rq->luid); 893 if (r) { 894 LOG_ERROR("clog_resume: Failed to create cluster CPG"); 895 return r; 896 } 897 898 /* move log to official list */ 899 dm_list_del(&lc->list); 900 dm_list_add(&log_list, &lc->list); 901 } 902 903 return 0; 904 } 905 906 /* 907 * clog_get_region_size 908 * @rq 909 * 910 * Since this value doesn't change, the kernel 911 * should not need to talk to server to get this 912 * The function is here for completness 913 * 914 * Returns: 0 on success, -EXXX on failure 915 */ 916 static int clog_get_region_size(struct dm_ulog_request *rq) 917 { 918 uint64_t *rtn = (uint64_t *)rq->data; 919 struct log_c *lc = get_log(rq->uuid, rq->luid); 920 921 if (!lc && !(lc = get_pending_log(rq->uuid, rq->luid))) 922 return -EINVAL; 923 924 *rtn = lc->region_size; 925 rq->data_size = sizeof(*rtn); 926 927 return 0; 928 } 929 930 /* 931 * clog_is_clean 932 * @rq 933 * 934 * Returns: 1 if clean, 0 otherwise 935 */ 936 static int clog_is_clean(struct dm_ulog_request *rq) 937 { 938 int64_t *rtn = (int64_t *)rq->data; 939 uint64_t region = *((uint64_t *)(rq->data)); 940 struct log_c *lc = get_log(rq->uuid, rq->luid); 941 942 if (!lc) 943 return -EINVAL; 944 945 *rtn = log_test_bit(lc->clean_bits, region); 946 rq->data_size = sizeof(*rtn); 947 948 return 0; 949 } 950 951 /* 952 * clog_in_sync 953 * @rq 954 * 955 * We ignore any request for non-block. That 956 * should be handled elsewhere. (If the request 957 * has come this far, it has already blocked.) 958 * 959 * Returns: 1 if in-sync, 0 otherwise 960 */ 961 static int clog_in_sync(struct dm_ulog_request *rq) 962 { 963 int64_t *rtn = (int64_t *)rq->data; 964 uint64_t region = *((uint64_t *)(rq->data)); 965 struct log_c *lc = get_log(rq->uuid, rq->luid); 966 967 if (!lc) 968 return -EINVAL; 969 970 if (region > lc->region_count) 971 return -EINVAL; 972 973 *rtn = log_test_bit(lc->sync_bits, region); 974 if (*rtn) 975 LOG_DBG("[%s] Region is in-sync: %llu", 976 SHORT_UUID(lc->uuid), (unsigned long long)region); 977 else 978 LOG_DBG("[%s] Region is not in-sync: %llu", 979 SHORT_UUID(lc->uuid), (unsigned long long)region); 980 981 rq->data_size = sizeof(*rtn); 982 983 return 0; 984 } 985 986 /* 987 * clog_flush 988 * @rq 989 * 990 */ 991 static int clog_flush(struct dm_ulog_request *rq, int server) 992 { 993 int r = 0; 994 struct log_c *lc = get_log(rq->uuid, rq->luid); 995 996 if (!lc) 997 return -EINVAL; 998 999 if (!lc->touched) 1000 return 0; 1001 1002 /* 1003 * Do the actual flushing of the log only 1004 * if we are the server. 1005 */ 1006 if (server && (lc->disk_fd >= 0)) { 1007 r = rq->error = write_log(lc); 1008 if (r) 1009 LOG_ERROR("[%s] Error writing to disk log", 1010 SHORT_UUID(lc->uuid)); 1011 else 1012 LOG_DBG("[%s] Disk log written", SHORT_UUID(lc->uuid)); 1013 } 1014 1015 lc->touched = 0; 1016 1017 return r; 1018 1019 } 1020 1021 /* 1022 * mark_region 1023 * @lc 1024 * @region 1025 * @who 1026 * 1027 * Put a mark region request in the tree for tracking. 1028 * 1029 * Returns: 0 on success, -EXXX on error 1030 */ 1031 static int mark_region(struct log_c *lc, uint64_t region, uint32_t who) 1032 { 1033 int found = 0; 1034 struct mark_entry *m; 1035 1036 dm_list_iterate_items(m, &lc->mark_list) 1037 if (m->region == region) { 1038 found = 1; 1039 if (m->nodeid == who) 1040 return 0; 1041 } 1042 1043 if (!found) 1044 log_clear_bit(lc, lc->clean_bits, region); 1045 1046 /* 1047 * Save allocation until here - if there is a failure, 1048 * at least we have cleared the bit. 1049 */ 1050 m = malloc(sizeof(*m)); 1051 if (!m) { 1052 LOG_ERROR("Unable to allocate space for mark_entry: %llu/%u", 1053 (unsigned long long)region, who); 1054 return -ENOMEM; 1055 } 1056 1057 m->nodeid = who; 1058 m->region = region; 1059 dm_list_add(&lc->mark_list, &m->list); 1060 1061 return 0; 1062 } 1063 1064 /* 1065 * clog_mark_region 1066 * @rq 1067 * 1068 * rq may contain more than one mark request. We 1069 * can determine the number from the 'data_size' field. 1070 * 1071 * Returns: 0 on success, -EXXX on failure 1072 */ 1073 static int clog_mark_region(struct dm_ulog_request *rq, uint32_t originator) 1074 { 1075 int r; 1076 int count; 1077 uint64_t *region; 1078 struct log_c *lc = get_log(rq->uuid, rq->luid); 1079 1080 if (!lc) 1081 return -EINVAL; 1082 1083 if (rq->data_size % sizeof(uint64_t)) { 1084 LOG_ERROR("Bad data size given for mark_region request"); 1085 return -EINVAL; 1086 } 1087 1088 count = rq->data_size / sizeof(uint64_t); 1089 region = (uint64_t *)&rq->data; 1090 1091 for (; count > 0; count--, region++) { 1092 r = mark_region(lc, *region, originator); 1093 if (r) 1094 return r; 1095 } 1096 1097 rq->data_size = 0; 1098 1099 return 0; 1100 } 1101 1102 static int clear_region(struct log_c *lc, uint64_t region, uint32_t who) 1103 { 1104 int other_matches = 0; 1105 struct mark_entry *m, *n; 1106 1107 dm_list_iterate_items_safe(m, n, &lc->mark_list) 1108 if (m->region == region) { 1109 if (m->nodeid == who) { 1110 dm_list_del(&m->list); 1111 free(m); 1112 } else 1113 other_matches = 1; 1114 } 1115 1116 /* 1117 * Clear region if: 1118 * 1) It is in-sync 1119 * 2) There are no other machines that have it marked 1120 */ 1121 if (!other_matches && log_test_bit(lc->sync_bits, region)) 1122 log_set_bit(lc, lc->clean_bits, region); 1123 1124 return 0; 1125 } 1126 1127 /* 1128 * clog_clear_region 1129 * @rq 1130 * 1131 * rq may contain more than one clear request. We 1132 * can determine the number from the 'data_size' field. 1133 * 1134 * Returns: 0 on success, -EXXX on failure 1135 */ 1136 static int clog_clear_region(struct dm_ulog_request *rq, uint32_t originator) 1137 { 1138 int r; 1139 int count; 1140 uint64_t *region; 1141 struct log_c *lc = get_log(rq->uuid, rq->luid); 1142 1143 if (!lc) 1144 return -EINVAL; 1145 1146 if (rq->data_size % sizeof(uint64_t)) { 1147 LOG_ERROR("Bad data size given for clear_region request"); 1148 return -EINVAL; 1149 } 1150 1151 count = rq->data_size / sizeof(uint64_t); 1152 region = (uint64_t *)&rq->data; 1153 1154 for (; count > 0; count--, region++) { 1155 r = clear_region(lc, *region, originator); 1156 if (r) 1157 return r; 1158 } 1159 1160 rq->data_size = 0; 1161 1162 return 0; 1163 } 1164 1165 /* 1166 * clog_get_resync_work 1167 * @rq 1168 * 1169 */ 1170 static int clog_get_resync_work(struct dm_ulog_request *rq, uint32_t originator) 1171 { 1172 struct { 1173 int64_t i; 1174 uint64_t r; 1175 } *pkg = (void *)rq->data; 1176 struct log_c *lc = get_log(rq->uuid, rq->luid); 1177 1178 if (!lc) 1179 return -EINVAL; 1180 1181 rq->data_size = sizeof(*pkg); 1182 pkg->i = 0; 1183 1184 if (lc->sync_search >= lc->region_count) { 1185 /* 1186 * FIXME: handle intermittent errors during recovery 1187 * by resetting sync_search... but not to many times. 1188 */ 1189 LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: " 1190 "Recovery finished", 1191 rq->seq, SHORT_UUID(lc->uuid), originator); 1192 return 0; 1193 } 1194 1195 if (lc->recovering_region != (uint64_t)-1) { 1196 if (lc->recoverer == originator) { 1197 LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: " 1198 "Re-requesting work (%llu)", 1199 rq->seq, SHORT_UUID(lc->uuid), originator, 1200 (unsigned long long)lc->recovering_region); 1201 pkg->r = lc->recovering_region; 1202 pkg->i = 1; 1203 LOG_COND(log_resend_requests, "***** RE-REQUEST *****"); 1204 } else { 1205 LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: " 1206 "Someone already recovering (%llu)", 1207 rq->seq, SHORT_UUID(lc->uuid), originator, 1208 (unsigned long long)lc->recovering_region); 1209 } 1210 1211 return 0; 1212 } 1213 1214 while (lc->recovery_request_list) { 1215 struct recovery_request *del; 1216 1217 del = lc->recovery_request_list; 1218 lc->recovery_request_list = del->next; 1219 1220 pkg->r = del->region; 1221 free(del); 1222 1223 if (!log_test_bit(lc->sync_bits, pkg->r)) { 1224 LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: " 1225 "Assigning priority resync work (%llu)", 1226 rq->seq, SHORT_UUID(lc->uuid), originator, 1227 (unsigned long long)pkg->r); 1228 pkg->i = 1; 1229 lc->recovering_region = pkg->r; 1230 lc->recoverer = originator; 1231 return 0; 1232 } 1233 } 1234 1235 pkg->r = find_next_zero_bit(lc->sync_bits, 1236 lc->sync_search); 1237 1238 if (pkg->r >= lc->region_count) { 1239 LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: " 1240 "Resync work complete.", 1241 rq->seq, SHORT_UUID(lc->uuid), originator); 1242 return 0; 1243 } 1244 1245 lc->sync_search = pkg->r + 1; 1246 1247 LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: " 1248 "Assigning resync work (%llu)", 1249 rq->seq, SHORT_UUID(lc->uuid), originator, 1250 (unsigned long long)pkg->r); 1251 pkg->i = 1; 1252 lc->recovering_region = pkg->r; 1253 lc->recoverer = originator; 1254 1255 return 0; 1256 } 1257 1258 /* 1259 * clog_set_region_sync 1260 * @rq 1261 */ 1262 static int clog_set_region_sync(struct dm_ulog_request *rq, uint32_t originator) 1263 { 1264 struct { 1265 uint64_t region; 1266 int64_t in_sync; 1267 } *pkg = (void *)rq->data; 1268 struct log_c *lc = get_log(rq->uuid, rq->luid); 1269 1270 if (!lc) 1271 return -EINVAL; 1272 1273 lc->recovering_region = (uint64_t)-1; 1274 1275 if (pkg->in_sync) { 1276 if (log_test_bit(lc->sync_bits, pkg->region)) { 1277 LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: " 1278 "Region already set (%llu)", 1279 rq->seq, SHORT_UUID(lc->uuid), originator, 1280 (unsigned long long)pkg->region); 1281 } else { 1282 log_set_bit(lc, lc->sync_bits, pkg->region); 1283 lc->sync_count++; 1284 1285 /* The rest of this section is all for debugging */ 1286 LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: " 1287 "Setting region (%llu)", 1288 rq->seq, SHORT_UUID(lc->uuid), originator, 1289 (unsigned long long)pkg->region); 1290 if (pkg->region == lc->skip_bit_warning) 1291 lc->skip_bit_warning = lc->region_count; 1292 1293 if (pkg->region > (lc->skip_bit_warning + 5)) { 1294 LOG_ERROR("*** Region #%llu skipped during recovery ***", 1295 (unsigned long long)lc->skip_bit_warning); 1296 lc->skip_bit_warning = lc->region_count; 1297 #ifdef DEBUG 1298 kill(getpid(), SIGUSR1); 1299 #endif 1300 } 1301 1302 if (!log_test_bit(lc->sync_bits, 1303 (pkg->region) ? pkg->region - 1 : 0)) { 1304 LOG_SPRINT(lc, "*** Previous bit not set ***"); 1305 lc->skip_bit_warning = (pkg->region) ? 1306 pkg->region - 1 : 0; 1307 } 1308 } 1309 } else if (log_test_bit(lc->sync_bits, pkg->region)) { 1310 lc->sync_count--; 1311 log_clear_bit(lc, lc->sync_bits, pkg->region); 1312 LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: " 1313 "Unsetting region (%llu)", 1314 rq->seq, SHORT_UUID(lc->uuid), originator, 1315 (unsigned long long)pkg->region); 1316 } 1317 1318 if (lc->sync_count != count_bits32(lc->sync_bits)) { 1319 unsigned long long reset = count_bits32(lc->sync_bits); 1320 1321 LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: " 1322 "sync_count(%llu) != bitmap count(%llu)", 1323 rq->seq, SHORT_UUID(lc->uuid), originator, 1324 (unsigned long long)lc->sync_count, reset); 1325 #ifdef DEBUG 1326 kill(getpid(), SIGUSR1); 1327 #endif 1328 lc->sync_count = reset; 1329 } 1330 1331 if (lc->sync_count > lc->region_count) 1332 LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: " 1333 "(lc->sync_count > lc->region_count) - this is bad", 1334 rq->seq, SHORT_UUID(lc->uuid), originator); 1335 1336 rq->data_size = 0; 1337 return 0; 1338 } 1339 1340 /* 1341 * clog_get_sync_count 1342 * @rq 1343 */ 1344 static int clog_get_sync_count(struct dm_ulog_request *rq, uint32_t originator) 1345 { 1346 uint64_t *sync_count = (uint64_t *)rq->data; 1347 struct log_c *lc = get_log(rq->uuid, rq->luid); 1348 1349 /* 1350 * FIXME: Mirror requires us to be able to ask for 1351 * the sync count while pending... but I don't like 1352 * it because other machines may not be suspended and 1353 * the stored value may not be accurate. 1354 */ 1355 if (!lc) 1356 lc = get_pending_log(rq->uuid, rq->luid); 1357 1358 if (!lc) 1359 return -EINVAL; 1360 1361 *sync_count = lc->sync_count; 1362 1363 rq->data_size = sizeof(*sync_count); 1364 1365 if (lc->sync_count != count_bits32(lc->sync_bits)) { 1366 unsigned long long reset = count_bits32(lc->sync_bits); 1367 1368 LOG_SPRINT(lc, "get_sync_count - SEQ#=%u, UUID=%s, nodeid = %u:: " 1369 "sync_count(%llu) != bitmap count(%llu)", 1370 rq->seq, SHORT_UUID(lc->uuid), originator, 1371 (unsigned long long)lc->sync_count, reset); 1372 #ifdef DEBUG 1373 kill(getpid(), SIGUSR1); 1374 #endif 1375 lc->sync_count = reset; 1376 } 1377 1378 return 0; 1379 } 1380 1381 static int core_status_info(struct log_c *lc, struct dm_ulog_request *rq) 1382 { 1383 char *data = (char *)rq->data; 1384 1385 rq->data_size = sprintf(data, "1 clustered_core"); 1386 1387 return 0; 1388 } 1389 1390 static int disk_status_info(struct log_c *lc, struct dm_ulog_request *rq) 1391 { 1392 char *data = (char *)rq->data; 1393 struct stat statbuf; 1394 1395 if(fstat(lc->disk_fd, &statbuf)) { 1396 rq->error = -errno; 1397 return -errno; 1398 } 1399 1400 rq->data_size = sprintf(data, "3 clustered_disk %d:%d %c", 1401 major(statbuf.st_rdev), minor(statbuf.st_rdev), 1402 (lc->log_dev_failed) ? 'D' : 'A'); 1403 1404 return 0; 1405 } 1406 1407 /* 1408 * clog_status_info 1409 * @rq 1410 * 1411 */ 1412 static int clog_status_info(struct dm_ulog_request *rq) 1413 { 1414 int r; 1415 struct log_c *lc = get_log(rq->uuid, rq->luid); 1416 1417 if (!lc) 1418 lc = get_pending_log(rq->uuid, rq->luid); 1419 1420 if (!lc) 1421 return -EINVAL; 1422 1423 if (lc->disk_fd == -1) 1424 r = core_status_info(lc, rq); 1425 else 1426 r = disk_status_info(lc, rq); 1427 1428 return r; 1429 } 1430 1431 static int core_status_table(struct log_c *lc, struct dm_ulog_request *rq) 1432 { 1433 char *data = (char *)rq->data; 1434 1435 rq->data_size = sprintf(data, "clustered_core %u %s%s ", 1436 lc->region_size, 1437 (lc->sync == DEFAULTSYNC) ? "" : 1438 (lc->sync == NOSYNC) ? "nosync " : "sync ", 1439 (lc->block_on_error) ? "block_on_error" : ""); 1440 return 0; 1441 } 1442 1443 static int disk_status_table(struct log_c *lc, struct dm_ulog_request *rq) 1444 { 1445 char *data = (char *)rq->data; 1446 struct stat statbuf; 1447 1448 if(fstat(lc->disk_fd, &statbuf)) { 1449 rq->error = -errno; 1450 return -errno; 1451 } 1452 1453 rq->data_size = sprintf(data, "clustered_disk %d:%d %u %s%s ", 1454 major(statbuf.st_rdev), minor(statbuf.st_rdev), 1455 lc->region_size, 1456 (lc->sync == DEFAULTSYNC) ? "" : 1457 (lc->sync == NOSYNC) ? "nosync " : "sync ", 1458 (lc->block_on_error) ? "block_on_error" : ""); 1459 return 0; 1460 } 1461 1462 /* 1463 * clog_status_table 1464 * @rq 1465 * 1466 */ 1467 static int clog_status_table(struct dm_ulog_request *rq) 1468 { 1469 int r; 1470 struct log_c *lc = get_log(rq->uuid, rq->luid); 1471 1472 if (!lc) 1473 lc = get_pending_log(rq->uuid, rq->luid); 1474 1475 if (!lc) 1476 return -EINVAL; 1477 1478 if (lc->disk_fd == -1) 1479 r = core_status_table(lc, rq); 1480 else 1481 r = disk_status_table(lc, rq); 1482 1483 return r; 1484 } 1485 1486 /* 1487 * clog_is_remote_recovering 1488 * @rq 1489 * 1490 */ 1491 static int clog_is_remote_recovering(struct dm_ulog_request *rq) 1492 { 1493 uint64_t region = *((uint64_t *)(rq->data)); 1494 struct { 1495 int64_t is_recovering; 1496 uint64_t in_sync_hint; 1497 } *pkg = (void *)rq->data; 1498 struct log_c *lc = get_log(rq->uuid, rq->luid); 1499 1500 if (!lc) 1501 return -EINVAL; 1502 1503 if (region > lc->region_count) 1504 return -EINVAL; 1505 1506 if (lc->recovery_halted) { 1507 LOG_DBG("[%s] Recovery halted... [not remote recovering]: %llu", 1508 SHORT_UUID(lc->uuid), (unsigned long long)region); 1509 pkg->is_recovering = 0; 1510 pkg->in_sync_hint = lc->region_count; /* none are recovering */ 1511 } else { 1512 pkg->is_recovering = !log_test_bit(lc->sync_bits, region); 1513 1514 /* 1515 * Remember, 'lc->sync_search' is 1 plus the region 1516 * currently being recovered. So, we must take off 1 1517 * to account for that; but only if 'sync_search > 1'. 1518 */ 1519 pkg->in_sync_hint = lc->sync_search ? (lc->sync_search - 1) : 0; 1520 LOG_DBG("[%s] Region is %s: %llu", 1521 SHORT_UUID(lc->uuid), 1522 (region == lc->recovering_region) ? 1523 "currently remote recovering" : 1524 (pkg->is_recovering) ? "pending remote recovery" : 1525 "not remote recovering", (unsigned long long)region); 1526 } 1527 1528 if (pkg->is_recovering && 1529 (region != lc->recovering_region)) { 1530 struct recovery_request *rr; 1531 1532 /* Already in the list? */ 1533 for (rr = lc->recovery_request_list; rr; rr = rr->next) 1534 if (rr->region == region) 1535 goto out; 1536 1537 /* Failure to allocated simply means we can't prioritize it */ 1538 rr = malloc(sizeof(*rr)); 1539 if (!rr) 1540 goto out; 1541 1542 LOG_DBG("[%s] Adding region to priority list: %llu", 1543 SHORT_UUID(lc->uuid), (unsigned long long)region); 1544 rr->region = region; 1545 rr->next = lc->recovery_request_list; 1546 lc->recovery_request_list = rr; 1547 } 1548 1549 out: 1550 1551 rq->data_size = sizeof(*pkg); 1552 1553 return 0; 1554 } 1555 1556 1557 /* 1558 * do_request 1559 * @rq: the request 1560 * @server: is this request performed by the server 1561 * 1562 * An inability to perform this function will return an error 1563 * from this function. However, an inability to successfully 1564 * perform the request will fill in the 'rq->error' field. 1565 * 1566 * Returns: 0 on success, -EXXX on error 1567 */ 1568 int do_request(struct clog_request *rq, int server) 1569 { 1570 int r; 1571 1572 if (!rq) 1573 return 0; 1574 1575 if (rq->u_rq.error) 1576 LOG_DBG("Programmer error: rq struct has error set"); 1577 1578 switch (rq->u_rq.request_type) { 1579 case DM_ULOG_CTR: 1580 r = clog_ctr(&rq->u_rq); 1581 break; 1582 case DM_ULOG_DTR: 1583 r = clog_dtr(&rq->u_rq); 1584 break; 1585 case DM_ULOG_PRESUSPEND: 1586 r = clog_presuspend(&rq->u_rq); 1587 break; 1588 case DM_ULOG_POSTSUSPEND: 1589 r = clog_postsuspend(&rq->u_rq); 1590 break; 1591 case DM_ULOG_RESUME: 1592 r = clog_resume(&rq->u_rq); 1593 break; 1594 case DM_ULOG_GET_REGION_SIZE: 1595 r = clog_get_region_size(&rq->u_rq); 1596 break; 1597 case DM_ULOG_IS_CLEAN: 1598 r = clog_is_clean(&rq->u_rq); 1599 break; 1600 case DM_ULOG_IN_SYNC: 1601 r = clog_in_sync(&rq->u_rq); 1602 break; 1603 case DM_ULOG_FLUSH: 1604 r = clog_flush(&rq->u_rq, server); 1605 break; 1606 case DM_ULOG_MARK_REGION: 1607 r = clog_mark_region(&rq->u_rq, rq->originator); 1608 break; 1609 case DM_ULOG_CLEAR_REGION: 1610 r = clog_clear_region(&rq->u_rq, rq->originator); 1611 break; 1612 case DM_ULOG_GET_RESYNC_WORK: 1613 r = clog_get_resync_work(&rq->u_rq, rq->originator); 1614 break; 1615 case DM_ULOG_SET_REGION_SYNC: 1616 r = clog_set_region_sync(&rq->u_rq, rq->originator); 1617 break; 1618 case DM_ULOG_GET_SYNC_COUNT: 1619 r = clog_get_sync_count(&rq->u_rq, rq->originator); 1620 break; 1621 case DM_ULOG_STATUS_INFO: 1622 r = clog_status_info(&rq->u_rq); 1623 break; 1624 case DM_ULOG_STATUS_TABLE: 1625 r = clog_status_table(&rq->u_rq); 1626 break; 1627 case DM_ULOG_IS_REMOTE_RECOVERING: 1628 r = clog_is_remote_recovering(&rq->u_rq); 1629 break; 1630 default: 1631 LOG_ERROR("Unknown request"); 1632 r = rq->u_rq.error = -EINVAL; 1633 break; 1634 } 1635 1636 if (r && !rq->u_rq.error) 1637 rq->u_rq.error = r; 1638 else if (r != rq->u_rq.error) 1639 LOG_DBG("Warning: error from function != rq->u_rq.error"); 1640 1641 if (rq->u_rq.error && rq->u_rq.data_size) { 1642 /* Make sure I'm handling errors correctly above */ 1643 LOG_DBG("Programmer error: rq->u_rq.error && rq->u_rq.data_size"); 1644 rq->u_rq.data_size = 0; 1645 } 1646 1647 return 0; 1648 } 1649 1650 static void print_bits(char *buf, int size, int print) 1651 { 1652 int i; 1653 char outbuf[128]; 1654 1655 memset(outbuf, 0, sizeof(outbuf)); 1656 1657 for (i = 0; i < size; i++) { 1658 if (!(i % 16)) { 1659 if (outbuf[0] != '\0') { 1660 if (print) 1661 LOG_PRINT("%s", outbuf); 1662 else 1663 LOG_DBG("%s", outbuf); 1664 } 1665 memset(outbuf, 0, sizeof(outbuf)); 1666 sprintf(outbuf, "[%3d - %3d]", i, i+15); 1667 } 1668 sprintf(outbuf + strlen(outbuf), " %.2X", (unsigned char)buf[i]); 1669 } 1670 if (outbuf[0] != '\0') { 1671 if (print) 1672 LOG_PRINT("%s", outbuf); 1673 else 1674 LOG_DBG("%s", outbuf); 1675 } 1676 } 1677 1678 /* int store_bits(const char *uuid, const char *which, char **buf)*/ 1679 int push_state(const char *uuid, uint64_t luid, 1680 const char *which, char **buf, uint32_t debug_who) 1681 { 1682 int bitset_size; 1683 struct log_c *lc; 1684 1685 if (*buf) 1686 LOG_ERROR("store_bits: *buf != NULL"); 1687 1688 lc = get_log(uuid, luid); 1689 if (!lc) { 1690 LOG_ERROR("store_bits: No log found for %s", uuid); 1691 return -EINVAL; 1692 } 1693 1694 if (!strcmp(which, "recovering_region")) { 1695 *buf = malloc(64); /* easily handles the 2 written numbers */ 1696 if (!*buf) 1697 return -ENOMEM; 1698 sprintf(*buf, "%llu %u", (unsigned long long)lc->recovering_region, 1699 lc->recoverer); 1700 1701 LOG_SPRINT(lc, "CKPT SEND - SEQ#=X, UUID=%s, nodeid = %u:: " 1702 "recovering_region=%llu, recoverer=%u, sync_count=%llu", 1703 SHORT_UUID(lc->uuid), debug_who, 1704 (unsigned long long)lc->recovering_region, 1705 lc->recoverer, 1706 (unsigned long long)count_bits32(lc->sync_bits)); 1707 return 64; 1708 } 1709 1710 /* Size in 'int's */ 1711 bitset_size = ((int)lc->clean_bits[0]/DM_BITS_PER_INT) + 1; 1712 1713 /* Size in bytes */ 1714 bitset_size *= 4; 1715 1716 *buf = malloc(bitset_size); 1717 1718 if (!*buf) { 1719 LOG_ERROR("store_bits: Unable to allocate memory"); 1720 return -ENOMEM; 1721 } 1722 1723 if (!strncmp(which, "sync_bits", 9)) { 1724 memcpy(*buf, lc->sync_bits + 1, bitset_size); 1725 LOG_DBG("[%s] storing sync_bits (sync_count = %llu):", 1726 SHORT_UUID(uuid), (unsigned long long) 1727 count_bits32(lc->sync_bits)); 1728 print_bits(*buf, bitset_size, 0); 1729 } else if (!strncmp(which, "clean_bits", 9)) { 1730 memcpy(*buf, lc->clean_bits + 1, bitset_size); 1731 LOG_DBG("[%s] storing clean_bits:", SHORT_UUID(lc->uuid)); 1732 print_bits(*buf, bitset_size, 0); 1733 } 1734 1735 return bitset_size; 1736 } 1737 1738 /*int load_bits(const char *uuid, const char *which, char *buf, int size)*/ 1739 int pull_state(const char *uuid, uint64_t luid, 1740 const char *which, char *buf, int size) 1741 { 1742 int bitset_size; 1743 struct log_c *lc; 1744 1745 if (!buf) 1746 LOG_ERROR("pull_state: buf == NULL"); 1747 1748 lc = get_log(uuid, luid); 1749 if (!lc) { 1750 LOG_ERROR("pull_state: No log found for %s", uuid); 1751 return -EINVAL; 1752 } 1753 1754 if (!strncmp(which, "recovering_region", 17)) { 1755 sscanf(buf, "%llu %u", (unsigned long long *)&lc->recovering_region, 1756 &lc->recoverer); 1757 LOG_SPRINT(lc, "CKPT INIT - SEQ#=X, UUID=%s, nodeid = X:: " 1758 "recovering_region=%llu, recoverer=%u", 1759 SHORT_UUID(lc->uuid), 1760 (unsigned long long)lc->recovering_region, lc->recoverer); 1761 return 0; 1762 } 1763 1764 /* Size in 'int's */ 1765 bitset_size = ((int)lc->clean_bits[0]/DM_BITS_PER_INT) + 1; 1766 1767 /* Size in bytes */ 1768 bitset_size *= 4; 1769 1770 if (bitset_size != size) { 1771 LOG_ERROR("pull_state(%s): bad bitset_size (%d vs %d)", 1772 which, size, bitset_size); 1773 return -EINVAL; 1774 } 1775 1776 if (!strncmp(which, "sync_bits", 9)) { 1777 lc->resume_override += 1; 1778 memcpy(lc->sync_bits + 1, buf, bitset_size); 1779 LOG_DBG("[%s] loading sync_bits (sync_count = %llu):", 1780 SHORT_UUID(lc->uuid),(unsigned long long) 1781 count_bits32(lc->sync_bits)); 1782 print_bits((char *)lc->sync_bits, bitset_size, 0); 1783 } else if (!strncmp(which, "clean_bits", 9)) { 1784 lc->resume_override += 2; 1785 memcpy(lc->clean_bits + 1, buf, bitset_size); 1786 LOG_DBG("[%s] loading clean_bits:", SHORT_UUID(lc->uuid)); 1787 print_bits((char *)lc->clean_bits, bitset_size, 0); 1788 } 1789 1790 return 0; 1791 } 1792 1793 int log_get_state(struct dm_ulog_request *rq) 1794 { 1795 struct log_c *lc; 1796 1797 lc = get_log(rq->uuid, rq->luid); 1798 if (!lc) 1799 return -EINVAL; 1800 1801 return lc->state; 1802 } 1803 1804 /* 1805 * log_status 1806 * 1807 * Returns: 1 if logs are still present, 0 otherwise 1808 */ 1809 int log_status(void) 1810 { 1811 if (!dm_list_empty(&log_list) || !dm_list_empty(&log_pending_list)) 1812 return 1; 1813 1814 return 0; 1815 } 1816 1817 void log_debug(void) 1818 { 1819 struct log_c *lc; 1820 uint64_t r; 1821 int i; 1822 1823 LOG_ERROR(""); 1824 LOG_ERROR("LOG COMPONENT DEBUGGING::"); 1825 LOG_ERROR("Official log list:"); 1826 LOG_ERROR("Pending log list:"); 1827 dm_list_iterate_items(lc, &log_pending_list) { 1828 LOG_ERROR("%s", lc->uuid); 1829 LOG_ERROR("sync_bits:"); 1830 print_bits((char *)lc->sync_bits, (int)lc->sync_bits[0], 1); 1831 LOG_ERROR("clean_bits:"); 1832 print_bits((char *)lc->clean_bits, (int)lc->sync_bits[0], 1); 1833 } 1834 1835 dm_list_iterate_items(lc, &log_list) { 1836 LOG_ERROR("%s", lc->uuid); 1837 LOG_ERROR(" recoverer : %u", lc->recoverer); 1838 LOG_ERROR(" recovering_region: %llu", 1839 (unsigned long long)lc->recovering_region); 1840 LOG_ERROR(" recovery_halted : %s", (lc->recovery_halted) ? 1841 "YES" : "NO"); 1842 LOG_ERROR("sync_bits:"); 1843 print_bits((char *)lc->sync_bits, (int)lc->sync_bits[0], 1); 1844 LOG_ERROR("clean_bits:"); 1845 print_bits((char *)lc->clean_bits, (int)lc->sync_bits[0], 1); 1846 1847 LOG_ERROR("Validating %s::", SHORT_UUID(lc->uuid)); 1848 r = find_next_zero_bit(lc->sync_bits, 0); 1849 LOG_ERROR(" lc->region_count = %llu", 1850 (unsigned long long)lc->region_count); 1851 LOG_ERROR(" lc->sync_count = %llu", 1852 (unsigned long long)lc->sync_count); 1853 LOG_ERROR(" next zero bit = %llu", 1854 (unsigned long long)r); 1855 if ((r > lc->region_count) || 1856 ((r == lc->region_count) && (lc->sync_count > lc->region_count))) { 1857 LOG_ERROR("ADJUSTING SYNC_COUNT"); 1858 lc->sync_count = lc->region_count; 1859 } 1860 1861 LOG_ERROR("Resync request history:"); 1862 for (i = 0; i < RESYNC_HISTORY; i++) { 1863 lc->idx++; 1864 lc->idx = lc->idx % RESYNC_HISTORY; 1865 if (lc->resync_history[lc->idx][0] == '\0') 1866 continue; 1867 LOG_ERROR("%d:%d) %s", i, lc->idx, 1868 lc->resync_history[lc->idx]); 1869 } 1870 } 1871 } 1872