/*	$NetBSD: functions.c,v 1.3 2010/12/26 14:48:34 christos Exp $	*/

/*
 * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU Lesser General Public License v.2.1.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
#define _GNU_SOURCE
#define _FILE_OFFSET_BITS 64

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <dirent.h>
#include <unistd.h>
#include <signal.h>
#include <linux/kdev_t.h>
//#define __USE_GNU /* for O_DIRECT */
#include <fcntl.h>
#include <time.h>
#include "libdevmapper.h"
#include "dm-log-userspace.h"
#include "functions.h"
#include "common.h"
#include "cluster.h"
#include "logging.h"

#define BYTE_SHIFT 3

/*
 * Magic for persistent mirrors: "MiRr"
 * The following on-disk header information is stolen from
 * drivers/md/dm-log.c
 */
#define MIRROR_MAGIC 0x4D695272
#define MIRROR_DISK_VERSION 2
#define LOG_OFFSET 2

#define RESYNC_HISTORY 50
//static char resync_history[RESYNC_HISTORY][128];
//static int idx = 0;
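/*
 * LOG_SPRINT records a formatted message in the per-log circular
 * buffer 'resync_history' (RESYNC_HISTORY entries); log_debug()
 * replays the buffer, oldest entry first.
 */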
#define LOG_SPRINT(_lc, f, arg...) do {					\
		(_lc)->idx++;						\
		(_lc)->idx = (_lc)->idx % RESYNC_HISTORY;		\
		sprintf((_lc)->resync_history[(_lc)->idx], f, ## arg);	\
	} while (0)

struct log_header {
	uint32_t magic;
	uint32_t version;
	uint64_t nr_regions;
};

struct log_c {
	struct dm_list list;

	char uuid[DM_UUID_LEN];
	uint64_t luid;

	time_t delay; /* limits how fast a resume can happen after suspend */
	int touched;
	uint32_t region_size;
	uint32_t region_count;
	uint64_t sync_count;

	dm_bitset_t clean_bits;
	dm_bitset_t sync_bits;
	uint32_t recoverer;
	uint64_t recovering_region; /* -1 means not recovering */
	uint64_t skip_bit_warning; /* used to warn if region skipped */
	int sync_search;

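	/*
	 * resume_override tracks how much state has been loaded before
	 * the resume completes: 0 = nothing yet; pull_state() adds 1
	 * when sync_bits arrive and 2 when clean_bits arrive (so 3
	 * means both were pre-loaded via checkpoint); clog_resume()
	 * sets it to 1000 once the log is fully resumed.
	 */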
	int resume_override;

	uint32_t block_on_error;
	enum sync {
		DEFAULTSYNC,	/* Synchronize if necessary */
		NOSYNC,		/* Devices known to be already in sync */
		FORCESYNC,	/* Force a sync to happen */
	} sync;

	uint32_t state; /* current operational state of the log */

	struct dm_list mark_list;

	uint32_t recovery_halted;
	struct recovery_request *recovery_request_list;

	int disk_fd;		/* -1 means no disk log */
	int log_dev_failed;
	uint64_t disk_nr_regions;
	size_t disk_size;	/* size of disk_buffer in bytes */
	void *disk_buffer;	/* aligned memory for O_DIRECT */
	int idx;
	char resync_history[RESYNC_HISTORY][128];
};

struct mark_entry {
	struct dm_list list;
	uint32_t nodeid;
	uint64_t region;
};

struct recovery_request {
	uint64_t region;
	struct recovery_request *next;
};

static DM_LIST_INIT(log_list);
static DM_LIST_INIT(log_pending_list);

static int log_test_bit(dm_bitset_t bs, int bit)
{
	return dm_bit(bs, bit);
}

static void log_set_bit(struct log_c *lc, dm_bitset_t bs, int bit)
{
	dm_bit_set(bs, bit);
	lc->touched = 1;
}

static void log_clear_bit(struct log_c *lc, dm_bitset_t bs, int bit)
{
	dm_bit_clear(bs, bit);
	lc->touched = 1;
}
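
/*
 * Note on libdevmapper bitsets: dm_bitset_t is an array of 32-bit
 * words whose first element, bs[0], holds the number of bits; the
 * bitmap data itself starts at bs[1].  That is why sizes below are
 * derived from bs[0] and why push_state()/pull_state() copy from
 * 'bits + 1'.
 */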

static int find_next_zero_bit(dm_bitset_t bs, int start)
{
	while (dm_bit(bs, start++))
		if (start >= (int)bs[0])
			return -1;

	return start - 1;
}

static uint64_t count_bits32(dm_bitset_t bs)
{
	int i, size = ((int)bs[0]/DM_BITS_PER_INT + 1);
	unsigned count = 0;

	for (i = 1; i <= size; i++)
		count += hweight32(bs[i]);

	return (uint64_t)count;
}

/*
 * get_log
 *
 * Returns: log if found, NULL otherwise
 */
static struct log_c *get_log(const char *uuid, uint64_t luid)
{
	struct log_c *lc;

	dm_list_iterate_items(lc, &log_list)
		if (!strcmp(lc->uuid, uuid) &&
		    (!luid || (luid == lc->luid)))
			return lc;

	return NULL;
}

/*
 * get_pending_log
 *
 * Pending logs are logs that have been 'clog_ctr'ed, but
 * have not joined the CPG (via clog_resume).
 *
 * Returns: log if found, NULL otherwise
 */
static struct log_c *get_pending_log(const char *uuid, uint64_t luid)
{
	struct log_c *lc;

	dm_list_iterate_items(lc, &log_pending_list)
		if (!strcmp(lc->uuid, uuid) &&
		    (!luid || (luid == lc->luid)))
			return lc;

	return NULL;
}

static void header_to_disk(struct log_header *mem, struct log_header *disk)
{
	memcpy(disk, mem, sizeof(struct log_header));
}

static void header_from_disk(struct log_header *mem, struct log_header *disk)
{
	memcpy(mem, disk, sizeof(struct log_header));
}

static int rw_log(struct log_c *lc, int do_write)
{
	int r;

	r = lseek(lc->disk_fd, 0, SEEK_SET);
	if (r < 0) {
		LOG_ERROR("[%s] rw_log: lseek failure: %s",
			  SHORT_UUID(lc->uuid), strerror(errno));
		return -errno;
	}

	if (do_write) {
		r = write(lc->disk_fd, lc->disk_buffer, lc->disk_size);
		if (r < 0) {
			LOG_ERROR("[%s] rw_log: write failure: %s",
				  SHORT_UUID(lc->uuid), strerror(errno));
			return -EIO; /* Failed disk write */
		}
		return 0;
	}

	/* Read */
	r = read(lc->disk_fd, lc->disk_buffer, lc->disk_size);
	if (r < 0)
		LOG_ERROR("[%s] rw_log: read failure: %s",
			  SHORT_UUID(lc->uuid), strerror(errno));
	if (r != (int)lc->disk_size)
		return -EIO; /* Failed disk read */
	return 0;
}
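
/*
 * On-disk log layout used by read_log()/write_log(): the struct
 * log_header is written at offset 0 of the log device, and the
 * clean-bits bitmap follows at a fixed 1024-byte offset inside
 * disk_buffer.
 */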

/*
 * read_log
 * @lc
 *
 * Valid return codes:
 *   -EINVAL:  Invalid header, bits not copied
 *   -EIO:  Unable to read disk log
 *   0:  Valid header, disk bits -> lc->clean_bits
 *
 * Returns: 0 on success, -EXXX on failure
 */
static int read_log(struct log_c *lc)
{
	struct log_header lh;
	size_t bitset_size;

	memset(&lh, 0, sizeof(struct log_header));

	if (rw_log(lc, 0))
		return -EIO; /* Failed disk read */

	header_from_disk(&lh, lc->disk_buffer);
	if (lh.magic != MIRROR_MAGIC)
		return -EINVAL;

	lc->disk_nr_regions = lh.nr_regions;

	/* Read disk bits into clean_bits */
	bitset_size = lc->region_count / 8;
	bitset_size += (lc->region_count % 8) ? 1 : 0;
	memcpy(lc->clean_bits, lc->disk_buffer + 1024, bitset_size);

	return 0;
}

/*
 * write_log
 * @lc
 *
 * Returns: 0 on success, -EIO on failure
 */
static int write_log(struct log_c *lc)
{
	struct log_header lh;
	size_t bitset_size;

	lh.magic = MIRROR_MAGIC;
	lh.version = MIRROR_DISK_VERSION;
	lh.nr_regions = lc->region_count;

	header_to_disk(&lh, lc->disk_buffer);

	/* Write disk bits from clean_bits */
	bitset_size = lc->region_count / 8;
	bitset_size += (lc->region_count % 8) ? 1 : 0;
	memcpy(lc->disk_buffer + 1024, lc->clean_bits, bitset_size);

	if (rw_log(lc, 1)) {
		lc->log_dev_failed = 1;
		return -EIO; /* Failed disk write */
	}
	return 0;
}

static int find_disk_path(char *major_minor_str, char *path_rtn, int *unlink_path)
{
	int r;
	DIR *dp;
	struct dirent *dep;
	struct stat statbuf;
	int major, minor;
	mode_t old_umask;

	if (!strstr(major_minor_str, ":")) {
		r = stat(major_minor_str, &statbuf);
		if (r)
			return -errno;
		if (!S_ISBLK(statbuf.st_mode))
			return -EINVAL;
		sprintf(path_rtn, "%s", major_minor_str);
		return 0;
	}

	r = sscanf(major_minor_str, "%d:%d", &major, &minor);
	if (r != 2)
		return -EINVAL;

	LOG_DBG("Checking /dev/mapper for device %d:%d", major, minor);
	/* Check /dev/mapper dir */
	dp = opendir("/dev/mapper");
	if (!dp)
		return -ENOENT;

	while ((dep = readdir(dp)) != NULL) {
		/*
		 * FIXME: This is racy.  By the time the path is used,
		 * it may point to something else.  'fstat' will be
		 * required upon opening to ensure we got what we
		 * wanted.
		 */

		sprintf(path_rtn, "/dev/mapper/%s", dep->d_name);
		stat(path_rtn, &statbuf);
		if (S_ISBLK(statbuf.st_mode) &&
		    (major(statbuf.st_rdev) == major) &&
		    (minor(statbuf.st_rdev) == minor)) {
			LOG_DBG("  %s: YES", dep->d_name);
			closedir(dp);
			return 0;
		} else {
			LOG_DBG("  %s: NO", dep->d_name);
		}
	}

	closedir(dp);

	LOG_DBG("Path not found for %d/%d", major, minor);
	LOG_DBG("Creating /dev/mapper/%d-%d", major, minor);
	sprintf(path_rtn, "/dev/mapper/%d-%d", major, minor);
	old_umask = umask(0);
	r = mknod(path_rtn, S_IFBLK | DM_DEVICE_MODE, MKDEV(major, minor));
	umask(old_umask);

	if (r != -1)
		r = chown(path_rtn, DM_DEVICE_UID, DM_DEVICE_GID);

	/*
	 * If we have to make the path, we unlink it after we open it
	 */
	*unlink_path = 1;

	return r ? -errno : 0;
}

static int _clog_ctr(char *uuid, uint64_t luid,
		     int argc, char **argv, uint64_t device_size)
{
	int i;
	int r = 0;
	char *p;
	uint64_t region_size;
	uint64_t region_count;
	struct log_c *lc = NULL;
	struct log_c *duplicate;
	enum sync sync = DEFAULTSYNC;
	uint32_t block_on_error = 0;

	int disk_log = 0;
	char disk_path[128];
	int unlink_path = 0;
	size_t page_size;
	int pages;

	/* If core log request, then argv[0] will be region_size */
	if (!strtoll(argv[0], &p, 0) || *p) {
		disk_log = 1;

		if ((argc < 2) || (argc > 4)) {
			LOG_ERROR("Too %s arguments to clustered_disk log type",
				  (argc < 3) ? "few" : "many");
			r = -EINVAL;
			goto fail;
		}

		r = find_disk_path(argv[0], disk_path, &unlink_path);
		if (r) {
			LOG_ERROR("Unable to find path to device %s", argv[0]);
			goto fail;
		}
		LOG_DBG("Clustered log disk is %s", disk_path);
	} else {
		disk_log = 0;

		if ((argc < 1) || (argc > 3)) {
			LOG_ERROR("Too %s arguments to clustered_core log type",
				  (argc < 2) ? "few" : "many");
			r = -EINVAL;
			goto fail;
		}
	}

	if (!(region_size = strtoll(argv[disk_log], &p, 0)) || *p) {
		LOG_ERROR("Invalid region_size argument to clustered_%s log type",
			  (disk_log) ? "disk" : "core");
		r = -EINVAL;
		goto fail;
	}

	region_count = device_size / region_size;
	if (device_size % region_size) {
		/*
		 * I can't remember if device_size must be a multiple
		 * of region_size, so check it anyway.
		 */
		region_count++;
	}

	for (i = 0; i < argc; i++) {
		if (!strcmp(argv[i], "sync"))
			sync = FORCESYNC;
		else if (!strcmp(argv[i], "nosync"))
			sync = NOSYNC;
		else if (!strcmp(argv[i], "block_on_error"))
			block_on_error = 1;
	}

	lc = malloc(sizeof(*lc));
	if (!lc) {
		LOG_ERROR("Unable to allocate cluster log context");
		r = -ENOMEM;
		goto fail;
	}
	memset(lc, 0, sizeof(*lc));

	lc->region_size = region_size;
	lc->region_count = region_count;
	lc->sync = sync;
	lc->block_on_error = block_on_error;
	lc->sync_search = 0;
	lc->recovering_region = (uint64_t)-1;
	lc->skip_bit_warning = region_count;
	lc->disk_fd = -1;
	lc->log_dev_failed = 0;
	strncpy(lc->uuid, uuid, DM_UUID_LEN);
	lc->luid = luid;

	if ((duplicate = get_log(lc->uuid, lc->luid)) ||
	    (duplicate = get_pending_log(lc->uuid, lc->luid))) {
		LOG_ERROR("[%s/%llu] Log already exists, unable to create.",
			  SHORT_UUID(lc->uuid), (unsigned long long)lc->luid);
		free(lc);
		return -EINVAL;
	}

	dm_list_init(&lc->mark_list);

	lc->clean_bits = dm_bitset_create(NULL, region_count);
	if (!lc->clean_bits) {
		LOG_ERROR("Unable to allocate clean bitset");
		r = -ENOMEM;
		goto fail;
	}

	lc->sync_bits = dm_bitset_create(NULL, region_count);
	if (!lc->sync_bits) {
		LOG_ERROR("Unable to allocate sync bitset");
		r = -ENOMEM;
		goto fail;
	}
	if (sync == NOSYNC)
		dm_bit_set_all(lc->sync_bits);

	lc->sync_count = (sync == NOSYNC) ? region_count : 0;
	if (disk_log) {
		page_size = sysconf(_SC_PAGESIZE);
		pages = ((int)lc->clean_bits[0])/page_size;
		pages += ((int)lc->clean_bits[0])%page_size ? 1 : 0;
		pages += 1; /* for header */

		r = open(disk_path, O_RDWR | O_DIRECT);
		if (r < 0) {
			LOG_ERROR("Unable to open log device, %s: %s",
				  disk_path, strerror(errno));
			r = -errno; /* stay with the negative errno convention used elsewhere */
			goto fail;
		}
		if (unlink_path)
			unlink(disk_path);

		lc->disk_fd = r;
		lc->disk_size = pages * page_size;

		r = posix_memalign(&(lc->disk_buffer), page_size,
				   lc->disk_size);
		if (r) {
			LOG_ERROR("Unable to allocate memory for disk_buffer");
			r = -r; /* posix_memalign returns a positive error code */
			goto fail;
		}
		memset(lc->disk_buffer, 0, lc->disk_size);
		LOG_DBG("Disk log ready");
	}

	dm_list_add(&log_pending_list, &lc->list);

	return 0;
fail:
	if (lc) {
		if (lc->clean_bits)
			free(lc->clean_bits);
		if (lc->sync_bits)
			free(lc->sync_bits);
		if (lc->disk_buffer)
			free(lc->disk_buffer);
		if (lc->disk_fd >= 0)
			close(lc->disk_fd);
		free(lc);
	}
	return r;
}

/*
 * clog_ctr
 * @rq
 *
 * rq->data should contain a constructor string as follows:
 *	<device_len> <log_type> [disk] <region_size> [[no]sync] [block_on_error]
 * The kernel is responsible for adding the <device_len> argument;
 * without it, we cannot compute the region_count.  (The parsing below
 * takes the device length from the first token.)
 *
 * FIXME: Currently relies on caller to fill in rq->error
 */
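/*
 * Example (values are illustrative only): a request for a disk log on
 * device 253:4 with 1024-sector regions on a 20480-sector mirror
 * would arrive as:  "20480 clustered_disk 253:4 1024"
 */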
static int clog_dtr(struct dm_ulog_request *rq);
static int clog_ctr(struct dm_ulog_request *rq)
{
	int argc, i, r = 0;
	char *p, **argv = NULL;
	char *dev_size_str;
	uint64_t device_size;

	/* Sanity checks */
	if (!rq->data_size) {
		LOG_ERROR("Received constructor request with no data");
		return -EINVAL;
	}

	if (strlen(rq->data) > rq->data_size) {
		LOG_ERROR("Received constructor request with bad data");
		LOG_ERROR("strlen(rq->data)[%d] > rq->data_size[%llu]",
			  (int)strlen(rq->data),
			  (unsigned long long)rq->data_size);
		LOG_ERROR("rq->data = '%s' [%d]",
			  rq->data, (int)strlen(rq->data));
		return -EINVAL;
	}

	/* Split up args */
	for (argc = 0, p = rq->data; (p = strstr(p, " ")); p++, argc++)
		*p = '\0';

	argv = malloc(argc * sizeof(char *));
	if (!argv)
		return -ENOMEM;

	p = dev_size_str = rq->data;
	p += strlen(p) + 1;
	for (i = 0; i < argc; i++, p = p + strlen(p) + 1)
		argv[i] = p;

	if (strcmp(argv[0], "clustered_disk") &&
	    strcmp(argv[0], "clustered_core")) {
		LOG_ERROR("Unsupported userspace log type, \"%s\"", argv[0]);
		free(argv);
		return -EINVAL;
	}

	if (!(device_size = strtoll(dev_size_str, &p, 0)) || *p) {
		LOG_ERROR("Invalid device size argument: %s", dev_size_str);
		free(argv);
		return -EINVAL;
	}

	r = _clog_ctr(rq->uuid, rq->luid, argc - 1, argv + 1, device_size);

	/* We join the CPG when we resume */

	/* No returning data */
	rq->data_size = 0;

	if (r) {
		LOG_ERROR("Failed to create cluster log (%s)", rq->uuid);
		for (i = 0; i < argc; i++)
			LOG_ERROR("argv[%d] = %s", i, argv[i]);
	} else
		LOG_DBG("[%s] Cluster log created",
			SHORT_UUID(rq->uuid));

	free(argv);
	return r;
}

/*
 * clog_dtr
 * @rq
 *
 */
static int clog_dtr(struct dm_ulog_request *rq)
{
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (lc) {
		/*
		 * The log should not be on the official list.  There
		 * should have been a suspend first.
		 */
		LOG_ERROR("[%s] DTR before SUS: leaving CPG",
			  SHORT_UUID(rq->uuid));
		destroy_cluster_cpg(rq->uuid);
	} else if (!(lc = get_pending_log(rq->uuid, rq->luid))) {
		LOG_ERROR("clog_dtr called on log that is not official or pending");
		return -EINVAL;
	}

	LOG_DBG("[%s] Cluster log removed", SHORT_UUID(lc->uuid));

	dm_list_del(&lc->list);
	if (lc->disk_fd != -1)
		close(lc->disk_fd);
	if (lc->disk_buffer)
		free(lc->disk_buffer);
	free(lc->clean_bits);
	free(lc->sync_bits);
	free(lc);

	return 0;
}

/*
 * clog_presuspend
 * @rq
 *
 */
static int clog_presuspend(struct dm_ulog_request *rq)
{
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	if (lc->touched)
		LOG_DBG("WARNING: log still marked as 'touched' during suspend");

	lc->recovery_halted = 1;

	return 0;
}

/*
 * clog_postsuspend
 * @rq
 *
 */
static int clog_postsuspend(struct dm_ulog_request *rq)
{
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	LOG_DBG("[%s] clog_postsuspend: leaving CPG", SHORT_UUID(lc->uuid));
	destroy_cluster_cpg(rq->uuid);

	lc->state = LOG_SUSPENDED;
	lc->recovering_region = (uint64_t)-1;
	lc->recoverer = (uint32_t)-1;
	lc->delay = time(NULL);

	return 0;
}

/*
 * cluster_postsuspend
 * @uuid
 * @luid
 *
 */
int cluster_postsuspend(char *uuid, uint64_t luid)
{
	struct log_c *lc = get_log(uuid, luid);

	if (!lc)
		return -EINVAL;

	LOG_DBG("[%s] clog_postsuspend: finalizing", SHORT_UUID(lc->uuid));
	lc->resume_override = 0;

	/* move log to pending list */
	dm_list_del(&lc->list);
	dm_list_add(&log_pending_list, &lc->list);

	return 0;
}

/*
 * clog_resume
 * @rq
 *
 * Does the main work of resuming.
 */
static int clog_resume(struct dm_ulog_request *rq)
{
	uint32_t i;
	int commit_log = 0;
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	switch (lc->resume_override) {
	case 1000:
		LOG_ERROR("[%s] Additional resume issued before suspend",
			  SHORT_UUID(rq->uuid));
#ifdef DEBUG
		kill(getpid(), SIGUSR1);
#endif
		return 0;
	case 0:
		lc->resume_override = 1000;
		if (lc->disk_fd == -1) {
			LOG_DBG("[%s] Master resume.",
				SHORT_UUID(lc->uuid));
			goto no_disk;
		}

		LOG_DBG("[%s] Master resume: reading disk log",
			SHORT_UUID(lc->uuid));
		commit_log = 1;
		break;
	case 1:
		LOG_ERROR("Error:: partial bit loading (just sync_bits)");
		return -EINVAL;
	case 2:
		LOG_ERROR("Error:: partial bit loading (just clean_bits)");
		return -EINVAL;
	case 3:
		LOG_DBG("[%s] Non-master resume: bits pre-loaded",
			SHORT_UUID(lc->uuid));
		lc->resume_override = 1000;
		goto out;
	default:
		LOG_ERROR("Error:: multiple loading of bits (%d)",
			  lc->resume_override);
		return -EINVAL;
	}

	if (lc->log_dev_failed) {
		LOG_ERROR("Log device has failed, unable to read bits");
		rq->error = 0; /* We can handle this so far */
		lc->disk_nr_regions = 0;
	} else
		rq->error = read_log(lc);

	switch (rq->error) {
	case 0:
		if (lc->disk_nr_regions < lc->region_count)
			LOG_DBG("[%s] Mirror has grown, updating log bits",
				SHORT_UUID(lc->uuid));
		else if (lc->disk_nr_regions > lc->region_count)
			LOG_DBG("[%s] Mirror has shrunk, updating log bits",
				SHORT_UUID(lc->uuid));
		break;
	case -EINVAL:
		LOG_DBG("[%s] (Re)initializing mirror log - resync issued.",
			SHORT_UUID(lc->uuid));
		lc->disk_nr_regions = 0;
		break;
	default:
		LOG_ERROR("Failed to read disk log");
		lc->disk_nr_regions = 0;
		break;
	}

no_disk:
	/* If mirror has grown, set bits appropriately */
	if (lc->sync == NOSYNC)
		for (i = lc->disk_nr_regions; i < lc->region_count; i++)
			log_set_bit(lc, lc->clean_bits, i);
	else
		for (i = lc->disk_nr_regions; i < lc->region_count; i++)
			log_clear_bit(lc, lc->clean_bits, i);

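	/*
	 * The bitsets are stored in 32-bit words and count_bits32()
	 * counts whole words, so stale bits between region_count and
	 * the next word boundary must be cleared or sync_count would
	 * be miscounted after a shrink.
	 */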
	/* Clear any old bits if device has shrunk */
	for (i = lc->region_count; i % 32; i++)
		log_clear_bit(lc, lc->clean_bits, i);

	/* copy clean across to sync */
	dm_bit_copy(lc->sync_bits, lc->clean_bits);

	if (commit_log && (lc->disk_fd >= 0)) {
		rq->error = write_log(lc);
		if (rq->error)
			LOG_ERROR("Failed initial disk log write");
		else
			LOG_DBG("Disk log initialized");
		lc->touched = 0;
	}
out:
	/*
	 * Clear any old bits if device has shrunk - necessary
	 * for non-master resume
	 */
	for (i = lc->region_count; i % 32; i++) {
		log_clear_bit(lc, lc->clean_bits, i);
		log_clear_bit(lc, lc->sync_bits, i);
	}

	lc->sync_count = count_bits32(lc->sync_bits);

	LOG_SPRINT(lc, "[%s] Initial sync_count = %llu",
		   SHORT_UUID(lc->uuid), (unsigned long long)lc->sync_count);
	lc->sync_search = 0;
	lc->state = LOG_RESUMED;
	lc->recovery_halted = 0;

	return rq->error;
}

/*
 * local_resume
 * @rq
 *
 * If the log is pending, we must first join the CPG and
 * put the log in the official list.
 *
 */
int local_resume(struct dm_ulog_request *rq)
{
	int r;
	time_t t;
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc) {
		/* Is the log in the pending list? */
		lc = get_pending_log(rq->uuid, rq->luid);
		if (!lc) {
			LOG_ERROR("clog_resume called on log that is not official or pending");
			return -EINVAL;
		}

		t = time(NULL);
		t -= lc->delay;
		/*
		 * This should be considered a temporary fix.  It addresses
		 * a problem that exists when nodes suspend/resume in rapid
		 * succession.  While the problem is very rare, it has been
		 * seen to happen in real-world-like testing.
		 *
		 * The problem:
		 * - Node A joins cluster
		 * - Node B joins cluster
		 * - Node A prepares checkpoint
		 * - Node A gets ready to write checkpoint
		 * - Node B leaves
		 * - Node B joins
		 * - Node A finishes write of checkpoint
		 * - Node B receives checkpoint meant for previous session
		 * -- Node B can now be non-coherent
		 *
		 * This timer will solve the problem for now, but could be
		 * replaced by a generation number sent with the resume
		 * command from the kernel.  The generation number would
		 * be included in the name of the checkpoint to prevent
		 * reading stale data.
		 */
		if ((t < 3) && (t >= 0))
			sleep(3 - t);

		/* Join the CPG */
		r = create_cluster_cpg(rq->uuid, rq->luid);
		if (r) {
			LOG_ERROR("clog_resume: Failed to create cluster CPG");
			return r;
		}

		/* move log to official list */
		dm_list_del(&lc->list);
		dm_list_add(&log_list, &lc->list);
	}

	return 0;
}

/*
 * clog_get_region_size
 * @rq
 *
 * Since this value doesn't change, the kernel
 * should not need to talk to the server to get it.
 * The function is here for completeness.
 *
 * Returns: 0 on success, -EXXX on failure
 */
static int clog_get_region_size(struct dm_ulog_request *rq)
{
	uint64_t *rtn = (uint64_t *)rq->data;
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc && !(lc = get_pending_log(rq->uuid, rq->luid)))
		return -EINVAL;

	*rtn = lc->region_size;
	rq->data_size = sizeof(*rtn);

	return 0;
}

/*
 * clog_is_clean
 * @rq
 *
 * Returns: 1 if clean, 0 otherwise
 */
static int clog_is_clean(struct dm_ulog_request *rq)
{
	int64_t *rtn = (int64_t *)rq->data;
	uint64_t region = *((uint64_t *)(rq->data));
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	*rtn = log_test_bit(lc->clean_bits, region);
	rq->data_size = sizeof(*rtn);

	return 0;
}

/*
 * clog_in_sync
 * @rq
 *
 * We ignore any request for non-block.  That
 * should be handled elsewhere.  (If the request
 * has come this far, it has already blocked.)
 *
 * Returns: 1 if in-sync, 0 otherwise
 */
static int clog_in_sync(struct dm_ulog_request *rq)
{
	int64_t *rtn = (int64_t *)rq->data;
	uint64_t region = *((uint64_t *)(rq->data));
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	if (region >= lc->region_count) /* regions are numbered 0 .. region_count - 1 */
		return -EINVAL;

	*rtn = log_test_bit(lc->sync_bits, region);
	if (*rtn)
		LOG_DBG("[%s] Region is in-sync: %llu",
			SHORT_UUID(lc->uuid), (unsigned long long)region);
	else
		LOG_DBG("[%s] Region is not in-sync: %llu",
			SHORT_UUID(lc->uuid), (unsigned long long)region);

	rq->data_size = sizeof(*rtn);

	return 0;
}

/*
 * clog_flush
 * @rq
 *
 */
static int clog_flush(struct dm_ulog_request *rq, int server)
{
	int r = 0;
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	if (!lc->touched)
		return 0;

	/*
	 * Do the actual flushing of the log only
	 * if we are the server.
	 */
	if (server && (lc->disk_fd >= 0)) {
		r = rq->error = write_log(lc);
		if (r)
			LOG_ERROR("[%s] Error writing to disk log",
				  SHORT_UUID(lc->uuid));
		else
			LOG_DBG("[%s] Disk log written", SHORT_UUID(lc->uuid));
	}

	lc->touched = 0;

	return r;
}

/*
 * mark_region
 * @lc
 * @region
 * @who
 *
 * Put a mark region request in the list for tracking.
 *
 * Returns: 0 on success, -EXXX on error
 */
static int mark_region(struct log_c *lc, uint64_t region, uint32_t who)
{
	int found = 0;
	struct mark_entry *m;

	dm_list_iterate_items(m, &lc->mark_list)
		if (m->region == region) {
			found = 1;
			if (m->nodeid == who)
				return 0;
		}

	if (!found)
		log_clear_bit(lc, lc->clean_bits, region);

	/*
	 * Save allocation until here - if there is a failure,
	 * at least we have cleared the bit.
	 */
	m = malloc(sizeof(*m));
	if (!m) {
		LOG_ERROR("Unable to allocate space for mark_entry: %llu/%u",
			  (unsigned long long)region, who);
		return -ENOMEM;
	}

	m->nodeid = who;
	m->region = region;
	dm_list_add(&lc->mark_list, &m->list);

	return 0;
}

/*
 * clog_mark_region
 * @rq
 *
 * rq may contain more than one mark request.  We
 * can determine the number from the 'data_size' field.
 *
 * Returns: 0 on success, -EXXX on failure
 */
static int clog_mark_region(struct dm_ulog_request *rq, uint32_t originator)
{
	int r;
	int count;
	uint64_t *region;
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	if (rq->data_size % sizeof(uint64_t)) {
		LOG_ERROR("Bad data size given for mark_region request");
		return -EINVAL;
	}

	count = rq->data_size / sizeof(uint64_t);
	region = (uint64_t *)&rq->data;

	for (; count > 0; count--, region++) {
		r = mark_region(lc, *region, originator);
		if (r)
			return r;
	}

	rq->data_size = 0;

	return 0;
}

static int clear_region(struct log_c *lc, uint64_t region, uint32_t who)
{
	int other_matches = 0;
	struct mark_entry *m, *n;

	dm_list_iterate_items_safe(m, n, &lc->mark_list)
		if (m->region == region) {
			if (m->nodeid == who) {
				dm_list_del(&m->list);
				free(m);
			} else
				other_matches = 1;
		}

	/*
	 * Clear region if:
	 * 1) It is in-sync
	 * 2) There are no other machines that have it marked
	 */
	if (!other_matches && log_test_bit(lc->sync_bits, region))
		log_set_bit(lc, lc->clean_bits, region);

	return 0;
}

/*
 * clog_clear_region
 * @rq
 *
 * rq may contain more than one clear request.  We
 * can determine the number from the 'data_size' field.
 *
 * Returns: 0 on success, -EXXX on failure
 */
static int clog_clear_region(struct dm_ulog_request *rq, uint32_t originator)
{
	int r;
	int count;
	uint64_t *region;
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	if (rq->data_size % sizeof(uint64_t)) {
		LOG_ERROR("Bad data size given for clear_region request");
		return -EINVAL;
	}

	count = rq->data_size / sizeof(uint64_t);
	region = (uint64_t *)&rq->data;

	for (; count > 0; count--, region++) {
		r = clear_region(lc, *region, originator);
		if (r)
			return r;
	}

	rq->data_size = 0;

	return 0;
}

/*
 * clog_get_resync_work
 * @rq
 *
 */
static int clog_get_resync_work(struct dm_ulog_request *rq, uint32_t originator)
{
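	/*
	 * This reply layout must match what the kernel side of
	 * dm-log-userspace reads back for DM_ULOG_GET_RESYNC_WORK:
	 * a 64-bit "work assigned" flag followed by the region number.
	 */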
	struct {
		int64_t i;
		uint64_t r;
	} *pkg = (void *)rq->data;
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	rq->data_size = sizeof(*pkg);
	pkg->i = 0;

	if (lc->sync_search >= lc->region_count) {
		/*
		 * FIXME: handle intermittent errors during recovery
		 * by resetting sync_search... but not too many times.
		 */
		LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
			   "Recovery finished",
			   rq->seq, SHORT_UUID(lc->uuid), originator);
		return 0;
	}

	if (lc->recovering_region != (uint64_t)-1) {
		if (lc->recoverer == originator) {
			LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
				   "Re-requesting work (%llu)",
				   rq->seq, SHORT_UUID(lc->uuid), originator,
				   (unsigned long long)lc->recovering_region);
			pkg->r = lc->recovering_region;
			pkg->i = 1;
			LOG_COND(log_resend_requests, "***** RE-REQUEST *****");
		} else {
			LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
				   "Someone already recovering (%llu)",
				   rq->seq, SHORT_UUID(lc->uuid), originator,
				   (unsigned long long)lc->recovering_region);
		}

		return 0;
	}

	while (lc->recovery_request_list) {
		struct recovery_request *del;

		del = lc->recovery_request_list;
		lc->recovery_request_list = del->next;

		pkg->r = del->region;
		free(del);

		if (!log_test_bit(lc->sync_bits, pkg->r)) {
			LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
				   "Assigning priority resync work (%llu)",
				   rq->seq, SHORT_UUID(lc->uuid), originator,
				   (unsigned long long)pkg->r);
			pkg->i = 1;
			lc->recovering_region = pkg->r;
			lc->recoverer = originator;
			return 0;
		}
	}

	pkg->r = find_next_zero_bit(lc->sync_bits,
				    lc->sync_search);

	if (pkg->r >= lc->region_count) {
		LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
			   "Resync work complete.",
			   rq->seq, SHORT_UUID(lc->uuid), originator);
		return 0;
	}

	lc->sync_search = pkg->r + 1;

	LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
		   "Assigning resync work (%llu)",
		   rq->seq, SHORT_UUID(lc->uuid), originator,
		   (unsigned long long)pkg->r);
	pkg->i = 1;
	lc->recovering_region = pkg->r;
	lc->recoverer = originator;

	return 0;
}

/*
 * clog_set_region_sync
 * @rq
 */
static int clog_set_region_sync(struct dm_ulog_request *rq, uint32_t originator)
{
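	/*
	 * Request layout as sent by the kernel for
	 * DM_ULOG_SET_REGION_SYNC: the region number followed by a
	 * 64-bit in-sync flag.
	 */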
	struct {
		uint64_t region;
		int64_t in_sync;
	} *pkg = (void *)rq->data;
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	lc->recovering_region = (uint64_t)-1;

	if (pkg->in_sync) {
		if (log_test_bit(lc->sync_bits, pkg->region)) {
			LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
				   "Region already set (%llu)",
				   rq->seq, SHORT_UUID(lc->uuid), originator,
				   (unsigned long long)pkg->region);
		} else {
			log_set_bit(lc, lc->sync_bits, pkg->region);
			lc->sync_count++;

			/* The rest of this section is all for debugging */
			LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
				   "Setting region (%llu)",
				   rq->seq, SHORT_UUID(lc->uuid), originator,
				   (unsigned long long)pkg->region);
			if (pkg->region == lc->skip_bit_warning)
				lc->skip_bit_warning = lc->region_count;

			if (pkg->region > (lc->skip_bit_warning + 5)) {
				LOG_ERROR("*** Region #%llu skipped during recovery ***",
					  (unsigned long long)lc->skip_bit_warning);
				lc->skip_bit_warning = lc->region_count;
#ifdef DEBUG
				kill(getpid(), SIGUSR1);
#endif
			}

			if (!log_test_bit(lc->sync_bits,
					  (pkg->region) ? pkg->region - 1 : 0)) {
				LOG_SPRINT(lc, "*** Previous bit not set ***");
				lc->skip_bit_warning = (pkg->region) ?
					pkg->region - 1 : 0;
			}
		}
	} else if (log_test_bit(lc->sync_bits, pkg->region)) {
		lc->sync_count--;
		log_clear_bit(lc, lc->sync_bits, pkg->region);
		LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
			   "Unsetting region (%llu)",
			   rq->seq, SHORT_UUID(lc->uuid), originator,
			   (unsigned long long)pkg->region);
	}

	if (lc->sync_count != count_bits32(lc->sync_bits)) {
		unsigned long long reset = count_bits32(lc->sync_bits);

		LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
			   "sync_count(%llu) != bitmap count(%llu)",
			   rq->seq, SHORT_UUID(lc->uuid), originator,
			   (unsigned long long)lc->sync_count, reset);
#ifdef DEBUG
		kill(getpid(), SIGUSR1);
#endif
		lc->sync_count = reset;
	}

	if (lc->sync_count > lc->region_count)
		LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
			   "(lc->sync_count > lc->region_count) - this is bad",
			   rq->seq, SHORT_UUID(lc->uuid), originator);

	rq->data_size = 0;
	return 0;
}

/*
 * clog_get_sync_count
 * @rq
 */
static int clog_get_sync_count(struct dm_ulog_request *rq, uint32_t originator)
{
	uint64_t *sync_count = (uint64_t *)rq->data;
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	/*
	 * FIXME: Mirror requires us to be able to ask for
	 * the sync count while pending... but I don't like
	 * it because other machines may not be suspended and
	 * the stored value may not be accurate.
	 */
	if (!lc)
		lc = get_pending_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	*sync_count = lc->sync_count;

	rq->data_size = sizeof(*sync_count);

	if (lc->sync_count != count_bits32(lc->sync_bits)) {
		unsigned long long reset = count_bits32(lc->sync_bits);

		LOG_SPRINT(lc, "get_sync_count - SEQ#=%u, UUID=%s, nodeid = %u:: "
			   "sync_count(%llu) != bitmap count(%llu)",
			   rq->seq, SHORT_UUID(lc->uuid), originator,
			   (unsigned long long)lc->sync_count, reset);
#ifdef DEBUG
		kill(getpid(), SIGUSR1);
#endif
		lc->sync_count = reset;
	}

	return 0;
}

static int core_status_info(struct log_c *lc, struct dm_ulog_request *rq)
{
	char *data = (char *)rq->data;

	rq->data_size = sprintf(data, "1 clustered_core");

	return 0;
}

static int disk_status_info(struct log_c *lc, struct dm_ulog_request *rq)
{
	char *data = (char *)rq->data;
	struct stat statbuf;

	if (fstat(lc->disk_fd, &statbuf)) {
		rq->error = -errno;
		return -errno;
	}

	rq->data_size = sprintf(data, "3 clustered_disk %d:%d %c",
				major(statbuf.st_rdev), minor(statbuf.st_rdev),
				(lc->log_dev_failed) ? 'D' : 'A');

	return 0;
}

/*
 * clog_status_info
 * @rq
 *
 */
static int clog_status_info(struct dm_ulog_request *rq)
{
	int r;
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc)
		lc = get_pending_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	if (lc->disk_fd == -1)
		r = core_status_info(lc, rq);
	else
		r = disk_status_info(lc, rq);

	return r;
}

static int core_status_table(struct log_c *lc, struct dm_ulog_request *rq)
{
	char *data = (char *)rq->data;

	rq->data_size = sprintf(data, "clustered_core %u %s%s ",
				lc->region_size,
				(lc->sync == DEFAULTSYNC) ? "" :
				(lc->sync == NOSYNC) ? "nosync " : "sync ",
				(lc->block_on_error) ? "block_on_error" : "");
	return 0;
}

static int disk_status_table(struct log_c *lc, struct dm_ulog_request *rq)
{
	char *data = (char *)rq->data;
	struct stat statbuf;

	if (fstat(lc->disk_fd, &statbuf)) {
		rq->error = -errno;
		return -errno;
	}

	rq->data_size = sprintf(data, "clustered_disk %d:%d %u %s%s ",
				major(statbuf.st_rdev), minor(statbuf.st_rdev),
				lc->region_size,
				(lc->sync == DEFAULTSYNC) ? "" :
				(lc->sync == NOSYNC) ? "nosync " : "sync ",
				(lc->block_on_error) ? "block_on_error" : "");
	return 0;
}

/*
 * clog_status_table
 * @rq
 *
 */
static int clog_status_table(struct dm_ulog_request *rq)
{
	int r;
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc)
		lc = get_pending_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	if (lc->disk_fd == -1)
		r = core_status_table(lc, rq);
	else
		r = disk_status_table(lc, rq);

	return r;
}

/*
 * clog_is_remote_recovering
 * @rq
 *
 */
static int clog_is_remote_recovering(struct dm_ulog_request *rq)
{
	uint64_t region = *((uint64_t *)(rq->data));
	struct {
		int64_t is_recovering;
		uint64_t in_sync_hint;
	} *pkg = (void *)rq->data;
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	if (region >= lc->region_count)
		return -EINVAL;

	if (lc->recovery_halted) {
		LOG_DBG("[%s] Recovery halted... [not remote recovering]: %llu",
			SHORT_UUID(lc->uuid), (unsigned long long)region);
		pkg->is_recovering = 0;
		pkg->in_sync_hint = lc->region_count; /* none are recovering */
	} else {
		pkg->is_recovering = !log_test_bit(lc->sync_bits, region);

		/*
		 * Remember, 'lc->sync_search' is 1 plus the region
		 * currently being recovered.  So, we must take off 1
		 * to account for that; but only if 'sync_search > 1'.
		 */
		pkg->in_sync_hint = lc->sync_search ? (lc->sync_search - 1) : 0;
		LOG_DBG("[%s] Region is %s: %llu",
			SHORT_UUID(lc->uuid),
			(region == lc->recovering_region) ?
			"currently remote recovering" :
			(pkg->is_recovering) ? "pending remote recovery" :
			"not remote recovering", (unsigned long long)region);
	}

	if (pkg->is_recovering &&
	    (region != lc->recovering_region)) {
		struct recovery_request *rr;

		/* Already in the list? */
		for (rr = lc->recovery_request_list; rr; rr = rr->next)
			if (rr->region == region)
				goto out;

		/* Failure to allocate simply means we can't prioritize it */
		rr = malloc(sizeof(*rr));
		if (!rr)
			goto out;

		LOG_DBG("[%s] Adding region to priority list: %llu",
			SHORT_UUID(lc->uuid), (unsigned long long)region);
		rr->region = region;
		rr->next = lc->recovery_request_list;
		lc->recovery_request_list = rr;
	}

out:
	rq->data_size = sizeof(*pkg);

	return 0;
}

/*
 * do_request
 * @rq: the request
 * @server: is this request performed by the server
 *
 * An inability to process the request at all (e.g. an unknown request
 * type) is reported through 'rq->error', as is a request that was
 * processed but failed.  This function itself always returns 0.
 *
 * Returns: 0
 */
int do_request(struct clog_request *rq, int server)
{
	int r;

	if (!rq)
		return 0;

	if (rq->u_rq.error)
		LOG_DBG("Programmer error: rq struct has error set");

	switch (rq->u_rq.request_type) {
	case DM_ULOG_CTR:
		r = clog_ctr(&rq->u_rq);
		break;
	case DM_ULOG_DTR:
		r = clog_dtr(&rq->u_rq);
		break;
	case DM_ULOG_PRESUSPEND:
		r = clog_presuspend(&rq->u_rq);
		break;
	case DM_ULOG_POSTSUSPEND:
		r = clog_postsuspend(&rq->u_rq);
		break;
	case DM_ULOG_RESUME:
		r = clog_resume(&rq->u_rq);
		break;
	case DM_ULOG_GET_REGION_SIZE:
		r = clog_get_region_size(&rq->u_rq);
		break;
	case DM_ULOG_IS_CLEAN:
		r = clog_is_clean(&rq->u_rq);
		break;
	case DM_ULOG_IN_SYNC:
		r = clog_in_sync(&rq->u_rq);
		break;
	case DM_ULOG_FLUSH:
		r = clog_flush(&rq->u_rq, server);
		break;
	case DM_ULOG_MARK_REGION:
		r = clog_mark_region(&rq->u_rq, rq->originator);
		break;
	case DM_ULOG_CLEAR_REGION:
		r = clog_clear_region(&rq->u_rq, rq->originator);
		break;
	case DM_ULOG_GET_RESYNC_WORK:
		r = clog_get_resync_work(&rq->u_rq, rq->originator);
		break;
	case DM_ULOG_SET_REGION_SYNC:
		r = clog_set_region_sync(&rq->u_rq, rq->originator);
		break;
	case DM_ULOG_GET_SYNC_COUNT:
		r = clog_get_sync_count(&rq->u_rq, rq->originator);
		break;
	case DM_ULOG_STATUS_INFO:
		r = clog_status_info(&rq->u_rq);
		break;
	case DM_ULOG_STATUS_TABLE:
		r = clog_status_table(&rq->u_rq);
		break;
	case DM_ULOG_IS_REMOTE_RECOVERING:
		r = clog_is_remote_recovering(&rq->u_rq);
		break;
	default:
		LOG_ERROR("Unknown request");
		r = rq->u_rq.error = -EINVAL;
		break;
	}

	if (r && !rq->u_rq.error)
		rq->u_rq.error = r;
	else if (r != rq->u_rq.error)
		LOG_DBG("Warning: error from function != rq->u_rq.error");

	if (rq->u_rq.error && rq->u_rq.data_size) {
		/* Make sure I'm handling errors correctly above */
		LOG_DBG("Programmer error: rq->u_rq.error && rq->u_rq.data_size");
		rq->u_rq.data_size = 0;
	}

	return 0;
}

static void print_bits(char *buf, int size, int print)
{
	int i;
	char outbuf[128];

	memset(outbuf, 0, sizeof(outbuf));

	for (i = 0; i < size; i++) {
		if (!(i % 16)) {
			if (outbuf[0] != '\0') {
				if (print)
					LOG_PRINT("%s", outbuf);
				else
					LOG_DBG("%s", outbuf);
			}
			memset(outbuf, 0, sizeof(outbuf));
			sprintf(outbuf, "[%3d - %3d]", i, i+15);
		}
		sprintf(outbuf + strlen(outbuf), " %.2X", (unsigned char)buf[i]);
	}
	if (outbuf[0] != '\0') {
		if (print)
			LOG_PRINT("%s", outbuf);
		else
			LOG_DBG("%s", outbuf);
	}
}

/* int store_bits(const char *uuid, const char *which, char **buf)*/
int push_state(const char *uuid, uint64_t luid,
	       const char *which, char **buf, uint32_t debug_who)
{
	int bitset_size;
	struct log_c *lc;

	if (*buf)
		LOG_ERROR("push_state: *buf != NULL");

	lc = get_log(uuid, luid);
	if (!lc) {
		LOG_ERROR("push_state: No log found for %s", uuid);
		return -EINVAL;
	}

	if (!strcmp(which, "recovering_region")) {
		*buf = malloc(64); /* easily handles the 2 written numbers */
		if (!*buf)
			return -ENOMEM;
		sprintf(*buf, "%llu %u", (unsigned long long)lc->recovering_region,
			lc->recoverer);

		LOG_SPRINT(lc, "CKPT SEND - SEQ#=X, UUID=%s, nodeid = %u:: "
			   "recovering_region=%llu, recoverer=%u, sync_count=%llu",
			   SHORT_UUID(lc->uuid), debug_who,
			   (unsigned long long)lc->recovering_region,
			   lc->recoverer,
			   (unsigned long long)count_bits32(lc->sync_bits));
		return 64;
	}

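	/*
	 * Bitmap checkpoints carry the raw bitmap words only - the
	 * leading count word (bits[0]) is skipped on both ends, which
	 * is why the copies below use 'bits + 1' and why pull_state()
	 * insists the sizes match exactly.
	 */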
	/* Size in 'int's */
	bitset_size = ((int)lc->clean_bits[0]/DM_BITS_PER_INT) + 1;

	/* Size in bytes */
	bitset_size *= 4;

	*buf = malloc(bitset_size);

	if (!*buf) {
		LOG_ERROR("push_state: Unable to allocate memory");
		return -ENOMEM;
	}

	if (!strncmp(which, "sync_bits", 9)) {
		memcpy(*buf, lc->sync_bits + 1, bitset_size);
		LOG_DBG("[%s] storing sync_bits (sync_count = %llu):",
			SHORT_UUID(uuid), (unsigned long long)
			count_bits32(lc->sync_bits));
		print_bits(*buf, bitset_size, 0);
	} else if (!strncmp(which, "clean_bits", 10)) {
		memcpy(*buf, lc->clean_bits + 1, bitset_size);
		LOG_DBG("[%s] storing clean_bits:", SHORT_UUID(lc->uuid));
		print_bits(*buf, bitset_size, 0);
	}

	return bitset_size;
}

/*int load_bits(const char *uuid, const char *which, char *buf, int size)*/
int pull_state(const char *uuid, uint64_t luid,
	       const char *which, char *buf, int size)
{
	int bitset_size;
	struct log_c *lc;

	if (!buf)
		LOG_ERROR("pull_state: buf == NULL");

	lc = get_log(uuid, luid);
	if (!lc) {
		LOG_ERROR("pull_state: No log found for %s", uuid);
		return -EINVAL;
	}

	if (!strncmp(which, "recovering_region", 17)) {
		sscanf(buf, "%llu %u", (unsigned long long *)&lc->recovering_region,
		       &lc->recoverer);
		LOG_SPRINT(lc, "CKPT INIT - SEQ#=X, UUID=%s, nodeid = X:: "
			   "recovering_region=%llu, recoverer=%u",
			   SHORT_UUID(lc->uuid),
			   (unsigned long long)lc->recovering_region, lc->recoverer);
		return 0;
	}

	/* Size in 'int's */
	bitset_size = ((int)lc->clean_bits[0]/DM_BITS_PER_INT) + 1;

	/* Size in bytes */
	bitset_size *= 4;

	if (bitset_size != size) {
		LOG_ERROR("pull_state(%s): bad bitset_size (%d vs %d)",
			  which, size, bitset_size);
		return -EINVAL;
	}

	if (!strncmp(which, "sync_bits", 9)) {
		lc->resume_override += 1;
		memcpy(lc->sync_bits + 1, buf, bitset_size);
		LOG_DBG("[%s] loading sync_bits (sync_count = %llu):",
			SHORT_UUID(lc->uuid), (unsigned long long)
			count_bits32(lc->sync_bits));
		print_bits((char *)lc->sync_bits, bitset_size, 0);
	} else if (!strncmp(which, "clean_bits", 10)) {
		lc->resume_override += 2;
		memcpy(lc->clean_bits + 1, buf, bitset_size);
		LOG_DBG("[%s] loading clean_bits:", SHORT_UUID(lc->uuid));
		print_bits((char *)lc->clean_bits, bitset_size, 0);
	}

	return 0;
}

int log_get_state(struct dm_ulog_request *rq)
{
	struct log_c *lc;

	lc = get_log(rq->uuid, rq->luid);
	if (!lc)
		return -EINVAL;

	return lc->state;
}

/*
 * log_status
 *
 * Returns: 1 if logs are still present, 0 otherwise
 */
int log_status(void)
{
	if (!dm_list_empty(&log_list) || !dm_list_empty(&log_pending_list))
		return 1;

	return 0;
}

void log_debug(void)
{
	struct log_c *lc;
	uint64_t r;
	int i;

	LOG_ERROR("");
	LOG_ERROR("LOG COMPONENT DEBUGGING::");
	LOG_ERROR("Pending log list:");
	dm_list_iterate_items(lc, &log_pending_list) {
		LOG_ERROR("%s", lc->uuid);
		LOG_ERROR("sync_bits:");
		/* print_bits() takes a size in bytes */
		print_bits((char *)(lc->sync_bits + 1),
			   ((int)lc->sync_bits[0] + 7) / 8, 1);
		LOG_ERROR("clean_bits:");
		print_bits((char *)(lc->clean_bits + 1),
			   ((int)lc->clean_bits[0] + 7) / 8, 1);
	}

	LOG_ERROR("Official log list:");
	dm_list_iterate_items(lc, &log_list) {
		LOG_ERROR("%s", lc->uuid);
		LOG_ERROR("  recoverer        : %u", lc->recoverer);
		LOG_ERROR("  recovering_region: %llu",
			  (unsigned long long)lc->recovering_region);
		LOG_ERROR("  recovery_halted  : %s", (lc->recovery_halted) ?
			  "YES" : "NO");
		LOG_ERROR("sync_bits:");
		print_bits((char *)(lc->sync_bits + 1),
			   ((int)lc->sync_bits[0] + 7) / 8, 1);
		LOG_ERROR("clean_bits:");
		print_bits((char *)(lc->clean_bits + 1),
			   ((int)lc->clean_bits[0] + 7) / 8, 1);

		LOG_ERROR("Validating %s::", SHORT_UUID(lc->uuid));
		r = find_next_zero_bit(lc->sync_bits, 0);
		LOG_ERROR("  lc->region_count = %llu",
			  (unsigned long long)lc->region_count);
		LOG_ERROR("  lc->sync_count = %llu",
			  (unsigned long long)lc->sync_count);
		LOG_ERROR("  next zero bit = %llu",
			  (unsigned long long)r);
		if ((r > lc->region_count) ||
		    ((r == lc->region_count) && (lc->sync_count > lc->region_count))) {
			LOG_ERROR("ADJUSTING SYNC_COUNT");
			lc->sync_count = lc->region_count;
		}

		LOG_ERROR("Resync request history:");
		for (i = 0; i < RESYNC_HISTORY; i++) {
			lc->idx++;
			lc->idx = lc->idx % RESYNC_HISTORY;
			if (lc->resync_history[lc->idx][0] == '\0')
				continue;
			LOG_ERROR("%d:%d) %s", i, lc->idx,
				  lc->resync_history[lc->idx]);
		}
	}
}