xref: /netbsd-src/external/gpl2/lvm2/dist/daemons/cmirrord/functions.c (revision 017d3467b169e862a7194e0556d75844a27c1282)
1 /*	$NetBSD: functions.c,v 1.3 2010/12/26 14:48:34 christos Exp $	*/
2 
3 /*
4  * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved.
5  *
6  * This copyrighted material is made available to anyone wishing to use,
7  * modify, copy, or redistribute it subject to the terms and conditions
8  * of the GNU Lesser General Public License v.2.1.
9  *
10  * You should have received a copy of the GNU Lesser General Public License
11  * along with this program; if not, write to the Free Software Foundation,
12  * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
13  */
14 #define _GNU_SOURCE
15 #define _FILE_OFFSET_BITS 64
16 
17 #include <stdint.h>
18 #include <errno.h>
19 #include <string.h>
20 #include <sys/types.h>
21 #include <sys/stat.h>
22 #include <dirent.h>
23 #include <unistd.h>
24 #include <signal.h>
25 #include <linux/kdev_t.h>
26 //#define __USE_GNU /* for O_DIRECT */
27 #include <fcntl.h>
28 #include <time.h>
29 #include "libdevmapper.h"
30 #include "dm-log-userspace.h"
31 #include "functions.h"
32 #include "common.h"
33 #include "cluster.h"
34 #include "logging.h"
35 
36 #define BYTE_SHIFT 3
37 
38 /*
39  * Magic for persistent mirrors: "MiRr"
40  * Following on-disk header information is stolen from
41  * drivers/md/dm-log.c
42  */
43 #define MIRROR_MAGIC 0x4D695272
44 #define MIRROR_DISK_VERSION 2
45 #define LOG_OFFSET 2
46 
47 #define RESYNC_HISTORY 50
48 //static char resync_history[RESYNC_HISTORY][128];
49 //static int idx = 0;
/*
 * LOG_SPRINT
 *
 * Record a printf-style message in the log context's circular
 * resync-history buffer (RESYNC_HISTORY slots of 128 bytes each),
 * kept for post-mortem inspection of recovery decisions.
 *
 * Fixes: the old macro ignored its _lc parameter and hard-coded
 * 'lc' (only worked when the caller's variable was named lc), and
 * used an unbounded sprintf into the 128-byte slot.
 */
#define LOG_SPRINT(_lc, f, arg...) do {					\
		(_lc)->idx++;						\
		(_lc)->idx = (_lc)->idx % RESYNC_HISTORY;		\
		snprintf((_lc)->resync_history[(_lc)->idx],		\
			 sizeof((_lc)->resync_history[(_lc)->idx]),	\
			 f, ## arg);					\
	} while (0)
55 
/*
 * On-disk log header.  Layout is copied from drivers/md/dm-log.c
 * (see comment above) and must stay binary-compatible with it.
 */
struct log_header {
        uint32_t magic;        /* MIRROR_MAGIC on a valid log */
        uint32_t version;      /* MIRROR_DISK_VERSION */
        uint64_t nr_regions;   /* regions covered by the on-disk bitmap */
};
61 
/*
 * Per-mirror cluster log context.  Lives on either log_list (active,
 * joined the CPG) or log_pending_list (constructed, not yet resumed).
 */
struct log_c {
	struct dm_list list;        /* linkage on log_list/log_pending_list */

	char uuid[DM_UUID_LEN];     /* identifies the log to the kernel */
	uint64_t luid;              /* local unique id (0 = wildcard in lookups) */

	time_t delay; /* limits how fast a resume can happen after suspend */
	int touched;                /* set when bits change; cleared on flush */
	uint32_t region_size;       /* mirror region size (sectors) */
	uint32_t region_count;      /* number of regions in the mirror */
	uint64_t sync_count;        /* number of set bits in sync_bits */

	dm_bitset_t clean_bits;     /* region clean/dirty state */
	dm_bitset_t sync_bits;      /* region in-sync state */
	uint32_t recoverer;         /* nodeid doing recovery; -1 when none */
	uint64_t recovering_region; /* -1 means not recovering */
	uint64_t skip_bit_warning; /* used to warn if region skipped */
	int sync_search;            /* next bit offset for resync scanning */

	int resume_override;        /* bit-load state machine (see clog_resume) */

	uint32_t block_on_error;    /* 'block_on_error' ctr flag */
        enum sync {
                DEFAULTSYNC,    /* Synchronize if necessary */
                NOSYNC,         /* Devices known to be already in sync */
                FORCESYNC,      /* Force a sync to happen */
        } sync;

	uint32_t state;         /* current operational state of the log */

	struct dm_list mark_list;   /* outstanding mark_region requests */

	uint32_t recovery_halted;   /* set during presuspend */
	struct recovery_request *recovery_request_list; /* priority resync work */

	int disk_fd;            /* -1 means no disk log */
	int log_dev_failed;     /* set when a disk log write fails */
	uint64_t disk_nr_regions; /* nr_regions read from the on-disk header */
	size_t disk_size;       /* size of disk_buffer in bytes */
	void *disk_buffer;      /* aligned memory for O_DIRECT */
	int idx;                /* circular index into resync_history (LOG_SPRINT) */
	char resync_history[RESYNC_HISTORY][128]; /* recovery decision trace */
};
105 
/* One outstanding mark_region request: region 'region' is marked by
 * cluster node 'nodeid'.  Kept on log_c.mark_list. */
struct mark_entry {
	struct dm_list list;
	uint32_t nodeid;
	uint64_t region;
};
111 
/* Singly-linked list node for regions needing priority resync
 * (consumed by clog_get_resync_work). */
struct recovery_request {
	uint64_t region;
	struct recovery_request *next;
};
116 
/* Active logs (joined the CPG) vs. constructed-but-not-yet-resumed logs. */
static DM_LIST_INIT(log_list);
static DM_LIST_INIT(log_pending_list);
119 
/* Return non-zero when bit 'bit' is set in bitset 'bs'. */
static int log_test_bit(dm_bitset_t bs, int bit)
{
	int is_set = dm_bit(bs, bit);

	return is_set;
}
124 
/* Set 'bit' in 'bs' and flag the log context as needing a flush. */
static void log_set_bit(struct log_c *lc, dm_bitset_t bs, int bit)
{
	dm_bit_set(bs, bit);
	lc->touched = 1;	/* remember there is unflushed state */
}
130 
/* Clear 'bit' in 'bs' and flag the log context as needing a flush. */
static void log_clear_bit(struct log_c *lc, dm_bitset_t bs, int bit)
{
	dm_bit_clear(bs, bit);
	lc->touched = 1;	/* remember there is unflushed state */
}
136 
/*
 * find_next_zero_bit
 * @bs:    bitset to scan (bs[0] holds the number of valid bits)
 * @start: first bit position to examine
 *
 * Scan forward from 'start' for the first clear bit.
 *
 * Returns: bit index on success, -1 when no clear bit remains
 */
static int find_next_zero_bit(dm_bitset_t bs, int start)
{
	/*
	 * Guard: the old code dereferenced dm_bit(bs, start) before
	 * range-checking, reading past the bitset when start >= bs[0].
	 */
	if (start >= (int)bs[0])
		return -1;

	while (dm_bit(bs, start++))
		if (start >= (int)bs[0])
			return -1;

	return start - 1;
}
145 
/*
 * count_bits32
 * @bs: bitset (bs[0] holds the bit count; words follow from bs[1])
 *
 * Returns: total number of set bits in the bitset
 */
static uint64_t count_bits32(dm_bitset_t bs)
{
	unsigned total = 0;
	int word, nr_words = ((int)bs[0]/DM_BITS_PER_INT + 1);

	for (word = 1; word <= nr_words; word++)
		total += hweight32(bs[word]);

	return (uint64_t)total;
}
156 
157 /*
158  * get_log
159  *
160  * Returns: log if found, NULL otherwise
161  */
get_log(const char * uuid,uint64_t luid)162 static struct log_c *get_log(const char *uuid, uint64_t luid)
163 {
164 	struct log_c *lc;
165 
166 	dm_list_iterate_items(lc, &log_list)
167 		if (!strcmp(lc->uuid, uuid) &&
168 		    (!luid || (luid == lc->luid)))
169 			return lc;
170 
171 	return NULL;
172 }
173 
174 /*
175  * get_pending_log
176  *
177  * Pending logs are logs that have been 'clog_ctr'ed, but
178  * have not joined the CPG (via clog_resume).
179  *
180  * Returns: log if found, NULL otherwise
181  */
get_pending_log(const char * uuid,uint64_t luid)182 static struct log_c *get_pending_log(const char *uuid, uint64_t luid)
183 {
184 	struct log_c *lc;
185 
186 	dm_list_iterate_items(lc, &log_pending_list)
187 		if (!strcmp(lc->uuid, uuid) &&
188 		    (!luid || (luid == lc->luid)))
189 			return lc;
190 
191 	return NULL;
192 }
193 
header_to_disk(struct log_header * mem,struct log_header * disk)194 static void header_to_disk(struct log_header *mem, struct log_header *disk)
195 {
196 	memcpy(disk, mem, sizeof(struct log_header));
197 }
198 
header_from_disk(struct log_header * mem,struct log_header * disk)199 static void header_from_disk(struct log_header *mem, struct log_header *disk)
200 {
201 	memcpy(mem, disk, sizeof(struct log_header));
202 }
203 
rw_log(struct log_c * lc,int do_write)204 static int rw_log(struct log_c *lc, int do_write)
205 {
206 	int r;
207 
208 	r = lseek(lc->disk_fd, 0, SEEK_SET);
209 	if (r < 0) {
210 		LOG_ERROR("[%s] rw_log:  lseek failure: %s",
211 			  SHORT_UUID(lc->uuid), strerror(errno));
212 		return -errno;
213 	}
214 
215 	if (do_write) {
216 		r = write(lc->disk_fd, lc->disk_buffer, lc->disk_size);
217 		if (r < 0) {
218 			LOG_ERROR("[%s] rw_log:  write failure: %s",
219 				  SHORT_UUID(lc->uuid), strerror(errno));
220 			return -EIO; /* Failed disk write */
221 		}
222 		return 0;
223 	}
224 
225 	/* Read */
226 	r = read(lc->disk_fd, lc->disk_buffer, lc->disk_size);
227 	if (r < 0)
228 		LOG_ERROR("[%s] rw_log:  read failure: %s",
229 			  SHORT_UUID(lc->uuid), strerror(errno));
230 	if (r != lc->disk_size)
231 		return -EIO; /* Failed disk read */
232 	return 0;
233 }
234 
/*
 * read_log
 * @lc
 *
 * Read the on-disk log image into lc->disk_buffer and, when the
 * header is valid, copy the bitmap into lc->clean_bits.
 *
 * Valid return codes:
 *   -EINVAL:  Invalid header, bits not copied
 *   -EIO:     Unable to read disk log
 *    0:       Valid header, disk bit -> lc->clean_bits
 *
 * Returns: 0 on success, -EXXX on failure
 */
static int read_log(struct log_c *lc)
{
	struct log_header lh;
	size_t bitset_size;

	memset(&lh, 0, sizeof(struct log_header));

	if (rw_log(lc, 0))
		return -EIO; /* Failed disk read */

	header_from_disk(&lh, lc->disk_buffer);
	if (lh.magic != MIRROR_MAGIC)
		return -EINVAL;

	lc->disk_nr_regions = lh.nr_regions;

	/* Read disk bits into clean_bits: one bit per region, rounded
	 * up to whole bytes; the bitmap lives at offset 1024 on disk.
	 * (The old comment wrongly said sync_bits.) */
	bitset_size = lc->region_count / 8;
	bitset_size += (lc->region_count % 8) ? 1 : 0;
	/* NOTE(review): the destination is lc->clean_bits itself, which
	 * for a dm_bitset_t starts with the word holding the bit count;
	 * this mirrors write_log, but confirm against libdevmapper's
	 * bitset layout and the kernel's on-disk format. */
	memcpy(lc->clean_bits, lc->disk_buffer + 1024, bitset_size);

	return 0;
}
269 
/*
 * write_log
 * @lc
 *
 * Build the header and copy the clean_bits bitmap into the disk
 * buffer, then write the whole image out.  On failure the log device
 * is marked failed so later operations can degrade gracefully.
 *
 * Returns: 0 on success, -EIO on failure
 */
static int write_log(struct log_c *lc)
{
	struct log_header lh;
	size_t bitset_size;

	lh.magic = MIRROR_MAGIC;
	lh.version = MIRROR_DISK_VERSION;
	lh.nr_regions = lc->region_count;

	header_to_disk(&lh, lc->disk_buffer);

	/* Write disk bits from clean_bits: one bit per region, rounded
	 * up to whole bytes; the bitmap lives at offset 1024 on disk */
	bitset_size = lc->region_count / 8;
	bitset_size += (lc->region_count % 8) ? 1 : 0;
	/* NOTE(review): the source is lc->clean_bits itself, which for a
	 * dm_bitset_t starts with the word holding the bit count; this
	 * mirrors read_log, but confirm against libdevmapper's bitset
	 * layout and the kernel's on-disk format. */
	memcpy(lc->disk_buffer + 1024, lc->clean_bits, bitset_size);

	if (rw_log(lc, 1)) {
		lc->log_dev_failed = 1;	/* remembered by clog_resume et al. */
		return -EIO; /* Failed disk write */
	}
	return 0;
}
298 
find_disk_path(char * major_minor_str,char * path_rtn,int * unlink_path)299 static int find_disk_path(char *major_minor_str, char *path_rtn, int *unlink_path)
300 {
301 	int r;
302 	DIR *dp;
303 	struct dirent *dep;
304 	struct stat statbuf;
305 	int major, minor;
306 	mode_t old_umask;
307 
308 	if (!strstr(major_minor_str, ":")) {
309 		r = stat(major_minor_str, &statbuf);
310 		if (r)
311 			return -errno;
312 		if (!S_ISBLK(statbuf.st_mode))
313 			return -EINVAL;
314 		sprintf(path_rtn, "%s", major_minor_str);
315 		return 0;
316 	}
317 
318 	r = sscanf(major_minor_str, "%d:%d", &major, &minor);
319 	if (r != 2)
320 		return -EINVAL;
321 
322 	LOG_DBG("Checking /dev/mapper for device %d:%d", major, minor);
323 	/* Check /dev/mapper dir */
324 	dp = opendir("/dev/mapper");
325 	if (!dp)
326 		return -ENOENT;
327 
328 	while ((dep = readdir(dp)) != NULL) {
329 		/*
330 		 * FIXME: This is racy.  By the time the path is used,
331 		 * it may point to something else.  'fstat' will be
332 		 * required upon opening to ensure we got what we
333 		 * wanted.
334 		 */
335 
336 		sprintf(path_rtn, "/dev/mapper/%s", dep->d_name);
337 		stat(path_rtn, &statbuf);
338 		if (S_ISBLK(statbuf.st_mode) &&
339 		    (major(statbuf.st_rdev) == major) &&
340 		    (minor(statbuf.st_rdev) == minor)) {
341 			LOG_DBG("  %s: YES", dep->d_name);
342 			closedir(dp);
343 			return 0;
344 		} else {
345 			LOG_DBG("  %s: NO", dep->d_name);
346 		}
347 	}
348 
349 	closedir(dp);
350 
351 	LOG_DBG("Path not found for %d/%d", major, minor);
352 	LOG_DBG("Creating /dev/mapper/%d-%d", major, minor);
353 	sprintf(path_rtn, "/dev/mapper/%d-%d", major, minor);
354 	old_umask = umask(0);
355 	r = mknod(path_rtn, S_IFBLK | DM_DEVICE_MODE, MKDEV(major, minor));
356 	umask(old_umask);
357 
358 	if (r != -1)
359 		r = chown(path_rtn, DM_DEVICE_UID, DM_DEVICE_GID);
360 
361 	/*
362 	 * If we have to make the path, we unlink it after we open it
363 	 */
364 	*unlink_path = 1;
365 
366 	return r ? -errno : 0;
367 }
368 
_clog_ctr(char * uuid,uint64_t luid,int argc,char ** argv,uint64_t device_size)369 static int _clog_ctr(char *uuid, uint64_t luid,
370 		     int argc, char **argv, uint64_t device_size)
371 {
372 	int i;
373 	int r = 0;
374 	char *p;
375 	uint64_t region_size;
376 	uint64_t region_count;
377 	struct log_c *lc = NULL;
378 	struct log_c *duplicate;
379 	enum sync sync = DEFAULTSYNC;
380 	uint32_t block_on_error = 0;
381 
382 	int disk_log = 0;
383 	char disk_path[128];
384 	int unlink_path = 0;
385 	size_t page_size;
386 	int pages;
387 
388 	/* If core log request, then argv[0] will be region_size */
389 	if (!strtoll(argv[0], &p, 0) || *p) {
390 		disk_log = 1;
391 
392 		if ((argc < 2) || (argc > 4)) {
393 			LOG_ERROR("Too %s arguments to clustered_disk log type",
394 				  (argc < 3) ? "few" : "many");
395 			r = -EINVAL;
396 			goto fail;
397 		}
398 
399 		r = find_disk_path(argv[0], disk_path, &unlink_path);
400 		if (r) {
401 			LOG_ERROR("Unable to find path to device %s", argv[0]);
402 			goto fail;
403 		}
404 		LOG_DBG("Clustered log disk is %s", disk_path);
405 	} else {
406 		disk_log = 0;
407 
408 		if ((argc < 1) || (argc > 3)) {
409 			LOG_ERROR("Too %s arguments to clustered_core log type",
410 				  (argc < 2) ? "few" : "many");
411 			r = -EINVAL;
412 			goto fail;
413 		}
414 	}
415 
416 	if (!(region_size = strtoll(argv[disk_log], &p, 0)) || *p) {
417 		LOG_ERROR("Invalid region_size argument to clustered_%s log type",
418 			  (disk_log) ? "disk" : "core");
419 		r = -EINVAL;
420 		goto fail;
421 	}
422 
423 	region_count = device_size / region_size;
424 	if (device_size % region_size) {
425 		/*
426 		 * I can't remember if device_size must be a multiple
427 		 * of region_size, so check it anyway.
428 		 */
429 		region_count++;
430 	}
431 
432 	for (i = 0; i < argc; i++) {
433 		if (!strcmp(argv[i], "sync"))
434 			sync = FORCESYNC;
435 		else if (!strcmp(argv[i], "nosync"))
436 			sync = NOSYNC;
437 		else if (!strcmp(argv[i], "block_on_error"))
438 			block_on_error = 1;
439 	}
440 
441 	lc = malloc(sizeof(*lc));
442 	if (!lc) {
443 		LOG_ERROR("Unable to allocate cluster log context");
444 		r = -ENOMEM;
445 		goto fail;
446 	}
447 	memset(lc, 0, sizeof(*lc));
448 
449 	lc->region_size = region_size;
450 	lc->region_count = region_count;
451 	lc->sync = sync;
452 	lc->block_on_error = block_on_error;
453 	lc->sync_search = 0;
454 	lc->recovering_region = (uint64_t)-1;
455 	lc->skip_bit_warning = region_count;
456 	lc->disk_fd = -1;
457 	lc->log_dev_failed = 0;
458 	strncpy(lc->uuid, uuid, DM_UUID_LEN);
459 	lc->luid = luid;
460 
461 	if ((duplicate = get_log(lc->uuid, lc->luid)) ||
462 	    (duplicate = get_pending_log(lc->uuid, lc->luid))) {
463 		LOG_ERROR("[%s/%llu] Log already exists, unable to create.",
464 			  SHORT_UUID(lc->uuid), lc->luid);
465 		free(lc);
466 		return -EINVAL;
467 	}
468 
469 	dm_list_init(&lc->mark_list);
470 
471 	lc->clean_bits = dm_bitset_create(NULL, region_count);
472 	if (!lc->clean_bits) {
473 		LOG_ERROR("Unable to allocate clean bitset");
474 		r = -ENOMEM;
475 		goto fail;
476 	}
477 
478 	lc->sync_bits = dm_bitset_create(NULL, region_count);
479 	if (!lc->sync_bits) {
480 		LOG_ERROR("Unable to allocate sync bitset");
481 		r = -ENOMEM;
482 		goto fail;
483 	}
484 	if (sync == NOSYNC)
485 		dm_bit_set_all(lc->sync_bits);
486 
487 	lc->sync_count = (sync == NOSYNC) ? region_count : 0;
488 	if (disk_log) {
489 		page_size = sysconf(_SC_PAGESIZE);
490 		pages = ((int)lc->clean_bits[0])/page_size;
491 		pages += ((int)lc->clean_bits[0])%page_size ? 1 : 0;
492 		pages += 1; /* for header */
493 
494 		r = open(disk_path, O_RDWR | O_DIRECT);
495 		if (r < 0) {
496 			LOG_ERROR("Unable to open log device, %s: %s",
497 				  disk_path, strerror(errno));
498 			r = errno;
499 			goto fail;
500 		}
501 		if (unlink_path)
502 			unlink(disk_path);
503 
504 		lc->disk_fd = r;
505 		lc->disk_size = pages * page_size;
506 
507 		r = posix_memalign(&(lc->disk_buffer), page_size,
508 				   lc->disk_size);
509 		if (r) {
510 			LOG_ERROR("Unable to allocate memory for disk_buffer");
511 			goto fail;
512 		}
513 		memset(lc->disk_buffer, 0, lc->disk_size);
514 		LOG_DBG("Disk log ready");
515 	}
516 
517 	dm_list_add(&log_pending_list, &lc->list);
518 
519 	return 0;
520 fail:
521 	if (lc) {
522 		if (lc->clean_bits)
523 			free(lc->clean_bits);
524 		if (lc->sync_bits)
525 			free(lc->sync_bits);
526 		if (lc->disk_buffer)
527 			free(lc->disk_buffer);
528 		if (lc->disk_fd >= 0)
529 			close(lc->disk_fd);
530 		free(lc);
531 	}
532 	return r;
533 }
534 
535 /*
536  * clog_ctr
537  * @rq
538  *
539  * rq->data should contain constructor string as follows:
540  *	<log_type> [disk] <region_size> [[no]sync] <device_len>
541  * The kernel is responsible for adding the <dev_len> argument
542  * to the end; otherwise, we cannot compute the region_count.
543  *
544  * FIXME: Currently relies on caller to fill in rq->error
545  */
546 static int clog_dtr(struct dm_ulog_request *rq);
clog_ctr(struct dm_ulog_request * rq)547 static int clog_ctr(struct dm_ulog_request *rq)
548 {
549 	int argc, i, r = 0;
550 	char *p, **argv = NULL;
551 	char *dev_size_str;
552 	uint64_t device_size;
553 
554 	/* Sanity checks */
555 	if (!rq->data_size) {
556 		LOG_ERROR("Received constructor request with no data");
557 		return -EINVAL;
558 	}
559 
560 	if (strlen(rq->data) > rq->data_size) {
561 		LOG_ERROR("Received constructor request with bad data");
562 		LOG_ERROR("strlen(rq->data)[%d] != rq->data_size[%llu]",
563 			  (int)strlen(rq->data),
564 			  (unsigned long long)rq->data_size);
565 		LOG_ERROR("rq->data = '%s' [%d]",
566 			  rq->data, (int)strlen(rq->data));
567 		return -EINVAL;
568 	}
569 
570 	/* Split up args */
571 	for (argc = 0, p = rq->data; (p = strstr(p, " ")); p++, argc++)
572 		*p = '\0';
573 
574 	argv = malloc(argc * sizeof(char *));
575 	if (!argv)
576 		return -ENOMEM;
577 
578 	p = dev_size_str = rq->data;
579 	p += strlen(p) + 1;
580 	for (i = 0; i < argc; i++, p = p + strlen(p) + 1)
581 		argv[i] = p;
582 
583 	if (strcmp(argv[0], "clustered_disk") &&
584 	    strcmp(argv[0], "clustered_core")) {
585 		LOG_ERROR("Unsupported userspace log type, \"%s\"", argv[0]);
586 		free(argv);
587 		return -EINVAL;
588 	}
589 
590 	if (!(device_size = strtoll(dev_size_str, &p, 0)) || *p) {
591 		LOG_ERROR("Invalid device size argument: %s", dev_size_str);
592 		free(argv);
593 		return -EINVAL;
594 	}
595 
596 	r = _clog_ctr(rq->uuid, rq->luid, argc - 1, argv + 1, device_size);
597 
598 	/* We join the CPG when we resume */
599 
600 	/* No returning data */
601 	rq->data_size = 0;
602 
603 	if (r) {
604 		LOG_ERROR("Failed to create cluster log (%s)", rq->uuid);
605 		for (i = 0; i < argc; i++)
606 			LOG_ERROR("argv[%d] = %s", i, argv[i]);
607 	}
608 	else
609 		LOG_DBG("[%s] Cluster log created",
610 			SHORT_UUID(rq->uuid));
611 
612 	free(argv);
613 	return r;
614 }
615 
616 /*
617  * clog_dtr
618  * @rq
619  *
620  */
clog_dtr(struct dm_ulog_request * rq)621 static int clog_dtr(struct dm_ulog_request *rq)
622 {
623 	struct log_c *lc = get_log(rq->uuid, rq->luid);
624 
625 	if (lc) {
626 		/*
627 		 * The log should not be on the official list.  There
628 		 * should have been a suspend first.
629 		 */
630 		LOG_ERROR("[%s] DTR before SUS: leaving CPG",
631 			  SHORT_UUID(rq->uuid));
632 		destroy_cluster_cpg(rq->uuid);
633 	} else if (!(lc = get_pending_log(rq->uuid, rq->luid))) {
634 		LOG_ERROR("clog_dtr called on log that is not official or pending");
635 		return -EINVAL;
636 	}
637 
638 	LOG_DBG("[%s] Cluster log removed", SHORT_UUID(lc->uuid));
639 
640 	dm_list_del(&lc->list);
641 	if (lc->disk_fd != -1)
642 		close(lc->disk_fd);
643 	if (lc->disk_buffer)
644 		free(lc->disk_buffer);
645 	free(lc->clean_bits);
646 	free(lc->sync_bits);
647 	free(lc);
648 
649 	return 0;
650 }
651 
652 /*
653  * clog_presuspend
654  * @rq
655  *
656  */
clog_presuspend(struct dm_ulog_request * rq)657 static int clog_presuspend(struct dm_ulog_request *rq)
658 {
659 	struct log_c *lc = get_log(rq->uuid, rq->luid);
660 
661 	if (!lc)
662 		return -EINVAL;
663 
664 	if (lc->touched)
665 		LOG_DBG("WARNING: log still marked as 'touched' during suspend");
666 
667 	lc->recovery_halted = 1;
668 
669 	return 0;
670 }
671 
672 /*
673  * clog_postsuspend
674  * @rq
675  *
676  */
clog_postsuspend(struct dm_ulog_request * rq)677 static int clog_postsuspend(struct dm_ulog_request *rq)
678 {
679 	struct log_c *lc = get_log(rq->uuid, rq->luid);
680 
681 	if (!lc)
682 		return -EINVAL;
683 
684 	LOG_DBG("[%s] clog_postsuspend: leaving CPG", SHORT_UUID(lc->uuid));
685 	destroy_cluster_cpg(rq->uuid);
686 
687 	lc->state = LOG_SUSPENDED;
688 	lc->recovering_region = (uint64_t)-1;
689 	lc->recoverer = (uint32_t)-1;
690 	lc->delay = time(NULL);
691 
692 	return 0;
693 }
694 
695 /*
696  * cluster_postsuspend
697  * @rq
698  *
699  */
cluster_postsuspend(char * uuid,uint64_t luid)700 int cluster_postsuspend(char *uuid, uint64_t luid)
701 {
702 	struct log_c *lc = get_log(uuid, luid);
703 
704 	if (!lc)
705 		return -EINVAL;
706 
707 	LOG_DBG("[%s] clog_postsuspend: finalizing", SHORT_UUID(lc->uuid));
708 	lc->resume_override = 0;
709 
710 	/* move log to pending list */
711 	dm_list_del(&lc->list);
712 	dm_list_add(&log_pending_list, &lc->list);
713 
714 	return 0;
715 }
716 
/*
 * clog_resume
 * @rq
 *
 * Does the main work of resuming.  Bits can arrive two ways: read
 * from the disk log here (resume_override == 0, "master resume"),
 * or pre-loaded before this call (resume_override == 3, "non-master
 * resume").  resume_override is set to 1000 once fully resumed;
 * values 1/2 mean only one of the two bitsets arrived, which is an
 * error.
 */
static int clog_resume(struct dm_ulog_request *rq)
{
	uint32_t i;
	int commit_log = 0;
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	switch (lc->resume_override) {
	case 1000:
		/* Already resumed - a redundant resume was issued */
		LOG_ERROR("[%s] Additional resume issued before suspend",
			  SHORT_UUID(rq->uuid));
#ifdef DEBUG
		kill(getpid(), SIGUSR1);
#endif
		return 0;
	case 0:
		/* No bits were pre-loaded: read them (from disk if any) */
		lc->resume_override = 1000;
		if (lc->disk_fd == -1) {
			LOG_DBG("[%s] Master resume.",
				SHORT_UUID(lc->uuid));
			goto no_disk;
		}

		LOG_DBG("[%s] Master resume: reading disk log",
			SHORT_UUID(lc->uuid));
		commit_log = 1;	/* write the (possibly updated) log back out */
		break;
	case 1:
		LOG_ERROR("Error:: partial bit loading (just sync_bits)");
		return -EINVAL;
	case 2:
		LOG_ERROR("Error:: partial bit loading (just clean_bits)");
		return -EINVAL;
	case 3:
		/* Both bitsets were pre-loaded - nothing to read here */
		LOG_DBG("[%s] Non-master resume: bits pre-loaded",
			SHORT_UUID(lc->uuid));
		lc->resume_override = 1000;
		goto out;
	default:
		LOG_ERROR("Error:: multiple loading of bits (%d)",
			  lc->resume_override);
		return -EINVAL;
	}

	if (lc->log_dev_failed) {
		LOG_ERROR("Log device has failed, unable to read bits");
		rq->error = 0;  /* We can handle this so far */
		lc->disk_nr_regions = 0;
	} else
		rq->error = read_log(lc);

	switch (rq->error) {
	case 0:
		if (lc->disk_nr_regions < lc->region_count)
			LOG_DBG("[%s] Mirror has grown, updating log bits",
				SHORT_UUID(lc->uuid));
		else if (lc->disk_nr_regions > lc->region_count)
			LOG_DBG("[%s] Mirror has shrunk, updating log bits",
				SHORT_UUID(lc->uuid));
		break;
	case -EINVAL:
		/* Bad header magic: treat as a fresh log, full resync */
		LOG_DBG("[%s] (Re)initializing mirror log - resync issued.",
			SHORT_UUID(lc->uuid));
		lc->disk_nr_regions = 0;
		break;
	default:
		LOG_ERROR("Failed to read disk log");
		lc->disk_nr_regions = 0;
		break;
	}

no_disk:
	/* If mirror has grown, set bits appropriately */
	if (lc->sync == NOSYNC)
		for (i = lc->disk_nr_regions; i < lc->region_count; i++)
			log_set_bit(lc, lc->clean_bits, i);
	else
		for (i = lc->disk_nr_regions; i < lc->region_count; i++)
			log_clear_bit(lc, lc->clean_bits, i);

	/* Clear any old bits if device has shrunk */
	for (i = lc->region_count; i % 32; i++)
		log_clear_bit(lc, lc->clean_bits, i);

	/* copy clean across to sync */
	dm_bit_copy(lc->sync_bits, lc->clean_bits);

	if (commit_log && (lc->disk_fd >= 0)) {
		rq->error = write_log(lc);
		if (rq->error)
			LOG_ERROR("Failed initial disk log write");
		else
			LOG_DBG("Disk log initialized");
		lc->touched = 0;
	}
out:
	/*
	 * Clear any old bits if device has shrunk - necessary
	 * for non-master resume
	 */
	for (i = lc->region_count; i % 32; i++) {
		log_clear_bit(lc, lc->clean_bits, i);
		log_clear_bit(lc, lc->sync_bits, i);
	}

	lc->sync_count = count_bits32(lc->sync_bits);

	LOG_SPRINT(lc, "[%s] Initial sync_count = %llu",
		   SHORT_UUID(lc->uuid), (unsigned long long)lc->sync_count);
	lc->sync_search = 0;
	lc->state = LOG_RESUMED;
	lc->recovery_halted = 0;

	return rq->error;
}
840 
841 /*
842  * local_resume
843  * @rq
844  *
845  * If the log is pending, we must first join the cpg and
846  * put the log in the official list.
847  *
848  */
local_resume(struct dm_ulog_request * rq)849 int local_resume(struct dm_ulog_request *rq)
850 {
851 	int r;
852 	time_t t;
853 	struct log_c *lc = get_log(rq->uuid, rq->luid);
854 
855 	if (!lc) {
856 		/* Is the log in the pending list? */
857 		lc = get_pending_log(rq->uuid, rq->luid);
858 		if (!lc) {
859 			LOG_ERROR("clog_resume called on log that is not official or pending");
860 			return -EINVAL;
861 		}
862 
863 		t = time(NULL);
864 		t -= lc->delay;
865 		/*
866 		 * This should be considered a temporary fix.  It addresses
867 		 * a problem that exists when nodes suspend/resume in rapid
868 		 * succession.  While the problem is very rare, it has been
869 		 * seen to happen in real-world-like testing.
870 		 *
871 		 * The problem:
872 		 * - Node A joins cluster
873 		 * - Node B joins cluster
874 		 * - Node A prepares checkpoint
875 		 * - Node A gets ready to write checkpoint
876 		 * - Node B leaves
877 		 * - Node B joins
878 		 * - Node A finishes write of checkpoint
879 		 * - Node B receives checkpoint meant for previous session
880 		 * -- Node B can now be non-coherent
881 		 *
882 		 * This timer will solve the problem for now, but could be
883 		 * replaced by a generation number sent with the resume
884 		 * command from the kernel.  The generation number would
885 		 * be included in the name of the checkpoint to prevent
886 		 * reading stale data.
887 		 */
888 		if ((t < 3) && (t >= 0))
889 			sleep(3 - t);
890 
891 		/* Join the CPG */
892 		r = create_cluster_cpg(rq->uuid, rq->luid);
893 		if (r) {
894 			LOG_ERROR("clog_resume:  Failed to create cluster CPG");
895 			return r;
896 		}
897 
898 		/* move log to official list */
899 		dm_list_del(&lc->list);
900 		dm_list_add(&log_list, &lc->list);
901 	}
902 
903 	return 0;
904 }
905 
906 /*
907  * clog_get_region_size
908  * @rq
909  *
910  * Since this value doesn't change, the kernel
911  * should not need to talk to server to get this
912  * The function is here for completness
913  *
914  * Returns: 0 on success, -EXXX on failure
915  */
clog_get_region_size(struct dm_ulog_request * rq)916 static int clog_get_region_size(struct dm_ulog_request *rq)
917 {
918 	uint64_t *rtn = (uint64_t *)rq->data;
919 	struct log_c *lc = get_log(rq->uuid, rq->luid);
920 
921 	if (!lc && !(lc = get_pending_log(rq->uuid, rq->luid)))
922 		return -EINVAL;
923 
924 	*rtn = lc->region_size;
925 	rq->data_size = sizeof(*rtn);
926 
927 	return 0;
928 }
929 
930 /*
931  * clog_is_clean
932  * @rq
933  *
934  * Returns: 1 if clean, 0 otherwise
935  */
clog_is_clean(struct dm_ulog_request * rq)936 static int clog_is_clean(struct dm_ulog_request *rq)
937 {
938 	int64_t *rtn = (int64_t *)rq->data;
939 	uint64_t region = *((uint64_t *)(rq->data));
940 	struct log_c *lc = get_log(rq->uuid, rq->luid);
941 
942 	if (!lc)
943 		return -EINVAL;
944 
945 	*rtn = log_test_bit(lc->clean_bits, region);
946 	rq->data_size = sizeof(*rtn);
947 
948 	return 0;
949 }
950 
951 /*
952  * clog_in_sync
953  * @rq
954  *
955  * We ignore any request for non-block.  That
956  * should be handled elsewhere.  (If the request
957  * has come this far, it has already blocked.)
958  *
959  * Returns: 1 if in-sync, 0 otherwise
960  */
clog_in_sync(struct dm_ulog_request * rq)961 static int clog_in_sync(struct dm_ulog_request *rq)
962 {
963 	int64_t *rtn = (int64_t *)rq->data;
964 	uint64_t region = *((uint64_t *)(rq->data));
965 	struct log_c *lc = get_log(rq->uuid, rq->luid);
966 
967 	if (!lc)
968 		return -EINVAL;
969 
970 	if (region > lc->region_count)
971 		return -EINVAL;
972 
973 	*rtn = log_test_bit(lc->sync_bits, region);
974 	if (*rtn)
975 		LOG_DBG("[%s] Region is in-sync: %llu",
976 			SHORT_UUID(lc->uuid), (unsigned long long)region);
977 	else
978 		LOG_DBG("[%s] Region is not in-sync: %llu",
979 			SHORT_UUID(lc->uuid), (unsigned long long)region);
980 
981 	rq->data_size = sizeof(*rtn);
982 
983 	return 0;
984 }
985 
986 /*
987  * clog_flush
988  * @rq
989  *
990  */
clog_flush(struct dm_ulog_request * rq,int server)991 static int clog_flush(struct dm_ulog_request *rq, int server)
992 {
993 	int r = 0;
994 	struct log_c *lc = get_log(rq->uuid, rq->luid);
995 
996 	if (!lc)
997 		return -EINVAL;
998 
999 	if (!lc->touched)
1000 		return 0;
1001 
1002 	/*
1003 	 * Do the actual flushing of the log only
1004 	 * if we are the server.
1005 	 */
1006 	if (server && (lc->disk_fd >= 0)) {
1007 		r = rq->error = write_log(lc);
1008 		if (r)
1009 			LOG_ERROR("[%s] Error writing to disk log",
1010 				  SHORT_UUID(lc->uuid));
1011 		else
1012 			LOG_DBG("[%s] Disk log written", SHORT_UUID(lc->uuid));
1013 	}
1014 
1015 	lc->touched = 0;
1016 
1017 	return r;
1018 
1019 }
1020 
1021 /*
1022  * mark_region
1023  * @lc
1024  * @region
1025  * @who
1026  *
1027  * Put a mark region request in the tree for tracking.
1028  *
1029  * Returns: 0 on success, -EXXX on error
1030  */
mark_region(struct log_c * lc,uint64_t region,uint32_t who)1031 static int mark_region(struct log_c *lc, uint64_t region, uint32_t who)
1032 {
1033 	int found = 0;
1034 	struct mark_entry *m;
1035 
1036 	dm_list_iterate_items(m, &lc->mark_list)
1037 		if (m->region == region) {
1038 			found = 1;
1039 			if (m->nodeid == who)
1040 				return 0;
1041 		}
1042 
1043 	if (!found)
1044 		log_clear_bit(lc, lc->clean_bits, region);
1045 
1046 	/*
1047 	 * Save allocation until here - if there is a failure,
1048 	 * at least we have cleared the bit.
1049 	 */
1050 	m = malloc(sizeof(*m));
1051 	if (!m) {
1052 		LOG_ERROR("Unable to allocate space for mark_entry: %llu/%u",
1053 			  (unsigned long long)region, who);
1054 		return -ENOMEM;
1055 	}
1056 
1057 	m->nodeid = who;
1058 	m->region = region;
1059 	dm_list_add(&lc->mark_list, &m->list);
1060 
1061 	return 0;
1062 }
1063 
1064 /*
1065  * clog_mark_region
1066  * @rq
1067  *
1068  * rq may contain more than one mark request.  We
1069  * can determine the number from the 'data_size' field.
1070  *
1071  * Returns: 0 on success, -EXXX on failure
1072  */
clog_mark_region(struct dm_ulog_request * rq,uint32_t originator)1073 static int clog_mark_region(struct dm_ulog_request *rq, uint32_t originator)
1074 {
1075 	int r;
1076 	int count;
1077 	uint64_t *region;
1078 	struct log_c *lc = get_log(rq->uuid, rq->luid);
1079 
1080 	if (!lc)
1081 		return -EINVAL;
1082 
1083 	if (rq->data_size % sizeof(uint64_t)) {
1084 		LOG_ERROR("Bad data size given for mark_region request");
1085 		return -EINVAL;
1086 	}
1087 
1088 	count = rq->data_size / sizeof(uint64_t);
1089 	region = (uint64_t *)&rq->data;
1090 
1091 	for (; count > 0; count--, region++) {
1092 		r = mark_region(lc, *region, originator);
1093 		if (r)
1094 			return r;
1095 	}
1096 
1097 	rq->data_size = 0;
1098 
1099 	return 0;
1100 }
1101 
clear_region(struct log_c * lc,uint64_t region,uint32_t who)1102 static int clear_region(struct log_c *lc, uint64_t region, uint32_t who)
1103 {
1104 	int other_matches = 0;
1105 	struct mark_entry *m, *n;
1106 
1107 	dm_list_iterate_items_safe(m, n, &lc->mark_list)
1108 		if (m->region == region) {
1109 			if (m->nodeid == who) {
1110 				dm_list_del(&m->list);
1111 				free(m);
1112 			} else
1113 				other_matches = 1;
1114 		}
1115 
1116 	/*
1117 	 * Clear region if:
1118 	 *  1) It is in-sync
1119 	 *  2) There are no other machines that have it marked
1120 	 */
1121 	if (!other_matches && log_test_bit(lc->sync_bits, region))
1122 		log_set_bit(lc, lc->clean_bits, region);
1123 
1124 	return 0;
1125 }
1126 
1127 /*
1128  * clog_clear_region
1129  * @rq
1130  *
1131  * rq may contain more than one clear request.  We
1132  * can determine the number from the 'data_size' field.
1133  *
1134  * Returns: 0 on success, -EXXX on failure
1135  */
clog_clear_region(struct dm_ulog_request * rq,uint32_t originator)1136 static int clog_clear_region(struct dm_ulog_request *rq, uint32_t originator)
1137 {
1138 	int r;
1139 	int count;
1140 	uint64_t *region;
1141 	struct log_c *lc = get_log(rq->uuid, rq->luid);
1142 
1143 	if (!lc)
1144 		return -EINVAL;
1145 
1146 	if (rq->data_size % sizeof(uint64_t)) {
1147 		LOG_ERROR("Bad data size given for clear_region request");
1148 		return -EINVAL;
1149 	}
1150 
1151 	count = rq->data_size / sizeof(uint64_t);
1152 	region = (uint64_t *)&rq->data;
1153 
1154 	for (; count > 0; count--, region++) {
1155 		r = clear_region(lc, *region, originator);
1156 		if (r)
1157 			return r;
1158 	}
1159 
1160 	rq->data_size = 0;
1161 
1162 	return 0;
1163 }
1164 
1165 /*
1166  * clog_get_resync_work
1167  * @rq
1168  *
1169  */
static int clog_get_resync_work(struct dm_ulog_request *rq, uint32_t originator)
{
	/*
	 * Reply package written back into rq->data:
	 *  i = 1 if work was assigned, 0 otherwise
	 *  r = the region to recover (valid only when i == 1)
	 */
	struct {
		int64_t i;
		uint64_t r;
	} *pkg = (void *)rq->data;
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	rq->data_size = sizeof(*pkg);
	pkg->i = 0;

	/* Search exhausted: every region has been visited, recovery done */
	if (lc->sync_search >= lc->region_count) {
		/*
		 * FIXME: handle intermittent errors during recovery
		 * by resetting sync_search... but not to many times.
		 */
		LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
			   "Recovery finished",
			   rq->seq, SHORT_UUID(lc->uuid), originator);
		return 0;
	}

	/*
	 * A region is already out for recovery ((uint64_t)-1 means "none").
	 * Re-hand the same region to the same node; refuse everyone else.
	 */
	if (lc->recovering_region != (uint64_t)-1) {
		if (lc->recoverer == originator) {
			LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
				   "Re-requesting work (%llu)",
				   rq->seq, SHORT_UUID(lc->uuid), originator,
				   (unsigned long long)lc->recovering_region);
			pkg->r = lc->recovering_region;
			pkg->i = 1;
			LOG_COND(log_resend_requests, "***** RE-REQUEST *****");
		} else {
			LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
				   "Someone already recovering (%llu)",
				   rq->seq, SHORT_UUID(lc->uuid), originator,
				   (unsigned long long)lc->recovering_region);
		}

		return 0;
	}

	/*
	 * Drain the priority list (regions flagged by
	 * clog_is_remote_recovering) before scanning the bitmap.
	 * Entries whose region turned in-sync meanwhile are discarded.
	 */
	while (lc->recovery_request_list) {
		struct recovery_request *del;

		del = lc->recovery_request_list;
		lc->recovery_request_list = del->next;

		pkg->r = del->region;
		free(del);

		if (!log_test_bit(lc->sync_bits, pkg->r)) {
			LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
				   "Assigning priority resync work (%llu)",
				   rq->seq, SHORT_UUID(lc->uuid), originator,
				   (unsigned long long)pkg->r);
			pkg->i = 1;
			lc->recovering_region = pkg->r;
			lc->recoverer = originator;
			return 0;
		}
	}

	/* Normal path: next not-in-sync region at or after sync_search */
	pkg->r = find_next_zero_bit(lc->sync_bits,
				    lc->sync_search);

	if (pkg->r >= lc->region_count) {
		LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
			   "Resync work complete.",
			   rq->seq, SHORT_UUID(lc->uuid), originator);
		return 0;
	}

	/* sync_search is kept 1 past the region currently being recovered */
	lc->sync_search = pkg->r + 1;

	LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
		   "Assigning resync work (%llu)",
		   rq->seq, SHORT_UUID(lc->uuid), originator,
		   (unsigned long long)pkg->r);
	pkg->i = 1;
	lc->recovering_region = pkg->r;
	lc->recoverer = originator;

	return 0;
}
1257 
1258 /*
1259  * clog_set_region_sync
1260  * @rq
1261  */
static int clog_set_region_sync(struct dm_ulog_request *rq, uint32_t originator)
{
	/* Request package carried in rq->data: region and its new state */
	struct {
		uint64_t region;
		int64_t in_sync;
	} *pkg = (void *)rq->data;
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	/*
	 * Recovery of the outstanding region is considered finished.
	 * NOTE(review): cleared unconditionally, even if pkg->region is
	 * not the region currently recorded as recovering — confirm this
	 * is intended.
	 */
	lc->recovering_region = (uint64_t)-1;

	if (pkg->in_sync) {
		if (log_test_bit(lc->sync_bits, pkg->region)) {
			/* Duplicate notification: bit already set */
			LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
				   "Region already set (%llu)",
				   rq->seq, SHORT_UUID(lc->uuid), originator,
				   (unsigned long long)pkg->region);
		} else {
			log_set_bit(lc, lc->sync_bits, pkg->region);
			lc->sync_count++;

			/* The rest of this section is all for debugging */
			LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
				   "Setting region (%llu)",
				   rq->seq, SHORT_UUID(lc->uuid), originator,
				   (unsigned long long)pkg->region);
			/* A previously-skipped region finally arrived: clear the warning */
			if (pkg->region == lc->skip_bit_warning)
				lc->skip_bit_warning = lc->region_count;

			/* Warn once if recovery ran 5+ regions past a gap */
			if (pkg->region > (lc->skip_bit_warning + 5)) {
				LOG_ERROR("*** Region #%llu skipped during recovery ***",
					  (unsigned long long)lc->skip_bit_warning);
				lc->skip_bit_warning = lc->region_count;
#ifdef DEBUG
				kill(getpid(), SIGUSR1);
#endif
			}

			/* Remember the first unset predecessor as a potential skip */
			if (!log_test_bit(lc->sync_bits,
					  (pkg->region) ? pkg->region - 1 : 0)) {
				LOG_SPRINT(lc, "*** Previous bit not set ***");
				lc->skip_bit_warning = (pkg->region) ?
					pkg->region - 1 : 0;
			}
		}
	} else if (log_test_bit(lc->sync_bits, pkg->region)) {
		/* Taking a region out of sync; only if it was actually set */
		lc->sync_count--;
		log_clear_bit(lc, lc->sync_bits, pkg->region);
		LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
			   "Unsetting region (%llu)",
			   rq->seq, SHORT_UUID(lc->uuid), originator,
			   (unsigned long long)pkg->region);
	}

	/*
	 * Self-check: the cached counter must agree with the bitmap.
	 * If not, log it and resync the counter from the bitmap.
	 */
	if (lc->sync_count != count_bits32(lc->sync_bits)) {
		unsigned long long reset = count_bits32(lc->sync_bits);

		LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
			   "sync_count(%llu) != bitmap count(%llu)",
			   rq->seq, SHORT_UUID(lc->uuid), originator,
			   (unsigned long long)lc->sync_count, reset);
#ifdef DEBUG
		kill(getpid(), SIGUSR1);
#endif
		lc->sync_count = reset;
	}

	if (lc->sync_count > lc->region_count)
		LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
			   "(lc->sync_count > lc->region_count) - this is bad",
			   rq->seq, SHORT_UUID(lc->uuid), originator);

	/* No payload goes back to the kernel */
	rq->data_size = 0;
	return 0;
}
1339 
1340 /*
1341  * clog_get_sync_count
1342  * @rq
1343  */
static int clog_get_sync_count(struct dm_ulog_request *rq, uint32_t originator)
{
	/* Reply is a single uint64_t written in place over the request data */
	uint64_t *sync_count = (uintint64_t *)rq->data;
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	/*
	 * FIXME: Mirror requires us to be able to ask for
	 * the sync count while pending... but I don't like
	 * it because other machines may not be suspended and
	 * the stored value may not be accurate.
	 */
	if (!lc)
		lc = get_pending_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	*sync_count = lc->sync_count;

	rq->data_size = sizeof(*sync_count);

	/* Self-check: repair the cached counter if it drifted from the bitmap */
	if (lc->sync_count != count_bits32(lc->sync_bits)) {
		unsigned long long reset = count_bits32(lc->sync_bits);

		LOG_SPRINT(lc, "get_sync_count - SEQ#=%u, UUID=%s, nodeid = %u:: "
			   "sync_count(%llu) != bitmap count(%llu)",
			   rq->seq, SHORT_UUID(lc->uuid), originator,
			   (unsigned long long)lc->sync_count, reset);
#ifdef DEBUG
		kill(getpid(), SIGUSR1);
#endif
		/* NOTE: reply above still carries the pre-repair value */
		lc->sync_count = reset;
	}

	return 0;
}
1380 
core_status_info(struct log_c * lc,struct dm_ulog_request * rq)1381 static int core_status_info(struct log_c *lc, struct dm_ulog_request *rq)
1382 {
1383 	char *data = (char *)rq->data;
1384 
1385 	rq->data_size = sprintf(data, "1 clustered_core");
1386 
1387 	return 0;
1388 }
1389 
disk_status_info(struct log_c * lc,struct dm_ulog_request * rq)1390 static int disk_status_info(struct log_c *lc, struct dm_ulog_request *rq)
1391 {
1392 	char *data = (char *)rq->data;
1393 	struct stat statbuf;
1394 
1395 	if(fstat(lc->disk_fd, &statbuf)) {
1396 		rq->error = -errno;
1397 		return -errno;
1398 	}
1399 
1400 	rq->data_size = sprintf(data, "3 clustered_disk %d:%d %c",
1401 				major(statbuf.st_rdev), minor(statbuf.st_rdev),
1402 				(lc->log_dev_failed) ? 'D' : 'A');
1403 
1404 	return 0;
1405 }
1406 
1407 /*
1408  * clog_status_info
1409  * @rq
1410  *
1411  */
clog_status_info(struct dm_ulog_request * rq)1412 static int clog_status_info(struct dm_ulog_request *rq)
1413 {
1414 	int r;
1415 	struct log_c *lc = get_log(rq->uuid, rq->luid);
1416 
1417 	if (!lc)
1418 		lc = get_pending_log(rq->uuid, rq->luid);
1419 
1420 	if (!lc)
1421 		return -EINVAL;
1422 
1423 	if (lc->disk_fd == -1)
1424 		r = core_status_info(lc, rq);
1425 	else
1426 		r = disk_status_info(lc, rq);
1427 
1428 	return r;
1429 }
1430 
core_status_table(struct log_c * lc,struct dm_ulog_request * rq)1431 static int core_status_table(struct log_c *lc, struct dm_ulog_request *rq)
1432 {
1433 	char *data = (char *)rq->data;
1434 
1435 	rq->data_size = sprintf(data, "clustered_core %u %s%s ",
1436 				lc->region_size,
1437 				(lc->sync == DEFAULTSYNC) ? "" :
1438 				(lc->sync == NOSYNC) ? "nosync " : "sync ",
1439 				(lc->block_on_error) ? "block_on_error" : "");
1440 	return 0;
1441 }
1442 
disk_status_table(struct log_c * lc,struct dm_ulog_request * rq)1443 static int disk_status_table(struct log_c *lc, struct dm_ulog_request *rq)
1444 {
1445 	char *data = (char *)rq->data;
1446 	struct stat statbuf;
1447 
1448 	if(fstat(lc->disk_fd, &statbuf)) {
1449 		rq->error = -errno;
1450 		return -errno;
1451 	}
1452 
1453 	rq->data_size = sprintf(data, "clustered_disk %d:%d %u %s%s ",
1454 				major(statbuf.st_rdev), minor(statbuf.st_rdev),
1455 				lc->region_size,
1456 				(lc->sync == DEFAULTSYNC) ? "" :
1457 				(lc->sync == NOSYNC) ? "nosync " : "sync ",
1458 				(lc->block_on_error) ? "block_on_error" : "");
1459 	return 0;
1460 }
1461 
1462 /*
1463  * clog_status_table
1464  * @rq
1465  *
1466  */
clog_status_table(struct dm_ulog_request * rq)1467 static int clog_status_table(struct dm_ulog_request *rq)
1468 {
1469 	int r;
1470 	struct log_c *lc = get_log(rq->uuid, rq->luid);
1471 
1472 	if (!lc)
1473 		lc = get_pending_log(rq->uuid, rq->luid);
1474 
1475 	if (!lc)
1476 		return -EINVAL;
1477 
1478 	if (lc->disk_fd == -1)
1479 		r = core_status_table(lc, rq);
1480 	else
1481 		r = disk_status_table(lc, rq);
1482 
1483 	return r;
1484 }
1485 
1486 /*
1487  * clog_is_remote_recovering
1488  * @rq
1489  *
1490  */
clog_is_remote_recovering(struct dm_ulog_request * rq)1491 static int clog_is_remote_recovering(struct dm_ulog_request *rq)
1492 {
1493 	uint64_t region = *((uint64_t *)(rq->data));
1494 	struct {
1495 		int64_t is_recovering;
1496 		uint64_t in_sync_hint;
1497 	} *pkg = (void *)rq->data;
1498 	struct log_c *lc = get_log(rq->uuid, rq->luid);
1499 
1500 	if (!lc)
1501 		return -EINVAL;
1502 
1503 	if (region > lc->region_count)
1504 		return -EINVAL;
1505 
1506 	if (lc->recovery_halted) {
1507 		LOG_DBG("[%s] Recovery halted... [not remote recovering]: %llu",
1508 			SHORT_UUID(lc->uuid), (unsigned long long)region);
1509 		pkg->is_recovering = 0;
1510 		pkg->in_sync_hint = lc->region_count; /* none are recovering */
1511 	} else {
1512 		pkg->is_recovering = !log_test_bit(lc->sync_bits, region);
1513 
1514 		/*
1515 		 * Remember, 'lc->sync_search' is 1 plus the region
1516 		 * currently being recovered.  So, we must take off 1
1517 		 * to account for that; but only if 'sync_search > 1'.
1518 		 */
1519 		pkg->in_sync_hint = lc->sync_search ? (lc->sync_search - 1) : 0;
1520 		LOG_DBG("[%s] Region is %s: %llu",
1521 			SHORT_UUID(lc->uuid),
1522 			(region == lc->recovering_region) ?
1523 			"currently remote recovering" :
1524 			(pkg->is_recovering) ? "pending remote recovery" :
1525 			"not remote recovering", (unsigned long long)region);
1526 	}
1527 
1528 	if (pkg->is_recovering &&
1529 	    (region != lc->recovering_region)) {
1530 		struct recovery_request *rr;
1531 
1532 		/* Already in the list? */
1533 		for (rr = lc->recovery_request_list; rr; rr = rr->next)
1534 			if (rr->region == region)
1535 				goto out;
1536 
1537 		/* Failure to allocated simply means we can't prioritize it */
1538 		rr = malloc(sizeof(*rr));
1539 		if (!rr)
1540 			goto out;
1541 
1542 		LOG_DBG("[%s] Adding region to priority list: %llu",
1543 			SHORT_UUID(lc->uuid), (unsigned long long)region);
1544 		rr->region = region;
1545 		rr->next = lc->recovery_request_list;
1546 		lc->recovery_request_list = rr;
1547 	}
1548 
1549 out:
1550 
1551 	rq->data_size = sizeof(*pkg);
1552 
1553 	return 0;
1554 }
1555 
1556 
1557 /*
1558  * do_request
1559  * @rq: the request
1560  * @server: is this request performed by the server
1561  *
1562  * An inability to perform this function will return an error
1563  * from this function.  However, an inability to successfully
1564  * perform the request will fill in the 'rq->error' field.
1565  *
1566  * Returns: 0 on success, -EXXX on error
1567  */
int do_request(struct clog_request *rq, int server)
{
	int r;

	/* A NULL request is a no-op, not an error */
	if (!rq)
		return 0;

	/* Incoming requests are expected to arrive with error == 0 */
	if (rq->u_rq.error)
		LOG_DBG("Programmer error: rq struct has error set");

	/* Dispatch to the handler for this userspace-log request type */
	switch (rq->u_rq.request_type) {
	case DM_ULOG_CTR:
		r = clog_ctr(&rq->u_rq);
		break;
	case DM_ULOG_DTR:
		r = clog_dtr(&rq->u_rq);
		break;
	case DM_ULOG_PRESUSPEND:
		r = clog_presuspend(&rq->u_rq);
		break;
	case DM_ULOG_POSTSUSPEND:
		r = clog_postsuspend(&rq->u_rq);
		break;
	case DM_ULOG_RESUME:
		r = clog_resume(&rq->u_rq);
		break;
	case DM_ULOG_GET_REGION_SIZE:
		r = clog_get_region_size(&rq->u_rq);
		break;
	case DM_ULOG_IS_CLEAN:
		r = clog_is_clean(&rq->u_rq);
		break;
	case DM_ULOG_IN_SYNC:
		r = clog_in_sync(&rq->u_rq);
		break;
	case DM_ULOG_FLUSH:
		/* Flush is the only handler that cares who is the server */
		r = clog_flush(&rq->u_rq, server);
		break;
	case DM_ULOG_MARK_REGION:
		r = clog_mark_region(&rq->u_rq, rq->originator);
		break;
	case DM_ULOG_CLEAR_REGION:
		r = clog_clear_region(&rq->u_rq, rq->originator);
		break;
	case DM_ULOG_GET_RESYNC_WORK:
		r = clog_get_resync_work(&rq->u_rq, rq->originator);
		break;
	case DM_ULOG_SET_REGION_SYNC:
		r = clog_set_region_sync(&rq->u_rq, rq->originator);
		break;
	case DM_ULOG_GET_SYNC_COUNT:
		r = clog_get_sync_count(&rq->u_rq, rq->originator);
		break;
	case DM_ULOG_STATUS_INFO:
		r = clog_status_info(&rq->u_rq);
		break;
	case DM_ULOG_STATUS_TABLE:
		r = clog_status_table(&rq->u_rq);
		break;
	case DM_ULOG_IS_REMOTE_RECOVERING:
		r = clog_is_remote_recovering(&rq->u_rq);
		break;
	default:
		LOG_ERROR("Unknown request");
		r = rq->u_rq.error = -EINVAL;
		break;
	}

	/* Propagate the handler's return code into rq->error if unset */
	if (r && !rq->u_rq.error)
		rq->u_rq.error = r;
	else if (r != rq->u_rq.error)
		LOG_DBG("Warning:  error from function != rq->u_rq.error");

	/* An errored request must not carry reply data back */
	if (rq->u_rq.error && rq->u_rq.data_size) {
		/* Make sure I'm handling errors correctly above */
		LOG_DBG("Programmer error: rq->u_rq.error && rq->u_rq.data_size");
		rq->u_rq.data_size = 0;
	}

	/* Always 0: failures are reported via rq->u_rq.error */
	return 0;
}
1649 
/*
 * print_bits
 * @buf: raw bytes to dump
 * @size: number of bytes in @buf
 * @print: non-zero to emit via LOG_PRINT, zero for LOG_DBG
 *
 * Hex-dump @buf, 16 bytes per output line, each line prefixed with
 * its "[start - end]" byte range.
 */
static void print_bits(char *buf, int size, int print)
{
	int row, col, len;
	char line[128];

	for (row = 0; row < size; row += 16) {
		len = sprintf(line, "[%3d - %3d]", row, row + 15);

		for (col = row; (col < size) && (col < row + 16); col++)
			len += sprintf(line + len, " %.2X",
				       (unsigned char)buf[col]);

		if (print)
			LOG_PRINT("%s", line);
		else
			LOG_DBG("%s", line);
	}
}
1677 
1678 /* int store_bits(const char *uuid, const char *which, char **buf)*/
int push_state(const char *uuid, uint64_t luid,
	       const char *which, char **buf, uint32_t debug_who)
{
	int bitset_size;
	struct log_c *lc;

	/* Caller must pass in *buf == NULL; this function allocates it */
	if (*buf)
		LOG_ERROR("store_bits: *buf != NULL");

	lc = get_log(uuid, luid);
	if (!lc) {
		LOG_ERROR("store_bits: No log found for %s", uuid);
		return -EINVAL;
	}

	/* recovering_region state is serialized as text, not as a bitmap */
	if (!strcmp(which, "recovering_region")) {
		*buf = malloc(64); /* easily handles the 2 written numbers */
		if (!*buf)
			return -ENOMEM;
		sprintf(*buf, "%llu %u", (unsigned long long)lc->recovering_region,
			lc->recoverer);

		LOG_SPRINT(lc, "CKPT SEND - SEQ#=X, UUID=%s, nodeid = %u:: "
			   "recovering_region=%llu, recoverer=%u, sync_count=%llu",
			   SHORT_UUID(lc->uuid), debug_who,
			   (unsigned long long)lc->recovering_region,
			   lc->recoverer,
			   (unsigned long long)count_bits32(lc->sync_bits));
		return 64;
	}

	/*
	 * Size in 'int's
	 * (presumably element 0 of a dm bitset holds its bit count, so the
	 * payload starts at element 1 — TODO confirm against libdevmapper)
	 */
	bitset_size = ((int)lc->clean_bits[0]/DM_BITS_PER_INT) + 1;

	/* Size in bytes */
	bitset_size *= 4;

	*buf = malloc(bitset_size);

	if (!*buf) {
		LOG_ERROR("store_bits: Unable to allocate memory");
		return -ENOMEM;
	}

	/* NOTE(review): length 9 compares only "clean_bit" of "clean_bits" */
	if (!strncmp(which, "sync_bits", 9)) {
		memcpy(*buf, lc->sync_bits + 1, bitset_size);
		LOG_DBG("[%s] storing sync_bits (sync_count = %llu):",
			SHORT_UUID(uuid), (unsigned long long)
			count_bits32(lc->sync_bits));
		print_bits(*buf, bitset_size, 0);
	} else if (!strncmp(which, "clean_bits", 9)) {
		memcpy(*buf, lc->clean_bits + 1, bitset_size);
		LOG_DBG("[%s] storing clean_bits:", SHORT_UUID(lc->uuid));
		print_bits(*buf, bitset_size, 0);
	}

	/* Caller owns *buf; returns the number of bytes stored */
	return bitset_size;
}
1737 
1738 /*int load_bits(const char *uuid, const char *which, char *buf, int size)*/
int pull_state(const char *uuid, uint64_t luid,
	       const char *which, char *buf, int size)
{
	int bitset_size;
	struct log_c *lc;

	/* Log the error but fall through; sscanf/memcpy on NULL would crash */
	if (!buf)
		LOG_ERROR("pull_state: buf == NULL");

	lc = get_log(uuid, luid);
	if (!lc) {
		LOG_ERROR("pull_state: No log found for %s", uuid);
		return -EINVAL;
	}

	/* recovering_region state arrives as text, not as a bitmap */
	if (!strncmp(which, "recovering_region", 17)) {
		/* NOTE(review): cast assumes uint64_t == unsigned long long */
		sscanf(buf, "%llu %u", (unsigned long long *)&lc->recovering_region,
		       &lc->recoverer);
		LOG_SPRINT(lc, "CKPT INIT - SEQ#=X, UUID=%s, nodeid = X:: "
			   "recovering_region=%llu, recoverer=%u",
			   SHORT_UUID(lc->uuid),
			   (unsigned long long)lc->recovering_region, lc->recoverer);
		return 0;
	}

	/*
	 * Size in 'int's
	 * (presumably element 0 of a dm bitset holds its bit count, so the
	 * payload starts at element 1 — TODO confirm against libdevmapper)
	 */
	bitset_size = ((int)lc->clean_bits[0]/DM_BITS_PER_INT) + 1;

	/* Size in bytes */
	bitset_size *= 4;

	/* Incoming checkpoint must match our bitset size exactly */
	if (bitset_size != size) {
		LOG_ERROR("pull_state(%s): bad bitset_size (%d vs %d)",
			  which, size, bitset_size);
		return -EINVAL;
	}

	/* resume_override encodes which pieces of state have arrived */
	if (!strncmp(which, "sync_bits", 9)) {
		lc->resume_override += 1;
		memcpy(lc->sync_bits + 1, buf, bitset_size);
		LOG_DBG("[%s] loading sync_bits (sync_count = %llu):",
			SHORT_UUID(lc->uuid),(unsigned long long)
			count_bits32(lc->sync_bits));
		print_bits((char *)lc->sync_bits, bitset_size, 0);
	} else if (!strncmp(which, "clean_bits", 9)) {
		lc->resume_override += 2;
		memcpy(lc->clean_bits + 1, buf, bitset_size);
		LOG_DBG("[%s] loading clean_bits:", SHORT_UUID(lc->uuid));
		print_bits((char *)lc->clean_bits, bitset_size, 0);
	}

	return 0;
}
1792 
log_get_state(struct dm_ulog_request * rq)1793 int log_get_state(struct dm_ulog_request *rq)
1794 {
1795 	struct log_c *lc;
1796 
1797 	lc = get_log(rq->uuid, rq->luid);
1798 	if (!lc)
1799 		return -EINVAL;
1800 
1801 	return lc->state;
1802 }
1803 
1804 /*
1805  * log_status
1806  *
1807  * Returns: 1 if logs are still present, 0 otherwise
1808  */
log_status(void)1809 int log_status(void)
1810 {
1811 	if (!dm_list_empty(&log_list) || !dm_list_empty(&log_pending_list))
1812 		return 1;
1813 
1814 	return 0;
1815 }
1816 
log_debug(void)1817 void log_debug(void)
1818 {
1819 	struct log_c *lc;
1820 	uint64_t r;
1821 	int i;
1822 
1823 	LOG_ERROR("");
1824 	LOG_ERROR("LOG COMPONENT DEBUGGING::");
1825 	LOG_ERROR("Official log list:");
1826 	LOG_ERROR("Pending log list:");
1827 	dm_list_iterate_items(lc, &log_pending_list) {
1828 		LOG_ERROR("%s", lc->uuid);
1829 		LOG_ERROR("sync_bits:");
1830 		print_bits((char *)lc->sync_bits, (int)lc->sync_bits[0], 1);
1831 		LOG_ERROR("clean_bits:");
1832 		print_bits((char *)lc->clean_bits, (int)lc->sync_bits[0], 1);
1833 	}
1834 
1835 	dm_list_iterate_items(lc, &log_list) {
1836 		LOG_ERROR("%s", lc->uuid);
1837 		LOG_ERROR("  recoverer        : %u", lc->recoverer);
1838 		LOG_ERROR("  recovering_region: %llu",
1839 			  (unsigned long long)lc->recovering_region);
1840 		LOG_ERROR("  recovery_halted  : %s", (lc->recovery_halted) ?
1841 			  "YES" : "NO");
1842 		LOG_ERROR("sync_bits:");
1843 		print_bits((char *)lc->sync_bits, (int)lc->sync_bits[0], 1);
1844 		LOG_ERROR("clean_bits:");
1845 		print_bits((char *)lc->clean_bits, (int)lc->sync_bits[0], 1);
1846 
1847 		LOG_ERROR("Validating %s::", SHORT_UUID(lc->uuid));
1848 		r = find_next_zero_bit(lc->sync_bits, 0);
1849 		LOG_ERROR("  lc->region_count = %llu",
1850 			  (unsigned long long)lc->region_count);
1851 		LOG_ERROR("  lc->sync_count = %llu",
1852 			  (unsigned long long)lc->sync_count);
1853 		LOG_ERROR("  next zero bit  = %llu",
1854 			  (unsigned long long)r);
1855 		if ((r > lc->region_count) ||
1856 		    ((r == lc->region_count) && (lc->sync_count > lc->region_count))) {
1857 			LOG_ERROR("ADJUSTING SYNC_COUNT");
1858 			lc->sync_count = lc->region_count;
1859 		}
1860 
1861 		LOG_ERROR("Resync request history:");
1862 		for (i = 0; i < RESYNC_HISTORY; i++) {
1863 			lc->idx++;
1864 			lc->idx = lc->idx % RESYNC_HISTORY;
1865 			if (lc->resync_history[lc->idx][0] == '\0')
1866 				continue;
1867 			LOG_ERROR("%d:%d) %s", i, lc->idx,
1868 				  lc->resync_history[lc->idx]);
1869 		}
1870 	}
1871 }
1872