/*
 * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.9 2008/03/18 05:19:16 dillon Exp $
 */

#include "hammer.h"

#if 0

static int hammer_recover_buffer_stage2(hammer_cluster_t cluster,
				int32_t buf_no);
static int hammer_recover_record(hammer_cluster_t cluster,
				hammer_buffer_t buffer, int32_t rec_offset,
				hammer_record_ondisk_t rec);
static int hammer_recover_btree(hammer_cluster_t cluster,
				hammer_buffer_t buffer, int32_t rec_offset,
				hammer_record_ondisk_t rec);

/*
 * Recover a cluster.  The caller has referenced and locked the cluster.
 *
 * Generally returns 0 on success and EIO if the recovery was unsuccessful.
 *
 * WARNING!  The cluster being recovered must not have any cached buffers
 * (and hence no cached B-Tree nodes).  Any cached nodes would become
 * seriously corrupted, since we rip everything up and regenerate the
 * B-Tree.
 */
int
hammer_recover(hammer_cluster_t cluster)
{
	int buf_no;
	int rec_no;
	int maxblk;
	int nbuffers;
	int buffer_count;
	int record_count;

	kprintf("HAMMER_RECOVER %d:%d\n",
		cluster->volume->vol_no, cluster->clu_no);
	/*Debugger("RECOVER");*/
	KKASSERT(cluster->ondisk->synchronized_rec_id);
	if (RB_ROOT(&cluster->rb_bufs_root)) {
		panic("hammer_recover: cluster %d:%d has cached buffers!",
			cluster->volume->vol_no,
			cluster->clu_no);
	}

	if (hammer_alist_find(&cluster->volume->alist, cluster->clu_no,
			      cluster->clu_no + 1, 0) != cluster->clu_no) {
		Debugger("hammer_recover: cluster not allocated!");
	}

	nbuffers = cluster->ondisk->clu_limit / HAMMER_BUFSIZE;
	hammer_modify_cluster(cluster);

	/*
	 * Clear statistics.
	 */
	cluster->ondisk->stat_inodes = 0;
	cluster->ondisk->stat_records = 0;
	cluster->ondisk->stat_data_bufs = 0;
	cluster->ondisk->stat_rec_bufs = 0;
	cluster->ondisk->stat_idx_bufs = 0;

	/*
	 * Reset allocation heuristics.
	 */
	cluster->ondisk->idx_data = 1 * HAMMER_FSBUF_MAXBLKS;
	cluster->ondisk->idx_index = 0 * HAMMER_FSBUF_MAXBLKS;
	cluster->ondisk->idx_record = nbuffers * HAMMER_FSBUF_MAXBLKS;
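	/*
	 * Worked example (illustrative only; assumes the contemporary
	 * values HAMMER_BUFSIZE = 16384 and HAMMER_FSBUF_MAXBLKS = 256):
	 * a cluster with a clu_limit of 64MB has nbuffers = 4096, so
	 * data allocations begin scanning at blk 256 (buffer 1), index
	 * allocations at blk 0, and record allocations work backwards
	 * from blk 1048576 at the top of the cluster.
	 */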

	/*
	 * Re-initialize the master, B-Tree, and mdata A-lists, and
	 * recover the record A-list.
	 */
	hammer_alist_init(&cluster->alist_master, 1, nbuffers - 1,
			  HAMMER_ASTATE_FREE);
	hammer_alist_init(&cluster->alist_btree,
			  HAMMER_FSBUF_MAXBLKS,
			  (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
			  HAMMER_ASTATE_ALLOC);
	hammer_alist_init(&cluster->alist_mdata,
			  HAMMER_FSBUF_MAXBLKS,
			  (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
			  HAMMER_ASTATE_ALLOC);
	hammer_alist_recover(&cluster->alist_record,
			  0,
			  HAMMER_FSBUF_MAXBLKS,
			  (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS);
	kprintf("\n");

	kprintf("hammer_recover(1): cluster_free %d\n",
		cluster->alist_master.meta->bm_alist_freeblks);

	/*
	 * The cluster is now in good enough shape that general allocations
	 * are possible.  Construct an empty B-Tree root.
	 */
	{
		hammer_node_t croot;
		int error;

		croot = hammer_alloc_btree(cluster, &error);
		if (error == 0) {
			hammer_modify_node_noundo(croot);
			bzero(croot->ondisk, sizeof(*croot->ondisk));
			croot->ondisk->count = 0;
			croot->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
			cluster->ondisk->clu_btree_root = croot->node_offset;
			hammer_rel_node(croot);
		}
		KKASSERT(error == 0);
	}
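	/*
	 * The recovered cluster now has an empty leaf as its B-Tree
	 * root; the stage 2 scan below re-inserts every surviving
	 * record into this fresh tree.
	 */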
	kprintf("hammer_recover(2): cluster_free %d\n",
		cluster->alist_master.meta->bm_alist_freeblks);

	/*
	 * Scan the cluster's recovered record A-list.  Just get the meta
	 * blocks and ignore all-allocated/uninitialized sections (which
	 * we use to indicate reserved areas not assigned to record buffers).
	 *
	 * The all-free sections are initialized, which is indicated by
	 * the alist config's bl_inverted flag being set.  These sections
	 * will be returned for recovery purposes.
	 */
	buffer_count = 0;
	record_count = 0;

	rec_no = HAMMER_FSBUF_MAXBLKS;
	maxblk = nbuffers * HAMMER_FSBUF_MAXBLKS;
	for (;;) {
		rec_no = hammer_alist_find(&cluster->alist_record,
					   rec_no,
					   maxblk,
					   HAMMER_ALIST_FIND_NOSTACK |
					   HAMMER_ALIST_FIND_INITONLY);
		if (rec_no == HAMMER_ALIST_BLOCK_NONE)
			break;
		buf_no = rec_no / HAMMER_FSBUF_MAXBLKS;
		KKASSERT(buf_no > 0 && buf_no <= nbuffers);
		++buffer_count;
		kprintf("(%d)", buf_no);
		record_count += hammer_recover_buffer_stage2(cluster, buf_no);
		rec_no += HAMMER_FSBUF_MAXBLKS;
	}
	kprintf("HAMMER_RECOVER DONE %d:%d buffers=%d records=%d\n",
		cluster->volume->vol_no, cluster->clu_no,
		buffer_count, record_count);

	/*
	 * Validate the parent cluster pointer. XXX
	 */

	/*
	 * On successful recovery mark the cluster validated.
	 */
	cluster->io.validated = 1;
	return(0);
}
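
/*
 * Illustrative helper (ours, not part of the original source): the
 * record A-list is indexed in units of HAMMER_FSBUF_MAXBLKS per
 * buffer, so the scan loop above converts an A-list block number to
 * a buffer number with a simple divide, restated here.
 */
static __inline int32_t
hammer_recover_blk_to_buf_no(int32_t rec_no)
{
	/* each buffer owns HAMMER_FSBUF_MAXBLKS slots in the A-list */
	return(rec_no / HAMMER_FSBUF_MAXBLKS);
}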

/*
 * This is used as the A-list recovery callback and must return a
 * negative error code or a positive free block count.
 */
int
buffer_alist_recover(void *info, int32_t blk, int32_t radix, int32_t count)
{
	hammer_cluster_t cluster;
	hammer_record_ondisk_t rec;
	hammer_buffer_t buffer;
	int32_t buf_no;
	int32_t rec_no;
	int32_t rec_offset;
	int32_t r;
	int error;
	int xcount;

	/*
	 * Extract the cluster and the buffer number to recover.
	 */
	cluster = info;
	buf_no = blk / HAMMER_FSBUF_MAXBLKS;

	kprintf("(%d)", buf_no);
	buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
	if (error) {
		/*
		 * If we are unable to access the buffer, leave it in a
		 * reserved state on the master alist.
		 */
		kprintf("buffer_alist_recover: error "
			"recovering %d:%d:%d\n",
			cluster->volume->vol_no, cluster->clu_no, buf_no);
		r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, buf_no);
		KKASSERT(r == buf_no);
		return(-error);
	}
	KKASSERT(buffer->buf_type == HAMMER_FSBUF_RECORDS);

	/*
	 * If the buffer contains no allocated records tell our parent to
	 * mark it as all-allocated/uninitialized and do not reserve it
	 * in the master list.
	 */
	if (hammer_alist_find(&buffer->alist, 0, HAMMER_RECORD_NODES, 0) ==
	    HAMMER_ALIST_BLOCK_NONE) {
		kprintf("GENERAL RECOVERY BUFFER %d\n",
			blk / HAMMER_FSBUF_MAXBLKS);
		hammer_rel_buffer(buffer, 0);
		return(-EDOM);
	}

	/*
	 * Mark the buffer as allocated in the cluster's master A-list.
	 */
	r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, buf_no);
	KKASSERT(r == buf_no);
	++cluster->ondisk->stat_rec_bufs;

	kprintf("recover buffer1 %d:%d:%d cluster_free %d\n",
		cluster->volume->vol_no,
		cluster->clu_no, buf_no,
		cluster->alist_master.meta->bm_alist_freeblks);

	/*
	 * Recover the buffer, scan and validate allocated records.  Records
	 * which cannot be recovered are freed.
	 *
	 * The parent A-list must be properly adjusted, so don't just call
	 * hammer_alist_recover() on the underlying buffer; go through the
	 * parent.
	 */
	hammer_modify_buffer(buffer);
	count = hammer_alist_recover(&buffer->alist, 0, 0, HAMMER_RECORD_NODES);
	xcount = 0;
	kprintf("hammer_recover_buffer count1 %d/%d\n",
		HAMMER_RECORD_NODES - count, HAMMER_RECORD_NODES);
	rec_no = 0;
	for (;;) {
		rec_no = hammer_alist_find(&buffer->alist, rec_no,
					   HAMMER_RECORD_NODES, 0);
		if (rec_no == HAMMER_ALIST_BLOCK_NONE)
			break;
#if 0
		kprintf("recover record %d:%d:%d %d\n",
			cluster->volume->vol_no,
			cluster->clu_no, buf_no, rec_no);
#endif
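		/*
		 * Worked example (illustrative; assumes HAMMER_BUFSIZE is
		 * 16384): record 3 of buffer 5 lands at cluster offset
		 * 5 * 16384 + offsetof(union hammer_fsbuf_ondisk,
		 * record.recs[3]).
		 */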
		rec_offset = offsetof(union hammer_fsbuf_ondisk,
				      record.recs[rec_no]);
		rec_offset += buf_no * HAMMER_BUFSIZE;
		rec = &buffer->ondisk->record.recs[rec_no];
		error = hammer_recover_record(cluster, buffer, rec_offset, rec);
		if (error) {
			kprintf("hammer_recover_record: failed %d:%d@%d\n",
				cluster->clu_no, buffer->buf_no, rec_offset);
			hammer_alist_free(&buffer->alist, rec_no, 1);
			if (hammer_debug_recover_faults)
				Debugger("FAILED");
			++count;	/* free count */
			--xcount;
		}
		++rec_no;
		++xcount;
	}
	kprintf("hammer_recover_buffer count2 %d/%d/%d\n",
		HAMMER_RECORD_NODES - count, xcount, HAMMER_RECORD_NODES);
	KKASSERT(HAMMER_RECORD_NODES - count == xcount);
	hammer_rel_buffer(buffer, 0);
	return(count);
}
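
/*
 * Return conventions used above: -error leaves the inaccessible buffer
 * reserved in the master A-list, -EDOM tells the parent to mark an
 * empty record buffer's section all-allocated/uninitialized, and a
 * positive return is the section's free block count.
 */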

/*
 * Recover a record, at least into a state that doesn't blow up the
 * filesystem.  Returns 0 on success, non-zero if the record is
 * unrecoverable.
 */
static int
hammer_recover_record(hammer_cluster_t cluster, hammer_buffer_t buffer,
			     int32_t rec_offset, hammer_record_ondisk_t rec)
{
	hammer_buffer_t dbuf;
	u_int64_t syncid = cluster->ondisk->synchronized_rec_id;
	int32_t data_offset;
	int32_t data_len;
	int32_t nblks;
	int32_t dbuf_no;
	int32_t dblk_no;
	int32_t base_blk;
	int32_t r;
	int error = 0;

	/*
	 * We have to discard any records with rec_ids greater than or
	 * equal to the last sync of the cluster header (which guaranteed
	 * that all related buffers had been synced).  Otherwise the record
	 * may reference information that was never synced to disk.
	 */
	if (rec->base.rec_id >= syncid) {
		kprintf("recover record: syncid too large %016llx/%016llx\n",
			rec->base.rec_id, syncid);
		if (hammer_debug_recover_faults)
			Debugger("DebugSyncid");
		return(EINVAL);
	}

#if 0
	/* XXX undo incomplete deletions */
	if (rec->base.base.delete_tid > syncid)
		rec->base.base.delete_tid = 0;
#endif

	/*
	 * Validate the record's B-Tree key
	 */
	KKASSERT(rec->base.base.rec_type != 0);
	if (rec->base.base.rec_type != HAMMER_RECTYPE_CLUSTER) {
		if (hammer_btree_cmp(&rec->base.base,
				     &cluster->ondisk->clu_btree_beg) < 0)  {
			kprintf("recover record: range low\n");
			Debugger("RANGE LOW");
			return(EINVAL);
		}
		if (hammer_btree_cmp(&rec->base.base,
				     &cluster->ondisk->clu_btree_end) >= 0)  {
			kprintf("recover record: range high\n");
			Debugger("RANGE HIGH");
			return(EINVAL);
		}
	}

	/*
	 * Validate the record's data.  If the offset is 0 there is no data
	 * (or it is zero-fill) and we can return success immediately.
	 * Otherwise make sure everything is ok.
	 */
	data_offset = rec->base.data_offset;
	data_len = rec->base.data_len;

	if (data_len == 0)
		rec->base.data_offset = data_offset = 0;
	if (data_offset == 0)
		goto done;

	/*
	 * Non-zero data offset, recover the data
	 */
	if (data_offset < HAMMER_BUFSIZE ||
	    data_offset >= cluster->ondisk->clu_limit ||
	    data_len < 0 || data_len > HAMMER_MAXDATA ||
	    data_offset + data_len > cluster->ondisk->clu_limit) {
		kprintf("recover record: bad offset/len %d/%d\n",
			data_offset, data_len);
		Debugger("BAD OFFSET");
		return(EINVAL);
	}

	/*
	 * Check data_offset relative to rec_offset
	 */
	if (data_offset < rec_offset && data_offset + data_len > rec_offset) {
		kprintf("recover record: bad offset: overlapping1\n");
		Debugger("BAD OFFSET - OVERLAP1");
		return(EINVAL);
	}
	if (data_offset >= rec_offset &&
	    data_offset < rec_offset + sizeof(struct hammer_base_record)) {
		kprintf("recover record: bad offset: overlapping2\n");
		Debugger("BAD OFFSET - OVERLAP2");
		return(EINVAL);
	}
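	/*
	 * Illustration of the two overlap cases rejected above
	 * (offsets grow to the right):
	 *
	 *   overlapping1:  [ data ........ )              data straddles
	 *                         [ record base hdr )     the record start
	 *
	 *   overlapping2:  [ record base hdr )            data begins inside
	 *                         [ data ... )            the record's base
	 */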

	/*
	 * Check for data embedded in the record
	 */
	if (data_offset >= rec_offset &&
	    data_offset < rec_offset + HAMMER_RECORD_SIZE) {
		if (data_offset + data_len > rec_offset + HAMMER_RECORD_SIZE) {
			kprintf("recover record: bad offset: overlapping3\n");
			Debugger("BAD OFFSET - OVERLAP3");
			return(EINVAL);
		}
		goto done;
	}

	KKASSERT(cluster->io.modified);
	/*
	 * Recover the allocated data either out of the cluster's master alist
	 * or as a buffer sub-allocation.
	 */
	if ((data_len & HAMMER_BUFMASK) == 0) {
		if (data_offset & HAMMER_BUFMASK) {
			kprintf("recover record: bad offset: unaligned\n");
			Debugger("BAD OFFSET - UNALIGNED");
			return(EINVAL);
		}
		nblks = data_len / HAMMER_BUFSIZE;
		dbuf_no = data_offset / HAMMER_BUFSIZE;
		/* XXX power-of-2 check data_len */

		r = hammer_alist_alloc_fwd(&cluster->alist_master,
					   nblks, dbuf_no);
		if (r == HAMMER_ALIST_BLOCK_NONE) {
			kprintf("recover record: cannot recover offset1\n");
			Debugger("CANNOT ALLOC DATABUFFER");
			return(EINVAL);
		}
		if (r != dbuf_no) {
			kprintf("recover record: cannot recover offset2\n");
			hammer_alist_free(&cluster->alist_master, r, nblks);
			KKASSERT(0);
			return(EINVAL);
		}
		++cluster->ondisk->stat_data_bufs;
	} else {
		if ((data_offset & ~HAMMER_BUFMASK) !=
		    ((data_offset + data_len - 1) & ~HAMMER_BUFMASK)) {
			kprintf("recover record: overlaps multiple bufs\n");
			Debugger("OVERLAP MULT");
			return(EINVAL);
		}
		if ((data_offset & HAMMER_BUFMASK) <
		    sizeof(struct hammer_fsbuf_head)) {
			kprintf("recover record: data in header area\n");
			Debugger("DATA IN HEADER AREA");
			return(EINVAL);
		}
		if (data_offset & HAMMER_DATA_BLKMASK) {
			kprintf("recover record: data blk unaligned\n");
			Debugger("DATA BLK UNALIGNED");
			return(EINVAL);
		}

		/*
		 * Ok, recover the space in the data buffer.
		 */
		dbuf_no = data_offset / HAMMER_BUFSIZE;
		r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, dbuf_no);
		if (r != dbuf_no && r != HAMMER_ALIST_BLOCK_NONE)
			hammer_alist_free(&cluster->alist_master, r, 1);
		if (r == dbuf_no) {
			/*
			 * This is the first time we've tried to recover
			 * data in this data buffer, so reinitialize it
			 * (but don't zero it out, obviously).
			 *
			 * Calling initbuffer marks the data blocks within
			 * the buffer as being all-allocated.  We have to
			 * mark them free.
			 */
			dbuf = hammer_get_buffer(cluster, dbuf_no,
						 0, &error);
			if (error == 0) {
				KKASSERT(dbuf->buf_type == HAMMER_FSBUF_DATA);
				hammer_modify_buffer(dbuf);
				hammer_initbuffer(&dbuf->alist,
						  &dbuf->ondisk->head,
						  HAMMER_FSBUF_DATA);
				/*dbuf->buf_type = HAMMER_FSBUF_DATA;*/
				base_blk = dbuf_no * HAMMER_FSBUF_MAXBLKS;
				hammer_alist_free(&cluster->alist_mdata,
						  base_blk,
						  HAMMER_DATA_NODES);
				kprintf("FREE DATA %d/%d\n",
					base_blk, HAMMER_DATA_NODES);
				++cluster->ondisk->stat_data_bufs;
			}
		} else {
			/*
			 * We've seen this data buffer before.
			 */
			dbuf = hammer_get_buffer(cluster, dbuf_no,
						 0, &error);
		}
		if (error) {
			kprintf("recover record: data: getbuf failed\n");
			KKASSERT(0);
			return(EINVAL);
		}

		if (dbuf->buf_type != HAMMER_FSBUF_DATA) {
			hammer_rel_buffer(dbuf, 0);
			kprintf("recover record: data: wrong buffer type\n");
			KKASSERT(0);
			return(EINVAL);
		}

		/*
		 * Figure out the data block number and number of blocks.
		 */
		nblks = (data_len + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
		nblks /= HAMMER_DATA_BLKSIZE;
		dblk_no = ((data_offset & HAMMER_BUFMASK) -
			   offsetof(union hammer_fsbuf_ondisk, data.data)) /
			  HAMMER_DATA_BLKSIZE;
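		/*
		 * Arithmetic example (illustrative; assumes the
		 * contemporary HAMMER_DATA_BLKSIZE of 64): data_len = 100
		 * rounds up to nblks = 2, and a data_offset landing on
		 * data.data[2] within the buffer yields dblk_no = 2.
		 */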
		if ((data_offset & HAMMER_BUFMASK) !=
		    offsetof(union hammer_fsbuf_ondisk, data.data[dblk_no])) {
			kprintf("dblk_no %d does not match data_offset %d/%d\n",
				dblk_no,
				offsetof(union hammer_fsbuf_ondisk,
					 data.data[dblk_no]),
				(data_offset & HAMMER_BUFMASK));
			hammer_rel_buffer(dbuf, 0);
			kprintf("recover record: data: not block aligned\n");
			Debugger("bad data");
			return(EINVAL);
		}
		hammer_modify_buffer(dbuf);
		dblk_no += dbuf_no * HAMMER_FSBUF_MAXBLKS;
		r = hammer_alist_alloc_fwd(&cluster->alist_mdata, nblks,
					   dblk_no);
		if (r != dblk_no) {
			if (r != HAMMER_ALIST_BLOCK_NONE)
				hammer_alist_free(&cluster->alist_mdata, r,
						  nblks);
			hammer_rel_buffer(dbuf, 0);
			kprintf("recover record: data: unable to realloc "
				"dbuf %d dblk %d\n",
				dbuf_no, dblk_no % HAMMER_FSBUF_MAXBLKS);
			KKASSERT(0);
			return(EINVAL);
		}
		hammer_rel_buffer(dbuf, 0);
	}
done:
	return(0);
}
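
/*
 * Summary of the validation pipeline above: (1) discard records whose
 * rec_id is >= the cluster's synchronized_rec_id, (2) range-check the
 * B-Tree key against clu_btree_beg/clu_btree_end, (3) sanity-check
 * data_offset/data_len against the cluster limit and the record's own
 * extent, then (4) re-reserve the data blocks in the master or mdata
 * A-list.
 */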

/*
 * Rebuild the B-Tree for the records residing in the specified buffer.
 *
 * Returns the number of records recovered.
 */
static int
hammer_recover_buffer_stage2(hammer_cluster_t cluster, int32_t buf_no)
{
	hammer_record_ondisk_t rec;
	hammer_buffer_t buffer;
	int32_t rec_no;
	int32_t rec_offset;
	int record_count = 0;
	int error;

	buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
	if (error) {
		/*
		 * If we are unable to access the buffer, leave it in a
		 * reserved state on the master alist.
		 */
		kprintf("hammer_recover_buffer_stage2: error "
			"recovering %d:%d:%d\n",
			cluster->volume->vol_no, cluster->clu_no, buf_no);
		Debugger("RECOVER BUFFER STAGE2 FAIL");
		return(0);
	}

	/*
	 * Scan the buffer's allocated records and enter each valid one
	 * into the B-Tree.  Records which cannot be inserted are left
	 * alone (see the XXX below).
	 */
	rec_no = 0;
	for (;;) {
		rec_no = hammer_alist_find(&buffer->alist, rec_no,
					   HAMMER_RECORD_NODES, 0);
		if (rec_no == HAMMER_ALIST_BLOCK_NONE)
			break;
		rec_offset = offsetof(union hammer_fsbuf_ondisk,
				      record.recs[rec_no]);
		rec_offset += buf_no * HAMMER_BUFSIZE;
		rec = &buffer->ondisk->record.recs[rec_no];
		error = hammer_recover_btree(cluster, buffer, rec_offset, rec);
		if (error) {
			kprintf("hammer_recover_btree: failed %d:%d@%08x "
				"error %d buffer %p rec %p rec_no %d "
				"cluster_free %d\n",
				cluster->clu_no, buffer->buf_no, rec_offset,
				error, buffer, rec, rec_no,
				cluster->alist_master.meta->bm_alist_freeblks
			);
			Debugger("recover_btree failed");
			/* XXX free the record and its data? */
			/*hammer_alist_free(&buffer->alist, rec_no, 1);*/
		} else {
			++record_count;
		}
		++rec_no;
	}
	hammer_rel_buffer(buffer, 0);
	return(record_count);
}
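
/*
 * Note on the two-stage design: stage 1 (buffer_alist_recover) validates
 * each record and re-reserves its space; stage 2 runs once the cluster's
 * A-lists are consistent and inserts the surviving records into the
 * freshly constructed B-Tree.
 */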

/*
 * Enter a single record into the B-Tree.
 */
static int
hammer_recover_btree(hammer_cluster_t cluster, hammer_buffer_t buffer,
		      int32_t rec_offset, hammer_record_ondisk_t rec)
{
	struct hammer_cursor cursor;
	union hammer_btree_elm elm;
	hammer_cluster_t ncluster;
	int error = 0;

	/*
	 * Check for a spike record.  When spiking into a new cluster do
	 * NOT allow a recursive recovery to occur.  We use a lot of
	 * stack and the only thing we actually modify in the target
	 * cluster is its parent pointer.
	 */
	if (rec->base.base.rec_type == HAMMER_RECTYPE_CLUSTER) {
		hammer_volume_t ovolume = cluster->volume;
		hammer_volume_t nvolume;

		nvolume = hammer_get_volume(ovolume->hmp, rec->spike.vol_no,
					    &error);
		if (error) {
			Debugger("recover_btree1");
			return(error);
		}
		ncluster = hammer_get_cluster(nvolume, rec->spike.clu_no,
					      &error, GET_CLUSTER_NORECOVER);
		hammer_rel_volume(nvolume, 0);
		if (error) {
			Debugger("recover_btree2");
			return(error);
		}

		/*
		 * Validate the cluster.  Allow the offset to be fixed up.
		 */
		if (ncluster->ondisk->clu_btree_parent_vol_no != ovolume->vol_no ||
		    ncluster->ondisk->clu_btree_parent_clu_no != cluster->clu_no) {
			kprintf("hammer_recover: Bad cluster spike hookup: "
				"%d:%d != %d:%d\n",
				ncluster->ondisk->clu_btree_parent_vol_no,
				ncluster->ondisk->clu_btree_parent_clu_no,
				ovolume->vol_no,
				cluster->clu_no);
			error = EINVAL;
			hammer_rel_cluster(ncluster, 0);
			Debugger("recover_btree3");
			return(error);
		}
	} else {
		ncluster = NULL;
	}

	/*
	 * Locate the insertion point.  Note that we are using the cluster-
	 * localized cursor init so parent will start out NULL.
	 *
	 * The keys used for spikes are bounds and differ from the key
	 * embedded in the spike record.  A special B-Tree insertion
	 * call is made to deal with spikes.
	 */
	error = hammer_init_cursor_cluster(&cursor, cluster);
	if (error) {
		Debugger("recover_btree6");
		goto failed;
	}
	KKASSERT(cursor.node);
	if (ncluster)
		cursor.key_beg = ncluster->ondisk->clu_btree_beg;
	else
		cursor.key_beg = rec->base.base;
	cursor.flags |= HAMMER_CURSOR_INSERT | HAMMER_CURSOR_RECOVER;

	error = hammer_btree_lookup(&cursor);
	KKASSERT(error != EDEADLK);
	KKASSERT(cursor.node);
	if (error == 0) {
		kprintf("hammer_recover_btree: Duplicate record "
			"cursor %p rec %p ncluster %p\n",
			&cursor, rec, ncluster);
		hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
				       HAMMER_BTREE_TYPE_LEAF, cursor.index);
		Debugger("duplicate record");
	}
	if (error != ENOENT) {
		Debugger("recover_btree5");
		goto failed;
	}

	if (ncluster) {
		/*
		 * Spike record
		 */
		kprintf("recover spike clu %d %016llx-%016llx clusterfree %d\n",
			ncluster->clu_no,
			ncluster->ondisk->clu_btree_beg.obj_id,
			ncluster->ondisk->clu_btree_end.obj_id,
			cluster->alist_master.meta->bm_alist_freeblks);
		error = hammer_btree_insert_cluster(&cursor, ncluster,
						    rec_offset);
		kprintf("recover spike record error %d clusterfree %d\n",
			error,
			cluster->alist_master.meta->bm_alist_freeblks);
		KKASSERT(error != EDEADLK);
		if (error)
			Debugger("spike recovery");
	} else {
		/*
		 * Normal record
		 */
#if 0
		kprintf("recover recrd clu %d %016llx\n",
			cluster->clu_no, rec->base.base.obj_id);
#endif
		elm.leaf.base = rec->base.base;
		elm.leaf.rec_offset = rec_offset;
		elm.leaf.data_offset = rec->base.data_offset;
		elm.leaf.data_len = rec->base.data_len;
		elm.leaf.data_crc = rec->base.data_crc;

		error = hammer_btree_insert(&cursor, &elm);
		KKASSERT(error != EDEADLK);
	}

	/*
	 * Success if error is 0!
	 */
	if (error == 0) {
		/*
		 * Update the cluster header's statistics count.  stat_records
		 * is very important for proper reservation of B-Tree space.
		 * Note that a spike record counts as 2.
		 */
		++cluster->ondisk->stat_records;
		if (rec->base.base.rec_type == HAMMER_RECTYPE_INODE)
			++cluster->ondisk->stat_inodes;
		if (rec->base.base.rec_type == HAMMER_RECTYPE_CLUSTER)
			++cluster->ondisk->stat_records;
	}
	if (error) {
		kprintf("hammer_recover_btree: insertion failed\n");
	}

failed:
	if (ncluster)
		hammer_rel_cluster(ncluster, 0);
	hammer_done_cursor(&cursor);
	return(error);
}

#endif
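
/*
 * Illustrative usage sketch (ours, not from the original source): the
 * recovery entry point is called with a cluster the caller has already
 * referenced and locked, roughly:
 *
 *	error = hammer_recover(cluster);
 *	if (error)
 *		kprintf("cluster %d:%d recovery failed\n",
 *			cluster->volume->vol_no, cluster->clu_no);
 */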