1 /* 2 * Copyright (c) 2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * $DragonFly: src/sys/vfs/hammer/hammer_prune.c,v 1.19 2008/09/23 21:03:52 dillon Exp $ 35 */ 36 37 #include "hammer.h" 38 39 /* 40 * Iterate through the specified range of object ids and remove any 41 * deleted records that fall entirely within a prune modulo. 42 * 43 * A reverse iteration is used to prevent overlapping records from being 44 * created during the iteration due to alignments. This also allows us 45 * to adjust alignments without blowing up the B-Tree. 46 */ 47 static int prune_should_delete(struct hammer_ioc_prune *prune, 48 hammer_btree_leaf_elm_t elm); 49 static void prune_check_nlinks(hammer_cursor_t cursor, 50 hammer_btree_leaf_elm_t elm); 51 52 int 53 hammer_ioc_prune(hammer_transaction_t trans, hammer_inode_t ip, 54 struct hammer_ioc_prune *prune) 55 { 56 struct hammer_cursor cursor; 57 hammer_btree_leaf_elm_t elm; 58 struct hammer_ioc_prune_elm *copy_elms; 59 struct hammer_ioc_prune_elm *user_elms; 60 int error; 61 int isdir; 62 int elm_array_size; 63 int seq; 64 65 if (prune->nelms < 0 || prune->nelms > HAMMER_MAX_PRUNE_ELMS) 66 return(EINVAL); 67 if ((prune->key_beg.localization | prune->key_end.localization) & 68 HAMMER_LOCALIZE_PSEUDOFS_MASK) { 69 return(EINVAL); 70 } 71 if (prune->key_beg.localization > prune->key_end.localization) 72 return(EINVAL); 73 if (prune->key_beg.localization == prune->key_end.localization) { 74 if (prune->key_beg.obj_id > prune->key_end.obj_id) 75 return(EINVAL); 76 /* key-space limitations - no check needed */ 77 } 78 if ((prune->head.flags & HAMMER_IOC_PRUNE_ALL) && prune->nelms) 79 return(EINVAL); 80 81 prune->key_cur.localization = (prune->key_end.localization & 82 HAMMER_LOCALIZE_MASK) + 83 ip->obj_localization; 84 prune->key_cur.obj_id = prune->key_end.obj_id; 85 prune->key_cur.key = HAMMER_MAX_KEY; 86 87 /* 88 * Copy element array from userland 89 */ 90 elm_array_size = sizeof(*copy_elms) * prune->nelms; 91 user_elms = prune->elms; 92 copy_elms = kmalloc(elm_array_size, M_TEMP, M_WAITOK); 93 if ((error = copyin(user_elms, copy_elms, elm_array_size)) != 0) 94 goto failed; 95 prune->elms = copy_elms; 96 97 seq = trans->hmp->flusher.act; 98 99 /* 100 * Scan backwards. Retries typically occur if a deadlock is detected. 101 */ 102 retry: 103 error = hammer_init_cursor(trans, &cursor, NULL, NULL); 104 if (error) { 105 hammer_done_cursor(&cursor); 106 goto failed; 107 } 108 cursor.key_beg.localization = (prune->key_beg.localization & 109 HAMMER_LOCALIZE_MASK) + 110 ip->obj_localization; 111 cursor.key_beg.obj_id = prune->key_beg.obj_id; 112 cursor.key_beg.key = HAMMER_MIN_KEY; 113 cursor.key_beg.create_tid = 1; 114 cursor.key_beg.delete_tid = 0; 115 cursor.key_beg.rec_type = HAMMER_MIN_RECTYPE; 116 cursor.key_beg.obj_type = 0; 117 118 cursor.key_end.localization = prune->key_cur.localization; 119 cursor.key_end.obj_id = prune->key_cur.obj_id; 120 cursor.key_end.key = prune->key_cur.key; 121 cursor.key_end.create_tid = HAMMER_MAX_TID - 1; 122 cursor.key_end.delete_tid = 0; 123 cursor.key_end.rec_type = HAMMER_MAX_RECTYPE; 124 cursor.key_end.obj_type = 0; 125 126 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 127 cursor.flags |= HAMMER_CURSOR_BACKEND; 128 129 /* 130 * This flag allows the B-Tree code to clean up loose ends. At 131 * the moment (XXX) it also means we have to hold the sync lock 132 * through the iteration. 133 */ 134 cursor.flags |= HAMMER_CURSOR_PRUNING; 135 136 hammer_sync_lock_sh(trans); 137 error = hammer_btree_last(&cursor); 138 hammer_sync_unlock(trans); 139 140 while (error == 0) { 141 /* 142 * Check for work 143 */ 144 elm = &cursor.node->ondisk->elms[cursor.index].leaf; 145 prune->key_cur = elm->base; 146 147 /* 148 * Yield to more important tasks 149 */ 150 if ((error = hammer_signal_check(trans->hmp)) != 0) 151 break; 152 153 if (prune->stat_oldest_tid > elm->base.create_tid) 154 prune->stat_oldest_tid = elm->base.create_tid; 155 156 if (hammer_debug_general & 0x0200) { 157 kprintf("check %016llx %016llx cre=%016llx del=%016llx\n", 158 (long long)elm->base.obj_id, 159 (long long)elm->base.key, 160 (long long)elm->base.create_tid, 161 (long long)elm->base.delete_tid); 162 } 163 164 if (prune_should_delete(prune, elm)) { 165 if (hammer_debug_general & 0x0200) { 166 kprintf("check %016llx %016llx: DELETE\n", 167 (long long)elm->base.obj_id, 168 (long long)elm->base.key); 169 } 170 171 /* 172 * NOTE: This can return EDEADLK 173 * 174 * Acquiring the sync lock guarantees that the 175 * operation will not cross a synchronization 176 * boundary (see the flusher). 177 * 178 * We dont need to track inodes or next_tid when 179 * we are destroying deleted records. 180 */ 181 isdir = (elm->base.rec_type == HAMMER_RECTYPE_DIRENTRY); 182 183 hammer_sync_lock_sh(trans); 184 error = hammer_delete_at_cursor(&cursor, 185 HAMMER_DELETE_DESTROY, 186 cursor.trans->tid, 187 cursor.trans->time32, 188 0, &prune->stat_bytes); 189 hammer_sync_unlock(trans); 190 if (error) 191 break; 192 193 if (isdir) 194 ++prune->stat_dirrecords; 195 else 196 ++prune->stat_rawrecords; 197 198 /* 199 * The current record might now be the one after 200 * the one we deleted, set ATEDISK to force us 201 * to skip it (since we are iterating backwards). 202 */ 203 cursor.flags |= HAMMER_CURSOR_ATEDISK; 204 } else { 205 /* 206 * Nothing to delete, but we may have to check other 207 * things. 208 */ 209 prune_check_nlinks(&cursor, elm); 210 cursor.flags |= HAMMER_CURSOR_ATEDISK; 211 if (hammer_debug_general & 0x0100) { 212 kprintf("check %016llx %016llx: SKIP\n", 213 (long long)elm->base.obj_id, 214 (long long)elm->base.key); 215 } 216 } 217 ++prune->stat_scanrecords; 218 219 while (hammer_flusher_meta_halflimit(trans->hmp) || 220 hammer_flusher_undo_exhausted(trans, 2)) { 221 hammer_unlock_cursor(&cursor); 222 hammer_flusher_wait(trans->hmp, seq); 223 hammer_lock_cursor(&cursor); 224 seq = hammer_flusher_async_one(trans->hmp); 225 } 226 hammer_sync_lock_sh(trans); 227 error = hammer_btree_iterate_reverse(&cursor); 228 hammer_sync_unlock(trans); 229 } 230 if (error == ENOENT) 231 error = 0; 232 hammer_done_cursor(&cursor); 233 if (error == EDEADLK) 234 goto retry; 235 if (error == EINTR) { 236 prune->head.flags |= HAMMER_IOC_HEAD_INTR; 237 error = 0; 238 } 239 failed: 240 prune->key_cur.localization &= HAMMER_LOCALIZE_MASK; 241 prune->elms = user_elms; 242 kfree(copy_elms, M_TEMP); 243 return(error); 244 } 245 246 /* 247 * Check pruning list. The list must be sorted in descending order. 248 * 249 * Return non-zero if the record should be deleted. 250 */ 251 static int 252 prune_should_delete(struct hammer_ioc_prune *prune, hammer_btree_leaf_elm_t elm) 253 { 254 struct hammer_ioc_prune_elm *scan; 255 int i; 256 257 /* 258 * If pruning everything remove all records with a non-zero 259 * delete_tid. 260 */ 261 if (prune->head.flags & HAMMER_IOC_PRUNE_ALL) { 262 if (elm->base.delete_tid != 0) 263 return(1); 264 return(0); 265 } 266 267 for (i = 0; i < prune->nelms; ++i) { 268 scan = &prune->elms[i]; 269 270 /* 271 * Check for loop termination. 272 */ 273 if (elm->base.create_tid >= scan->end_tid || 274 elm->base.delete_tid > scan->end_tid) { 275 break; 276 } 277 278 /* 279 * Determine if we can delete the record. 280 */ 281 if (elm->base.delete_tid && 282 elm->base.create_tid >= scan->beg_tid && 283 elm->base.delete_tid <= scan->end_tid && 284 (elm->base.create_tid - scan->beg_tid) / scan->mod_tid == 285 (elm->base.delete_tid - scan->beg_tid) / scan->mod_tid) { 286 return(1); 287 } 288 } 289 return(0); 290 } 291 292 /* 293 * Dangling inodes can occur if processes are holding open descriptors on 294 * deleted files as-of when a machine crashes. When we find one simply 295 * acquire the inode and release it. The inode handling code will then 296 * do the right thing. 297 */ 298 static 299 void 300 prune_check_nlinks(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm) 301 { 302 hammer_inode_t ip; 303 int error; 304 305 if (elm->base.rec_type != HAMMER_RECTYPE_INODE) 306 return; 307 if (elm->base.delete_tid != 0) 308 return; 309 if (hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA)) 310 return; 311 if (cursor->data->inode.nlinks) 312 return; 313 hammer_cursor_downgrade(cursor); 314 ip = hammer_get_inode(cursor->trans, NULL, elm->base.obj_id, 315 HAMMER_MAX_TID, 316 elm->base.localization & HAMMER_LOCALIZE_PSEUDOFS_MASK, 317 0, &error); 318 if (ip) { 319 if (hammer_debug_general & 0x0001) { 320 kprintf("pruning disconnected inode %016llx\n", 321 (long long)elm->base.obj_id); 322 } 323 hammer_rel_inode(ip, 0); 324 hammer_inode_waitreclaims(cursor->trans->hmp); 325 } else { 326 kprintf("unable to prune disconnected inode %016llx\n", 327 (long long)elm->base.obj_id); 328 } 329 } 330 331 #if 0 332 333 /* 334 * NOTE: THIS CODE HAS BEEN REMOVED! Pruning no longer attempts to realign 335 * adjacent records because it seriously interferes with every 336 * mirroring algorithm I could come up with. 337 * 338 * This means that historical accesses beyond the first snapshot 339 * softlink should be on snapshot boundaries only. Historical 340 * accesses from "now" to the first snapshot softlink continue to 341 * be fine-grained. 342 * 343 * NOTE: It also looks like there's a bug in the removed code. It is believed 344 * that create_tid can sometimes get set to 0xffffffffffffffff. Just as 345 * well we no longer try to do this fancy shit. Probably the attempt to 346 * correct the rhb is blowing up the cursor's indexing or addressing mapping. 347 * 348 * Align the record to cover any gaps created through the deletion of 349 * records within the pruning space. If we were to just delete the records 350 * there would be gaps which in turn would cause a snapshot that is NOT on 351 * a pruning boundary to appear corrupt to the user. Forcing alignment 352 * of the create_tid and delete_tid for retained records 'reconnects' 353 * the previously contiguous space, making it contiguous again after the 354 * deletions. 355 * 356 * The use of a reverse iteration allows us to safely align the records and 357 * related elements without creating temporary overlaps. XXX we should 358 * add ordering dependancies for record buffers to guarantee consistency 359 * during recovery. 360 */ 361 static int 362 realign_prune(struct hammer_ioc_prune *prune, 363 hammer_cursor_t cursor, int realign_cre, int realign_del) 364 { 365 struct hammer_ioc_prune_elm *scan; 366 hammer_btree_elm_t elm; 367 hammer_tid_t delta; 368 hammer_tid_t tid; 369 int error; 370 371 hammer_cursor_downgrade(cursor); 372 373 elm = &cursor->node->ondisk->elms[cursor->index]; 374 ++prune->stat_realignments; 375 376 /* 377 * Align the create_tid. By doing a reverse iteration we guarantee 378 * that all records after our current record have already been 379 * aligned, allowing us to safely correct the right-hand-boundary 380 * (because no record to our right is otherwise exactly matching 381 * will have a create_tid to the left of our aligned create_tid). 382 */ 383 error = 0; 384 if (realign_cre >= 0) { 385 scan = &prune->elms[realign_cre]; 386 387 delta = (elm->leaf.base.create_tid - scan->beg_tid) % 388 scan->mod_tid; 389 if (delta) { 390 tid = elm->leaf.base.create_tid - delta + scan->mod_tid; 391 392 /* can EDEADLK */ 393 error = hammer_btree_correct_rhb(cursor, tid + 1); 394 if (error == 0) { 395 error = hammer_btree_extract(cursor, 396 HAMMER_CURSOR_GET_LEAF); 397 } 398 if (error == 0) { 399 /* can EDEADLK */ 400 error = hammer_cursor_upgrade(cursor); 401 } 402 if (error == 0) { 403 hammer_modify_node(cursor->trans, cursor->node, 404 &elm->leaf.base.create_tid, 405 sizeof(elm->leaf.base.create_tid)); 406 elm->leaf.base.create_tid = tid; 407 hammer_modify_node_done(cursor->node); 408 } 409 } 410 } 411 412 /* 413 * Align the delete_tid. This only occurs if the record is historical 414 * was deleted at some point. Realigning the delete_tid does not 415 * move the record within the B-Tree but may cause it to temporarily 416 * overlap a record that has not yet been pruned. 417 */ 418 if (error == 0 && realign_del >= 0) { 419 scan = &prune->elms[realign_del]; 420 421 delta = (elm->leaf.base.delete_tid - scan->beg_tid) % 422 scan->mod_tid; 423 if (delta) { 424 error = hammer_btree_extract(cursor, 425 HAMMER_CURSOR_GET_LEAF); 426 if (error == 0) { 427 hammer_modify_node(cursor->trans, cursor->node, 428 &elm->leaf.base.delete_tid, 429 sizeof(elm->leaf.base.delete_tid)); 430 elm->leaf.base.delete_tid = 431 elm->leaf.base.delete_tid - 432 delta + scan->mod_tid; 433 hammer_modify_node_done(cursor->node); 434 } 435 } 436 } 437 return (error); 438 } 439 440 #endif 441