/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>

static void dbuf_destroy(dmu_buf_impl_t *db);
static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);

/*
 * Global data structures and functions for the dbuf cache.
 */
static kmem_cache_t *dbuf_cache;

/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
	dmu_buf_impl_t *db = vdb;
	bzero(db, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	refcount_create(&db->db_holds);
	return (0);
}

/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	cv_destroy(&db->db_changed);
	refcount_destroy(&db->db_holds);
}

/*
 * dbuf hash table routines
 */
static dbuf_hash_table_t dbuf_hash_table;

static uint64_t dbuf_hash_count;

static uint64_t
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	uintptr_t osv = (uintptr_t)os;
	uint64_t crc = -1ULL;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];

	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);

	return (crc);
}

#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)

#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))

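/*
 * Look up a dbuf for the given dnode object / level / blkid in the hash
 * table.  If an entry is found and is not being evicted, it is returned
 * with its db_mtx held; otherwise NULL is returned.
 */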
dmu_buf_impl_t *
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = dn->dn_objset;
	uint64_t obj = dn->dn_object;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *db;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, 1);

	return (NULL);
}

/*
 * Remove an entry from the hash table.  This operation will
 * fail if there are any existing holds on the db.
 */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf, **dbp;

	/*
	 * We mustn't hold db_mtx, to maintain the lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
	 */
	ASSERT(refcount_is_zero(&db->db_holds));
	ASSERT(db->db_state == DB_EVICTING);
	ASSERT(!MUTEX_HELD(&db->db_mtx));

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	dbp = &h->hash_table[idx];
	while ((dbf = *dbp) != db) {
		dbp = &dbf->db_hash_next;
		ASSERT(dbf != NULL);
	}
	*dbp = db->db_hash_next;
	db->db_hash_next = NULL;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, -1);
}

static arc_evict_func_t dbuf_do_evict;

static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_level != 0 || db->db_evict_func == NULL)
		return;

	if (db->db_user_data_ptr_ptr)
		*db->db_user_data_ptr_ptr = db->db.db_data;
	db->db_evict_func(&db->db, db->db_user_ptr);
	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
}

void
dbuf_evict(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	dbuf_clear(db);
	dbuf_destroy(db);
}

void
dbuf_init(void)
{
	uint64_t hsize = 1ULL << 16;
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 4K block size.  The table will take up
	 * totalmem*sizeof(void*)/4K (i.e.
2MB/GB with 8-byte pointers). 241 */ 242 while (hsize * 4096 < (uint64_t)physmem * PAGESIZE) 243 hsize <<= 1; 244 245 retry: 246 h->hash_table_mask = hsize - 1; 247 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 248 if (h->hash_table == NULL) { 249 /* XXX - we should really return an error instead of assert */ 250 ASSERT(hsize > (1ULL << 10)); 251 hsize >>= 1; 252 goto retry; 253 } 254 255 dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 256 sizeof (dmu_buf_impl_t), 257 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 258 259 for (i = 0; i < DBUF_MUTEXES; i++) 260 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 261 } 262 263 void 264 dbuf_fini(void) 265 { 266 dbuf_hash_table_t *h = &dbuf_hash_table; 267 int i; 268 269 for (i = 0; i < DBUF_MUTEXES; i++) 270 mutex_destroy(&h->hash_mutexes[i]); 271 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 272 kmem_cache_destroy(dbuf_cache); 273 } 274 275 /* 276 * Other stuff. 277 */ 278 279 #ifdef ZFS_DEBUG 280 static void 281 dbuf_verify(dmu_buf_impl_t *db) 282 { 283 dnode_t *dn = db->db_dnode; 284 dbuf_dirty_record_t *dr; 285 286 ASSERT(MUTEX_HELD(&db->db_mtx)); 287 288 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 289 return; 290 291 ASSERT(db->db_objset != NULL); 292 if (dn == NULL) { 293 ASSERT(db->db_parent == NULL); 294 ASSERT(db->db_blkptr == NULL); 295 } else { 296 ASSERT3U(db->db.db_object, ==, dn->dn_object); 297 ASSERT3P(db->db_objset, ==, dn->dn_objset); 298 ASSERT3U(db->db_level, <, dn->dn_nlevels); 299 ASSERT(db->db_blkid == DB_BONUS_BLKID || 300 list_head(&dn->dn_dbufs)); 301 } 302 if (db->db_blkid == DB_BONUS_BLKID) { 303 ASSERT(dn != NULL); 304 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 305 ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); 306 } else { 307 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 308 } 309 310 for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) 311 ASSERT(dr->dr_dbuf == db); 312 313 for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) 314 ASSERT(dr->dr_dbuf == db); 315 316 /* 317 * We can't assert that db_size matches dn_datablksz because it 318 * can be momentarily different when another thread is doing 319 * dnode_set_blksz(). 320 */ 321 if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { 322 dr = db->db_data_pending; 323 /* 324 * It should only be modified in syncing context, so 325 * make sure we only have one copy of the data. 326 */ 327 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); 328 } 329 330 /* verify db->db_blkptr */ 331 if (db->db_blkptr) { 332 if (db->db_parent == dn->dn_dbuf) { 333 /* db is pointed to by the dnode */ 334 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 335 if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) 336 ASSERT(db->db_parent == NULL); 337 else 338 ASSERT(db->db_parent != NULL); 339 ASSERT3P(db->db_blkptr, ==, 340 &dn->dn_phys->dn_blkptr[db->db_blkid]); 341 } else { 342 /* db is pointed to by an indirect block */ 343 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 344 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 345 ASSERT3U(db->db_parent->db.db_object, ==, 346 db->db.db_object); 347 /* 348 * dnode_grow_indblksz() can make this fail if we don't 349 * have the struct_rwlock. XXX indblksz no longer 350 * grows. safe to do this now? 
351 */ 352 if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) { 353 ASSERT3P(db->db_blkptr, ==, 354 ((blkptr_t *)db->db_parent->db.db_data + 355 db->db_blkid % epb)); 356 } 357 } 358 } 359 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 360 db->db.db_data && db->db_blkid != DB_BONUS_BLKID && 361 db->db_state != DB_FILL && !dn->dn_free_txg) { 362 /* 363 * If the blkptr isn't set but they have nonzero data, 364 * it had better be dirty, otherwise we'll lose that 365 * data when we evict this buffer. 366 */ 367 if (db->db_dirtycnt == 0) { 368 uint64_t *buf = db->db.db_data; 369 int i; 370 371 for (i = 0; i < db->db.db_size >> 3; i++) { 372 ASSERT(buf[i] == 0); 373 } 374 } 375 } 376 } 377 #endif 378 379 static void 380 dbuf_update_data(dmu_buf_impl_t *db) 381 { 382 ASSERT(MUTEX_HELD(&db->db_mtx)); 383 if (db->db_level == 0 && db->db_user_data_ptr_ptr) { 384 ASSERT(!refcount_is_zero(&db->db_holds)); 385 *db->db_user_data_ptr_ptr = db->db.db_data; 386 } 387 } 388 389 static void 390 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 391 { 392 ASSERT(MUTEX_HELD(&db->db_mtx)); 393 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); 394 db->db_buf = buf; 395 if (buf != NULL) { 396 ASSERT(buf->b_data != NULL); 397 db->db.db_data = buf->b_data; 398 if (!arc_released(buf)) 399 arc_set_callback(buf, dbuf_do_evict, db); 400 dbuf_update_data(db); 401 } else { 402 dbuf_evict_user(db); 403 db->db.db_data = NULL; 404 if (db->db_state != DB_NOFILL) 405 db->db_state = DB_UNCACHED; 406 } 407 } 408 409 /* 410 * Loan out an arc_buf for read. Return the loaned arc_buf. 411 */ 412 arc_buf_t * 413 dbuf_loan_arcbuf(dmu_buf_impl_t *db) 414 { 415 arc_buf_t *abuf; 416 417 mutex_enter(&db->db_mtx); 418 if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { 419 int blksz = db->db.db_size; 420 mutex_exit(&db->db_mtx); 421 abuf = arc_loan_buf(db->db_dnode->dn_objset->os_spa, blksz); 422 bcopy(db->db.db_data, abuf->b_data, blksz); 423 } else { 424 abuf = db->db_buf; 425 arc_loan_inuse_buf(abuf, db); 426 dbuf_set_data(db, NULL); 427 mutex_exit(&db->db_mtx); 428 } 429 return (abuf); 430 } 431 432 uint64_t 433 dbuf_whichblock(dnode_t *dn, uint64_t offset) 434 { 435 if (dn->dn_datablkshift) { 436 return (offset >> dn->dn_datablkshift); 437 } else { 438 ASSERT3U(offset, <, dn->dn_datablksz); 439 return (0); 440 } 441 } 442 443 static void 444 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 445 { 446 dmu_buf_impl_t *db = vdb; 447 448 mutex_enter(&db->db_mtx); 449 ASSERT3U(db->db_state, ==, DB_READ); 450 /* 451 * All reads are synchronous, so we must have a hold on the dbuf 452 */ 453 ASSERT(refcount_count(&db->db_holds) > 0); 454 ASSERT(db->db_buf == NULL); 455 ASSERT(db->db.db_data == NULL); 456 if (db->db_level == 0 && db->db_freed_in_flight) { 457 /* we were freed in flight; disregard any error */ 458 arc_release(buf, db); 459 bzero(buf->b_data, db->db.db_size); 460 arc_buf_freeze(buf); 461 db->db_freed_in_flight = FALSE; 462 dbuf_set_data(db, buf); 463 db->db_state = DB_CACHED; 464 } else if (zio == NULL || zio->io_error == 0) { 465 dbuf_set_data(db, buf); 466 db->db_state = DB_CACHED; 467 } else { 468 ASSERT(db->db_blkid != DB_BONUS_BLKID); 469 ASSERT3P(db->db_buf, ==, NULL); 470 VERIFY(arc_buf_remove_ref(buf, db) == 1); 471 db->db_state = DB_UNCACHED; 472 } 473 cv_broadcast(&db->db_changed); 474 mutex_exit(&db->db_mtx); 475 dbuf_rele(db, NULL); 476 } 477 478 static void 479 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) 480 { 481 dnode_t *dn = db->db_dnode; 482 
zbookmark_t zb; 483 uint32_t aflags = ARC_NOWAIT; 484 arc_buf_t *pbuf; 485 486 ASSERT(!refcount_is_zero(&db->db_holds)); 487 /* We need the struct_rwlock to prevent db_blkptr from changing. */ 488 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 489 ASSERT(MUTEX_HELD(&db->db_mtx)); 490 ASSERT(db->db_state == DB_UNCACHED); 491 ASSERT(db->db_buf == NULL); 492 493 if (db->db_blkid == DB_BONUS_BLKID) { 494 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); 495 496 ASSERT3U(bonuslen, <=, db->db.db_size); 497 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); 498 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 499 if (bonuslen < DN_MAX_BONUSLEN) 500 bzero(db->db.db_data, DN_MAX_BONUSLEN); 501 if (bonuslen) 502 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); 503 dbuf_update_data(db); 504 db->db_state = DB_CACHED; 505 mutex_exit(&db->db_mtx); 506 return; 507 } 508 509 /* 510 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() 511 * processes the delete record and clears the bp while we are waiting 512 * for the dn_mtx (resulting in a "no" from block_freed). 513 */ 514 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || 515 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || 516 BP_IS_HOLE(db->db_blkptr)))) { 517 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 518 519 dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, 520 db->db.db_size, db, type)); 521 bzero(db->db.db_data, db->db.db_size); 522 db->db_state = DB_CACHED; 523 *flags |= DB_RF_CACHED; 524 mutex_exit(&db->db_mtx); 525 return; 526 } 527 528 db->db_state = DB_READ; 529 mutex_exit(&db->db_mtx); 530 531 if (DBUF_IS_L2CACHEABLE(db)) 532 aflags |= ARC_L2CACHE; 533 534 SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? 535 db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, 536 db->db.db_object, db->db_level, db->db_blkid); 537 538 dbuf_add_ref(db, NULL); 539 /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ 540 541 if (db->db_parent) 542 pbuf = db->db_parent->db_buf; 543 else 544 pbuf = db->db_objset->os_phys_buf; 545 546 (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf, 547 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 548 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 549 &aflags, &zb); 550 if (aflags & ARC_CACHED) 551 *flags |= DB_RF_CACHED; 552 } 553 554 int 555 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 556 { 557 int err = 0; 558 int havepzio = (zio != NULL); 559 int prefetch; 560 561 /* 562 * We don't have to hold the mutex to check db_state because it 563 * can't be freed while we have a hold on the buffer. 
564 */ 565 ASSERT(!refcount_is_zero(&db->db_holds)); 566 567 if (db->db_state == DB_NOFILL) 568 return (EIO); 569 570 if ((flags & DB_RF_HAVESTRUCT) == 0) 571 rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); 572 573 prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && 574 (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL && 575 DBUF_IS_CACHEABLE(db); 576 577 mutex_enter(&db->db_mtx); 578 if (db->db_state == DB_CACHED) { 579 mutex_exit(&db->db_mtx); 580 if (prefetch) 581 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 582 db->db.db_size, TRUE); 583 if ((flags & DB_RF_HAVESTRUCT) == 0) 584 rw_exit(&db->db_dnode->dn_struct_rwlock); 585 } else if (db->db_state == DB_UNCACHED) { 586 if (zio == NULL) { 587 zio = zio_root(db->db_dnode->dn_objset->os_spa, 588 NULL, NULL, ZIO_FLAG_CANFAIL); 589 } 590 dbuf_read_impl(db, zio, &flags); 591 592 /* dbuf_read_impl has dropped db_mtx for us */ 593 594 if (prefetch) 595 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 596 db->db.db_size, flags & DB_RF_CACHED); 597 598 if ((flags & DB_RF_HAVESTRUCT) == 0) 599 rw_exit(&db->db_dnode->dn_struct_rwlock); 600 601 if (!havepzio) 602 err = zio_wait(zio); 603 } else { 604 mutex_exit(&db->db_mtx); 605 if (prefetch) 606 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 607 db->db.db_size, TRUE); 608 if ((flags & DB_RF_HAVESTRUCT) == 0) 609 rw_exit(&db->db_dnode->dn_struct_rwlock); 610 611 mutex_enter(&db->db_mtx); 612 if ((flags & DB_RF_NEVERWAIT) == 0) { 613 while (db->db_state == DB_READ || 614 db->db_state == DB_FILL) { 615 ASSERT(db->db_state == DB_READ || 616 (flags & DB_RF_HAVESTRUCT) == 0); 617 cv_wait(&db->db_changed, &db->db_mtx); 618 } 619 if (db->db_state == DB_UNCACHED) 620 err = EIO; 621 } 622 mutex_exit(&db->db_mtx); 623 } 624 625 ASSERT(err || havepzio || db->db_state == DB_CACHED); 626 return (err); 627 } 628 629 static void 630 dbuf_noread(dmu_buf_impl_t *db) 631 { 632 ASSERT(!refcount_is_zero(&db->db_holds)); 633 ASSERT(db->db_blkid != DB_BONUS_BLKID); 634 mutex_enter(&db->db_mtx); 635 while (db->db_state == DB_READ || db->db_state == DB_FILL) 636 cv_wait(&db->db_changed, &db->db_mtx); 637 if (db->db_state == DB_UNCACHED) { 638 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 639 640 ASSERT(db->db_buf == NULL); 641 ASSERT(db->db.db_data == NULL); 642 dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 643 db->db.db_size, db, type)); 644 db->db_state = DB_FILL; 645 } else if (db->db_state == DB_NOFILL) { 646 dbuf_set_data(db, NULL); 647 } else { 648 ASSERT3U(db->db_state, ==, DB_CACHED); 649 } 650 mutex_exit(&db->db_mtx); 651 } 652 653 /* 654 * This is our just-in-time copy function. It makes a copy of 655 * buffers, that have been modified in a previous transaction 656 * group, before we modify them in the current active group. 657 * 658 * This function is used in two places: when we are dirtying a 659 * buffer for the first time in a txg, and when we are freeing 660 * a range in a dnode that includes this buffer. 661 * 662 * Note that when we are called from dbuf_free_range() we do 663 * not put a hold on the buffer, we just traverse the active 664 * dbuf list for the dnode. 
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DB_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		dr->dt.dl.dr_data = arc_buf_alloc(
		    db->db_dnode->dn_objset->os_spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		dbuf_set_data(db, NULL);
	}
}

void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DB_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp))
		dsl_free(spa_get_dsl(db->db_dnode->dn_objset->os_spa), txg, bp);

	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}

/*
 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.  Also, if we happen across any level-1 dbufs in the
 * range that have not already been marked dirty, mark them dirty so
 * they stay in memory.
744 */ 745 void 746 dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) 747 { 748 dmu_buf_impl_t *db, *db_next; 749 uint64_t txg = tx->tx_txg; 750 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 751 uint64_t first_l1 = start >> epbs; 752 uint64_t last_l1 = end >> epbs; 753 754 if (end > dn->dn_maxblkid) { 755 end = dn->dn_maxblkid; 756 last_l1 = end >> epbs; 757 } 758 dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); 759 mutex_enter(&dn->dn_dbufs_mtx); 760 for (db = list_head(&dn->dn_dbufs); db; db = db_next) { 761 db_next = list_next(&dn->dn_dbufs, db); 762 ASSERT(db->db_blkid != DB_BONUS_BLKID); 763 764 if (db->db_level == 1 && 765 db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { 766 mutex_enter(&db->db_mtx); 767 if (db->db_last_dirty && 768 db->db_last_dirty->dr_txg < txg) { 769 dbuf_add_ref(db, FTAG); 770 mutex_exit(&db->db_mtx); 771 dbuf_will_dirty(db, tx); 772 dbuf_rele(db, FTAG); 773 } else { 774 mutex_exit(&db->db_mtx); 775 } 776 } 777 778 if (db->db_level != 0) 779 continue; 780 dprintf_dbuf(db, "found buf %s\n", ""); 781 if (db->db_blkid < start || db->db_blkid > end) 782 continue; 783 784 /* found a level 0 buffer in the range */ 785 if (dbuf_undirty(db, tx)) 786 continue; 787 788 mutex_enter(&db->db_mtx); 789 if (db->db_state == DB_UNCACHED || 790 db->db_state == DB_NOFILL || 791 db->db_state == DB_EVICTING) { 792 ASSERT(db->db.db_data == NULL); 793 mutex_exit(&db->db_mtx); 794 continue; 795 } 796 if (db->db_state == DB_READ || db->db_state == DB_FILL) { 797 /* will be handled in dbuf_read_done or dbuf_rele */ 798 db->db_freed_in_flight = TRUE; 799 mutex_exit(&db->db_mtx); 800 continue; 801 } 802 if (refcount_count(&db->db_holds) == 0) { 803 ASSERT(db->db_buf); 804 dbuf_clear(db); 805 continue; 806 } 807 /* The dbuf is referenced */ 808 809 if (db->db_last_dirty != NULL) { 810 dbuf_dirty_record_t *dr = db->db_last_dirty; 811 812 if (dr->dr_txg == txg) { 813 /* 814 * This buffer is "in-use", re-adjust the file 815 * size to reflect that this buffer may 816 * contain new data when we sync. 817 */ 818 if (db->db_blkid > dn->dn_maxblkid) 819 dn->dn_maxblkid = db->db_blkid; 820 dbuf_unoverride(dr); 821 } else { 822 /* 823 * This dbuf is not dirty in the open context. 824 * Either uncache it (if its not referenced in 825 * the open context) or reset its contents to 826 * empty. 827 */ 828 dbuf_fix_old_data(db, txg); 829 } 830 } 831 /* clear the contents if its cached */ 832 if (db->db_state == DB_CACHED) { 833 ASSERT(db->db.db_data != NULL); 834 arc_release(db->db_buf, db); 835 bzero(db->db.db_data, db->db.db_size); 836 arc_buf_freeze(db->db_buf); 837 } 838 839 mutex_exit(&db->db_mtx); 840 } 841 mutex_exit(&dn->dn_dbufs_mtx); 842 } 843 844 static int 845 dbuf_block_freeable(dmu_buf_impl_t *db) 846 { 847 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; 848 uint64_t birth_txg = 0; 849 850 /* 851 * We don't need any locking to protect db_blkptr: 852 * If it's syncing, then db_last_dirty will be set 853 * so we'll ignore db_blkptr. 
854 */ 855 ASSERT(MUTEX_HELD(&db->db_mtx)); 856 if (db->db_last_dirty) 857 birth_txg = db->db_last_dirty->dr_txg; 858 else if (db->db_blkptr) 859 birth_txg = db->db_blkptr->blk_birth; 860 861 /* If we don't exist or are in a snapshot, we can't be freed */ 862 if (birth_txg) 863 return (ds == NULL || 864 dsl_dataset_block_freeable(ds, birth_txg)); 865 else 866 return (FALSE); 867 } 868 869 void 870 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) 871 { 872 arc_buf_t *buf, *obuf; 873 int osize = db->db.db_size; 874 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 875 876 ASSERT(db->db_blkid != DB_BONUS_BLKID); 877 878 /* XXX does *this* func really need the lock? */ 879 ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)); 880 881 /* 882 * This call to dbuf_will_dirty() with the dn_struct_rwlock held 883 * is OK, because there can be no other references to the db 884 * when we are changing its size, so no concurrent DB_FILL can 885 * be happening. 886 */ 887 /* 888 * XXX we should be doing a dbuf_read, checking the return 889 * value and returning that up to our callers 890 */ 891 dbuf_will_dirty(db, tx); 892 893 /* create the data buffer for the new block */ 894 buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type); 895 896 /* copy old block data to the new block */ 897 obuf = db->db_buf; 898 bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); 899 /* zero the remainder */ 900 if (size > osize) 901 bzero((uint8_t *)buf->b_data + osize, size - osize); 902 903 mutex_enter(&db->db_mtx); 904 dbuf_set_data(db, buf); 905 VERIFY(arc_buf_remove_ref(obuf, db) == 1); 906 db->db.db_size = size; 907 908 if (db->db_level == 0) { 909 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 910 db->db_last_dirty->dt.dl.dr_data = buf; 911 } 912 mutex_exit(&db->db_mtx); 913 914 dnode_willuse_space(db->db_dnode, size-osize, tx); 915 } 916 917 dbuf_dirty_record_t * 918 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 919 { 920 dnode_t *dn = db->db_dnode; 921 objset_t *os = dn->dn_objset; 922 dbuf_dirty_record_t **drp, *dr; 923 int drop_struct_lock = FALSE; 924 boolean_t do_free_accounting = B_FALSE; 925 int txgoff = tx->tx_txg & TXG_MASK; 926 927 ASSERT(tx->tx_txg != 0); 928 ASSERT(!refcount_is_zero(&db->db_holds)); 929 DMU_TX_DIRTY_BUF(tx, db); 930 931 /* 932 * Shouldn't dirty a regular buffer in syncing context. Private 933 * objects may be dirtied in syncing context, but only if they 934 * were already pre-dirtied in open context. 935 */ 936 ASSERT(!dmu_tx_is_syncing(tx) || 937 BP_IS_HOLE(dn->dn_objset->os_rootbp) || 938 DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 939 dn->dn_objset->os_dsl_dataset == NULL); 940 /* 941 * We make this assert for private objects as well, but after we 942 * check if we're already dirty. They are allowed to re-dirty 943 * in syncing context. 944 */ 945 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 946 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 947 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 948 949 mutex_enter(&db->db_mtx); 950 /* 951 * XXX make this true for indirects too? The problem is that 952 * transactions created with dmu_tx_create_assigned() from 953 * syncing context don't bother holding ahead. 954 */ 955 ASSERT(db->db_level != 0 || 956 db->db_state == DB_CACHED || db->db_state == DB_FILL || 957 db->db_state == DB_NOFILL); 958 959 mutex_enter(&dn->dn_mtx); 960 /* 961 * Don't set dirtyctx to SYNC if we're just modifying this as we 962 * initialize the objset. 
963 */ 964 if (dn->dn_dirtyctx == DN_UNDIRTIED && 965 !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { 966 dn->dn_dirtyctx = 967 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); 968 ASSERT(dn->dn_dirtyctx_firstset == NULL); 969 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); 970 } 971 mutex_exit(&dn->dn_mtx); 972 973 /* 974 * If this buffer is already dirty, we're done. 975 */ 976 drp = &db->db_last_dirty; 977 ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || 978 db->db.db_object == DMU_META_DNODE_OBJECT); 979 while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) 980 drp = &dr->dr_next; 981 if (dr && dr->dr_txg == tx->tx_txg) { 982 if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { 983 /* 984 * If this buffer has already been written out, 985 * we now need to reset its state. 986 */ 987 dbuf_unoverride(dr); 988 if (db->db.db_object != DMU_META_DNODE_OBJECT && 989 db->db_state != DB_NOFILL) 990 arc_buf_thaw(db->db_buf); 991 } 992 mutex_exit(&db->db_mtx); 993 return (dr); 994 } 995 996 /* 997 * Only valid if not already dirty. 998 */ 999 ASSERT(dn->dn_object == 0 || 1000 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1001 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1002 1003 ASSERT3U(dn->dn_nlevels, >, db->db_level); 1004 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || 1005 dn->dn_phys->dn_nlevels > db->db_level || 1006 dn->dn_next_nlevels[txgoff] > db->db_level || 1007 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || 1008 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); 1009 1010 /* 1011 * We should only be dirtying in syncing context if it's the 1012 * mos or we're initializing the os or it's a special object. 1013 * However, we are allowed to dirty in syncing context provided 1014 * we already dirtied it in open context. Hence we must make 1015 * this assertion only if we're not already dirty. 1016 */ 1017 ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 1018 os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); 1019 ASSERT(db->db.db_size != 0); 1020 1021 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1022 1023 if (db->db_blkid != DB_BONUS_BLKID) { 1024 /* 1025 * Update the accounting. 1026 * Note: we delay "free accounting" until after we drop 1027 * the db_mtx. This keeps us from grabbing other locks 1028 * (and possibly deadlocking) in bp_get_dsize() while 1029 * also holding the db_mtx. 1030 */ 1031 dnode_willuse_space(dn, db->db.db_size, tx); 1032 do_free_accounting = dbuf_block_freeable(db); 1033 } 1034 1035 /* 1036 * If this buffer is dirty in an old transaction group we need 1037 * to make a copy of it so that the changes we make in this 1038 * transaction group won't leak out when we sync the older txg. 1039 */ 1040 dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); 1041 if (db->db_level == 0) { 1042 void *data_old = db->db_buf; 1043 1044 if (db->db_state != DB_NOFILL) { 1045 if (db->db_blkid == DB_BONUS_BLKID) { 1046 dbuf_fix_old_data(db, tx->tx_txg); 1047 data_old = db->db.db_data; 1048 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { 1049 /* 1050 * Release the data buffer from the cache so 1051 * that we can modify it without impacting 1052 * possible other users of this cached data 1053 * block. Note that indirect blocks and 1054 * private objects are not released until the 1055 * syncing state (since they are only modified 1056 * then). 
1057 */ 1058 arc_release(db->db_buf, db); 1059 dbuf_fix_old_data(db, tx->tx_txg); 1060 data_old = db->db_buf; 1061 } 1062 ASSERT(data_old != NULL); 1063 } 1064 dr->dt.dl.dr_data = data_old; 1065 } else { 1066 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); 1067 list_create(&dr->dt.di.dr_children, 1068 sizeof (dbuf_dirty_record_t), 1069 offsetof(dbuf_dirty_record_t, dr_dirty_node)); 1070 } 1071 dr->dr_dbuf = db; 1072 dr->dr_txg = tx->tx_txg; 1073 dr->dr_next = *drp; 1074 *drp = dr; 1075 1076 /* 1077 * We could have been freed_in_flight between the dbuf_noread 1078 * and dbuf_dirty. We win, as though the dbuf_noread() had 1079 * happened after the free. 1080 */ 1081 if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { 1082 mutex_enter(&dn->dn_mtx); 1083 dnode_clear_range(dn, db->db_blkid, 1, tx); 1084 mutex_exit(&dn->dn_mtx); 1085 db->db_freed_in_flight = FALSE; 1086 } 1087 1088 /* 1089 * This buffer is now part of this txg 1090 */ 1091 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); 1092 db->db_dirtycnt += 1; 1093 ASSERT3U(db->db_dirtycnt, <=, 3); 1094 1095 mutex_exit(&db->db_mtx); 1096 1097 if (db->db_blkid == DB_BONUS_BLKID) { 1098 mutex_enter(&dn->dn_mtx); 1099 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1100 list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1101 mutex_exit(&dn->dn_mtx); 1102 dnode_setdirty(dn, tx); 1103 return (dr); 1104 } else if (do_free_accounting) { 1105 blkptr_t *bp = db->db_blkptr; 1106 int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? 1107 bp_get_dsize(os->os_spa, bp) : db->db.db_size; 1108 /* 1109 * This is only a guess -- if the dbuf is dirty 1110 * in a previous txg, we don't know how much 1111 * space it will use on disk yet. We should 1112 * really have the struct_rwlock to access 1113 * db_blkptr, but since this is just a guess, 1114 * it's OK if we get an odd answer. 
1115 */ 1116 dnode_willuse_space(dn, -willfree, tx); 1117 } 1118 1119 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1120 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1121 drop_struct_lock = TRUE; 1122 } 1123 1124 if (db->db_level == 0) { 1125 dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); 1126 ASSERT(dn->dn_maxblkid >= db->db_blkid); 1127 } 1128 1129 if (db->db_level+1 < dn->dn_nlevels) { 1130 dmu_buf_impl_t *parent = db->db_parent; 1131 dbuf_dirty_record_t *di; 1132 int parent_held = FALSE; 1133 1134 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { 1135 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1136 1137 parent = dbuf_hold_level(dn, db->db_level+1, 1138 db->db_blkid >> epbs, FTAG); 1139 parent_held = TRUE; 1140 } 1141 if (drop_struct_lock) 1142 rw_exit(&dn->dn_struct_rwlock); 1143 ASSERT3U(db->db_level+1, ==, parent->db_level); 1144 di = dbuf_dirty(parent, tx); 1145 if (parent_held) 1146 dbuf_rele(parent, FTAG); 1147 1148 mutex_enter(&db->db_mtx); 1149 /* possible race with dbuf_undirty() */ 1150 if (db->db_last_dirty == dr || 1151 dn->dn_object == DMU_META_DNODE_OBJECT) { 1152 mutex_enter(&di->dt.di.dr_mtx); 1153 ASSERT3U(di->dr_txg, ==, tx->tx_txg); 1154 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1155 list_insert_tail(&di->dt.di.dr_children, dr); 1156 mutex_exit(&di->dt.di.dr_mtx); 1157 dr->dr_parent = di; 1158 } 1159 mutex_exit(&db->db_mtx); 1160 } else { 1161 ASSERT(db->db_level+1 == dn->dn_nlevels); 1162 ASSERT(db->db_blkid < dn->dn_nblkptr); 1163 ASSERT(db->db_parent == NULL || 1164 db->db_parent == db->db_dnode->dn_dbuf); 1165 mutex_enter(&dn->dn_mtx); 1166 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1167 list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1168 mutex_exit(&dn->dn_mtx); 1169 if (drop_struct_lock) 1170 rw_exit(&dn->dn_struct_rwlock); 1171 } 1172 1173 dnode_setdirty(dn, tx); 1174 return (dr); 1175 } 1176 1177 static int 1178 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1179 { 1180 dnode_t *dn = db->db_dnode; 1181 uint64_t txg = tx->tx_txg; 1182 dbuf_dirty_record_t *dr, **drp; 1183 1184 ASSERT(txg != 0); 1185 ASSERT(db->db_blkid != DB_BONUS_BLKID); 1186 1187 mutex_enter(&db->db_mtx); 1188 /* 1189 * If this buffer is not dirty, we're done. 1190 */ 1191 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) 1192 if (dr->dr_txg <= txg) 1193 break; 1194 if (dr == NULL || dr->dr_txg < txg) { 1195 mutex_exit(&db->db_mtx); 1196 return (0); 1197 } 1198 ASSERT(dr->dr_txg == txg); 1199 ASSERT(dr->dr_dbuf == db); 1200 1201 /* 1202 * If this buffer is currently held, we cannot undirty 1203 * it, since one of the current holders may be in the 1204 * middle of an update. Note that users of dbuf_undirty() 1205 * should not place a hold on the dbuf before the call. 
1206 */ 1207 if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 1208 mutex_exit(&db->db_mtx); 1209 /* Make sure we don't toss this buffer at sync phase */ 1210 mutex_enter(&dn->dn_mtx); 1211 dnode_clear_range(dn, db->db_blkid, 1, tx); 1212 mutex_exit(&dn->dn_mtx); 1213 return (0); 1214 } 1215 1216 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1217 1218 ASSERT(db->db.db_size != 0); 1219 1220 /* XXX would be nice to fix up dn_towrite_space[] */ 1221 1222 *drp = dr->dr_next; 1223 1224 if (dr->dr_parent) { 1225 mutex_enter(&dr->dr_parent->dt.di.dr_mtx); 1226 list_remove(&dr->dr_parent->dt.di.dr_children, dr); 1227 mutex_exit(&dr->dr_parent->dt.di.dr_mtx); 1228 } else if (db->db_level+1 == dn->dn_nlevels) { 1229 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); 1230 mutex_enter(&dn->dn_mtx); 1231 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); 1232 mutex_exit(&dn->dn_mtx); 1233 } 1234 1235 if (db->db_level == 0) { 1236 if (db->db_state != DB_NOFILL) { 1237 dbuf_unoverride(dr); 1238 1239 ASSERT(db->db_buf != NULL); 1240 ASSERT(dr->dt.dl.dr_data != NULL); 1241 if (dr->dt.dl.dr_data != db->db_buf) 1242 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, 1243 db) == 1); 1244 } 1245 } else { 1246 ASSERT(db->db_buf != NULL); 1247 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 1248 mutex_destroy(&dr->dt.di.dr_mtx); 1249 list_destroy(&dr->dt.di.dr_children); 1250 } 1251 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 1252 1253 ASSERT(db->db_dirtycnt > 0); 1254 db->db_dirtycnt -= 1; 1255 1256 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { 1257 arc_buf_t *buf = db->db_buf; 1258 1259 ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); 1260 dbuf_set_data(db, NULL); 1261 VERIFY(arc_buf_remove_ref(buf, db) == 1); 1262 dbuf_evict(db); 1263 return (1); 1264 } 1265 1266 mutex_exit(&db->db_mtx); 1267 return (0); 1268 } 1269 1270 __attribute__((__weak__)) void 1271 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) 1272 { 1273 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1274 dbuf_will_dirty(db, tx); 1275 } 1276 1277 void 1278 dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1279 { 1280 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; 1281 1282 ASSERT(tx->tx_txg != 0); 1283 ASSERT(!refcount_is_zero(&db->db_holds)); 1284 1285 if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) 1286 rf |= DB_RF_HAVESTRUCT; 1287 (void) dbuf_read(db, NULL, rf); 1288 (void) dbuf_dirty(db, tx); 1289 } 1290 1291 void 1292 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1293 { 1294 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1295 1296 db->db_state = DB_NOFILL; 1297 1298 dmu_buf_will_fill(db_fake, tx); 1299 } 1300 1301 void 1302 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1303 { 1304 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1305 1306 ASSERT(db->db_blkid != DB_BONUS_BLKID); 1307 ASSERT(tx->tx_txg != 0); 1308 ASSERT(db->db_level == 0); 1309 ASSERT(!refcount_is_zero(&db->db_holds)); 1310 1311 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || 1312 dmu_tx_private_ok(tx)); 1313 1314 dbuf_noread(db); 1315 (void) dbuf_dirty(db, tx); 1316 } 1317 1318 __attribute__((__weak__)) void 1319 dmu_buf_fill_done(dmu_buf_t *db_fake, dmu_tx_t *tx) 1320 { 1321 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1322 dbuf_fill_done(db, tx); 1323 } 1324 1325 /* ARGSUSED */ 1326 void 1327 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) 1328 { 1329 mutex_enter(&db->db_mtx); 1330 DBUF_VERIFY(db); 1331 1332 if (db->db_state == DB_FILL) { 1333 if (db->db_level == 0 && 
		    db->db_freed_in_flight) {
			ASSERT(db->db_blkid != DB_BONUS_BLKID);
			/* we were freed while filling */
			/* XXX dbuf_undirty? */
			bzero(db->db.db_data, db->db.db_size);
			db->db_freed_in_flight = FALSE;
		}
		db->db_state = DB_CACHED;
		cv_broadcast(&db->db_changed);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * Directly assign a provided arc buf to a given dbuf if it's not referenced
 * by anybody except our caller.  Otherwise copy arcbuf's contents to dbuf.
 */
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT);
	ASSERT(db->db_blkid != DB_BONUS_BLKID);
	ASSERT(db->db_level == 0);
	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
	ASSERT(buf != NULL);
	ASSERT(arc_buf_size(buf) == db->db.db_size);
	ASSERT(tx->tx_txg != 0);

	arc_return_buf(buf, db);
	ASSERT(arc_released(buf));

	mutex_enter(&db->db_mtx);

	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);

	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

	if (db->db_state == DB_CACHED &&
	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_dirty(db, tx);
		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		xuio_stat_wbuf_copied();
		return;
	}

	xuio_stat_wbuf_nocopy();
	if (db->db_state == DB_CACHED) {
		dbuf_dirty_record_t *dr = db->db_last_dirty;

		ASSERT(db->db_buf != NULL);
		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
			ASSERT(dr->dt.dl.dr_data == db->db_buf);
			if (!arc_released(db->db_buf)) {
				ASSERT(dr->dt.dl.dr_override_state ==
				    DR_OVERRIDDEN);
				arc_release(db->db_buf, db);
			}
			dr->dt.dl.dr_data = buf;
			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
			arc_release(db->db_buf, db);
			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
		}
		db->db_buf = NULL;
	}
	ASSERT(db->db_buf == NULL);
	dbuf_set_data(db, buf);
	db->db_state = DB_FILL;
	mutex_exit(&db->db_mtx);
	(void) dbuf_dirty(db, tx);
	dbuf_fill_done(db, tx);
}

/*
 * "Clear" the contents of this dbuf.  This will mark the dbuf
 * EVICTING and clear *most* of its references.  Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
 * in this case.
For callers from the DMU we will usually see: 1416 * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() 1417 * For the arc callback, we will usually see: 1418 * dbuf_do_evict()->dbuf_clear();dbuf_destroy() 1419 * Sometimes, though, we will get a mix of these two: 1420 * DMU: dbuf_clear()->arc_buf_evict() 1421 * ARC: dbuf_do_evict()->dbuf_destroy() 1422 */ 1423 void 1424 dbuf_clear(dmu_buf_impl_t *db) 1425 { 1426 dnode_t *dn = db->db_dnode; 1427 dmu_buf_impl_t *parent = db->db_parent; 1428 dmu_buf_impl_t *dndb = dn->dn_dbuf; 1429 int dbuf_gone = FALSE; 1430 1431 ASSERT(MUTEX_HELD(&db->db_mtx)); 1432 ASSERT(refcount_is_zero(&db->db_holds)); 1433 1434 dbuf_evict_user(db); 1435 1436 if (db->db_state == DB_CACHED) { 1437 ASSERT(db->db.db_data != NULL); 1438 if (db->db_blkid == DB_BONUS_BLKID) { 1439 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); 1440 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 1441 } 1442 db->db.db_data = NULL; 1443 db->db_state = DB_UNCACHED; 1444 } 1445 1446 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); 1447 ASSERT(db->db_data_pending == NULL); 1448 1449 db->db_state = DB_EVICTING; 1450 db->db_blkptr = NULL; 1451 1452 if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { 1453 list_remove(&dn->dn_dbufs, db); 1454 dnode_rele(dn, db); 1455 db->db_dnode = NULL; 1456 } 1457 1458 if (db->db_buf) 1459 dbuf_gone = arc_buf_evict(db->db_buf); 1460 1461 if (!dbuf_gone) 1462 mutex_exit(&db->db_mtx); 1463 1464 /* 1465 * If this dbuf is referened from an indirect dbuf, 1466 * decrement the ref count on the indirect dbuf. 1467 */ 1468 if (parent && parent != dndb) 1469 dbuf_rele(parent, db); 1470 } 1471 1472 static int 1473 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, 1474 dmu_buf_impl_t **parentp, blkptr_t **bpp) 1475 { 1476 int nlevels, epbs; 1477 1478 *parentp = NULL; 1479 *bpp = NULL; 1480 1481 ASSERT(blkid != DB_BONUS_BLKID); 1482 1483 if (dn->dn_phys->dn_nlevels == 0) 1484 nlevels = 1; 1485 else 1486 nlevels = dn->dn_phys->dn_nlevels; 1487 1488 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1489 1490 ASSERT3U(level * epbs, <, 64); 1491 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1492 if (level >= nlevels || 1493 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { 1494 /* the buffer has no parent yet */ 1495 return (ENOENT); 1496 } else if (level < nlevels-1) { 1497 /* this block is referenced from an indirect block */ 1498 int err = dbuf_hold_impl(dn, level+1, 1499 blkid >> epbs, fail_sparse, NULL, parentp); 1500 if (err) 1501 return (err); 1502 err = dbuf_read(*parentp, NULL, 1503 (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); 1504 if (err) { 1505 dbuf_rele(*parentp, NULL); 1506 *parentp = NULL; 1507 return (err); 1508 } 1509 *bpp = ((blkptr_t *)(*parentp)->db.db_data) + 1510 (blkid & ((1ULL << epbs) - 1)); 1511 return (0); 1512 } else { 1513 /* the block is referenced from the dnode */ 1514 ASSERT3U(level, ==, nlevels-1); 1515 ASSERT(dn->dn_phys->dn_nblkptr == 0 || 1516 blkid < dn->dn_phys->dn_nblkptr); 1517 if (dn->dn_dbuf) { 1518 dbuf_add_ref(dn->dn_dbuf, NULL); 1519 *parentp = dn->dn_dbuf; 1520 } 1521 *bpp = &dn->dn_phys->dn_blkptr[blkid]; 1522 return (0); 1523 } 1524 } 1525 1526 static dmu_buf_impl_t * 1527 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, 1528 dmu_buf_impl_t *parent, blkptr_t *blkptr) 1529 { 1530 objset_t *os = dn->dn_objset; 1531 dmu_buf_impl_t *db, *odb; 1532 1533 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1534 ASSERT(dn->dn_type != DMU_OT_NONE); 1535 1536 
db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); 1537 1538 db->db_objset = os; 1539 db->db.db_object = dn->dn_object; 1540 db->db_level = level; 1541 db->db_blkid = blkid; 1542 db->db_last_dirty = NULL; 1543 db->db_dirtycnt = 0; 1544 db->db_dnode = dn; 1545 db->db_parent = parent; 1546 db->db_blkptr = blkptr; 1547 1548 db->db_user_ptr = NULL; 1549 db->db_user_data_ptr_ptr = NULL; 1550 db->db_evict_func = NULL; 1551 db->db_immediate_evict = 0; 1552 db->db_freed_in_flight = 0; 1553 1554 if (blkid == DB_BONUS_BLKID) { 1555 ASSERT3P(parent, ==, dn->dn_dbuf); 1556 db->db.db_size = DN_MAX_BONUSLEN - 1557 (dn->dn_nblkptr-1) * sizeof (blkptr_t); 1558 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 1559 db->db.db_offset = DB_BONUS_BLKID; 1560 db->db_state = DB_UNCACHED; 1561 /* the bonus dbuf is not placed in the hash table */ 1562 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1563 return (db); 1564 } else { 1565 int blocksize = 1566 db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz; 1567 db->db.db_size = blocksize; 1568 db->db.db_offset = db->db_blkid * blocksize; 1569 } 1570 1571 /* 1572 * Hold the dn_dbufs_mtx while we get the new dbuf 1573 * in the hash table *and* added to the dbufs list. 1574 * This prevents a possible deadlock with someone 1575 * trying to look up this dbuf before its added to the 1576 * dn_dbufs list. 1577 */ 1578 mutex_enter(&dn->dn_dbufs_mtx); 1579 db->db_state = DB_EVICTING; 1580 if ((odb = dbuf_hash_insert(db)) != NULL) { 1581 /* someone else inserted it first */ 1582 kmem_cache_free(dbuf_cache, db); 1583 mutex_exit(&dn->dn_dbufs_mtx); 1584 return (odb); 1585 } 1586 list_insert_head(&dn->dn_dbufs, db); 1587 db->db_state = DB_UNCACHED; 1588 mutex_exit(&dn->dn_dbufs_mtx); 1589 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1590 1591 if (parent && parent != dn->dn_dbuf) 1592 dbuf_add_ref(parent, db); 1593 1594 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1595 refcount_count(&dn->dn_holds) > 0); 1596 (void) refcount_add(&dn->dn_holds, db); 1597 1598 dprintf_dbuf(db, "db=%p\n", db); 1599 1600 return (db); 1601 } 1602 1603 static int 1604 dbuf_do_evict(void *private) 1605 { 1606 arc_buf_t *buf = private; 1607 dmu_buf_impl_t *db = buf->b_private; 1608 1609 if (!MUTEX_HELD(&db->db_mtx)) 1610 mutex_enter(&db->db_mtx); 1611 1612 ASSERT(refcount_is_zero(&db->db_holds)); 1613 1614 if (db->db_state != DB_EVICTING) { 1615 ASSERT(db->db_state == DB_CACHED); 1616 DBUF_VERIFY(db); 1617 db->db_buf = NULL; 1618 dbuf_evict(db); 1619 } else { 1620 mutex_exit(&db->db_mtx); 1621 dbuf_destroy(db); 1622 } 1623 return (0); 1624 } 1625 1626 static void 1627 dbuf_destroy(dmu_buf_impl_t *db) 1628 { 1629 ASSERT(refcount_is_zero(&db->db_holds)); 1630 1631 if (db->db_blkid != DB_BONUS_BLKID) { 1632 /* 1633 * If this dbuf is still on the dn_dbufs list, 1634 * remove it from that list. 
1635 */ 1636 if (db->db_dnode) { 1637 dnode_t *dn = db->db_dnode; 1638 1639 mutex_enter(&dn->dn_dbufs_mtx); 1640 list_remove(&dn->dn_dbufs, db); 1641 mutex_exit(&dn->dn_dbufs_mtx); 1642 1643 dnode_rele(dn, db); 1644 db->db_dnode = NULL; 1645 } 1646 dbuf_hash_remove(db); 1647 } 1648 db->db_parent = NULL; 1649 db->db_buf = NULL; 1650 1651 ASSERT(!list_link_active(&db->db_link)); 1652 ASSERT(db->db.db_data == NULL); 1653 ASSERT(db->db_hash_next == NULL); 1654 ASSERT(db->db_blkptr == NULL); 1655 ASSERT(db->db_data_pending == NULL); 1656 1657 kmem_cache_free(dbuf_cache, db); 1658 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1659 } 1660 1661 void 1662 dbuf_prefetch(dnode_t *dn, uint64_t blkid) 1663 { 1664 dmu_buf_impl_t *db = NULL; 1665 blkptr_t *bp = NULL; 1666 1667 ASSERT(blkid != DB_BONUS_BLKID); 1668 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1669 1670 if (dnode_block_freed(dn, blkid)) 1671 return; 1672 1673 /* dbuf_find() returns with db_mtx held */ 1674 if (db = dbuf_find(dn, 0, blkid)) { 1675 if (refcount_count(&db->db_holds) > 0) { 1676 /* 1677 * This dbuf is active. We assume that it is 1678 * already CACHED, or else about to be either 1679 * read or filled. 1680 */ 1681 mutex_exit(&db->db_mtx); 1682 return; 1683 } 1684 mutex_exit(&db->db_mtx); 1685 db = NULL; 1686 } 1687 1688 if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { 1689 if (bp && !BP_IS_HOLE(bp)) { 1690 arc_buf_t *pbuf; 1691 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 1692 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; 1693 zbookmark_t zb; 1694 1695 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, 1696 dn->dn_object, 0, blkid); 1697 1698 if (db) 1699 pbuf = db->db_buf; 1700 else 1701 pbuf = dn->dn_objset->os_phys_buf; 1702 1703 (void) arc_read(NULL, dn->dn_objset->os_spa, 1704 bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, 1705 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1706 &aflags, &zb); 1707 } 1708 if (db) 1709 dbuf_rele(db, NULL); 1710 } 1711 } 1712 1713 /* 1714 * Returns with db_holds incremented, and db_mtx not held. 1715 * Note: dn_struct_rwlock must be held. 
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DB_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;
top:
	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = ENOENT;
			if (err) {
				if (parent)
					dbuf_rele(parent, NULL);
				return (err);
			}
		}
		if (err && err != ENOENT)
			return (err);
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
		arc_buf_add_ref(db->db_buf, db);
		if (db->db_buf->b_data == NULL) {
			dbuf_clear(db);
			if (parent) {
				dbuf_rele(parent, NULL);
				parent = NULL;
			}
			goto top;
		}
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
	}

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are
	 * still referencing it from db_data, we need to make a copy
	 * of it in case we decide we want to dirty it again in this txg.
	 */
	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    db->db_state == DB_CACHED && db->db_data_pending) {
		dbuf_dirty_record_t *dr = db->db_data_pending;

		if (dr->dt.dl.dr_data == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

			dbuf_set_data(db,
			    arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
			    db->db.db_size, db, type));
			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
			    db->db.db_size);
		}
	}

	(void) refcount_add(&db->db_holds, tag);
	dbuf_update_data(db);
	DBUF_VERIFY(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent)
		dbuf_rele(parent, NULL);

	ASSERT3P(db->db_dnode, ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}

dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
	return (err ? NULL : db);
}

dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
	return (err ?
NULL : db); 1819 } 1820 1821 void 1822 dbuf_create_bonus(dnode_t *dn) 1823 { 1824 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 1825 1826 ASSERT(dn->dn_bonus == NULL); 1827 dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); 1828 } 1829 1830 __attribute__((__weak__)) void 1831 dmu_buf_add_ref(dmu_buf_t *db_fake, void *tag) 1832 { 1833 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1834 dbuf_add_ref(db, tag); 1835 } 1836 1837 void 1838 dbuf_add_ref(dmu_buf_impl_t *db, void *tag) 1839 { 1840 int64_t holds = refcount_add(&db->db_holds, tag); 1841 ASSERT(holds > 1); 1842 } 1843 1844 __attribute__((__weak__)) void 1845 dmu_buf_rele(dmu_buf_t *db_fake, void *tag) 1846 { 1847 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1848 dbuf_rele(db, tag); 1849 } 1850 1851 void 1852 dbuf_rele(dmu_buf_impl_t *db, void *tag) 1853 { 1854 mutex_enter(&db->db_mtx); 1855 dbuf_rele_and_unlock(db, tag); 1856 } 1857 1858 /* 1859 * dbuf_rele() for an already-locked dbuf. This is necessary to allow 1860 * db_dirtycnt and db_holds to be updated atomically. 1861 */ 1862 void 1863 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) 1864 { 1865 int64_t holds; 1866 1867 ASSERT(MUTEX_HELD(&db->db_mtx)); 1868 DBUF_VERIFY(db); 1869 1870 holds = refcount_remove(&db->db_holds, tag); 1871 ASSERT(holds >= 0); 1872 1873 /* 1874 * We can't freeze indirects if there is a possibility that they 1875 * may be modified in the current syncing context. 1876 */ 1877 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) 1878 arc_buf_freeze(db->db_buf); 1879 1880 if (holds == db->db_dirtycnt && 1881 db->db_level == 0 && db->db_immediate_evict) 1882 dbuf_evict_user(db); 1883 1884 if (holds == 0) { 1885 if (db->db_blkid == DB_BONUS_BLKID) { 1886 mutex_exit(&db->db_mtx); 1887 dnode_rele(db->db_dnode, db); 1888 } else if (db->db_buf == NULL) { 1889 /* 1890 * This is a special case: we never associated this 1891 * dbuf with any data allocated from the ARC. 1892 */ 1893 ASSERT(db->db_state == DB_UNCACHED || 1894 db->db_state == DB_NOFILL); 1895 dbuf_evict(db); 1896 } else if (arc_released(db->db_buf)) { 1897 arc_buf_t *buf = db->db_buf; 1898 /* 1899 * This dbuf has anonymous data associated with it. 
			dbuf_set_data(db, NULL);
			VERIFY(arc_buf_remove_ref(buf, db) == 1);
			dbuf_evict(db);
		} else {
			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
			if (!DBUF_IS_CACHEABLE(db))
				dbuf_clear(db);
			else
				mutex_exit(&db->db_mtx);
		}
	} else {
		mutex_exit(&db->db_mtx);
	}
}

__attribute__((__weak__)) uint64_t
dmu_buf_refcount(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	return (dbuf_refcount(db));
}

uint64_t
dbuf_refcount(dmu_buf_impl_t *db)
{
	return (refcount_count(&db->db_holds));
}

void *
dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}

void *
dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr,
    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_immediate_evict = TRUE;
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}

void *
dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(db->db_level == 0);

	ASSERT((user_ptr == NULL) == (evict_func == NULL));

	mutex_enter(&db->db_mtx);

	if (db->db_user_ptr == old_user_ptr) {
		db->db_user_ptr = user_ptr;
		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
		db->db_evict_func = evict_func;

		dbuf_update_data(db);
	} else {
		old_user_ptr = db->db_user_ptr;
	}

	mutex_exit(&db->db_mtx);
	return (old_user_ptr);
}

void *
dmu_buf_get_user(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(!refcount_is_zero(&db->db_holds));

	return (db->db_user_ptr);
}

boolean_t
dmu_buf_freeable(dmu_buf_t *dbuf)
{
	boolean_t res = B_FALSE;
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;

	if (db->db_blkptr)
		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
		    db->db_blkptr->blk_birth);

	return (res);
}
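
/*
 * Make sure db_blkptr is set before this dbuf is written: a block at the
 * top level of the tree points into the dnode's embedded blkptr array;
 * anything lower points at the matching slot in its parent indirect block,
 * holding the parent first if it is not already attached.
 */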
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
	/* ASSERT(dmu_tx_is_syncing(tx)) */
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_blkptr != NULL)
		return;

	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
		/*
		 * This buffer was allocated at a time when there were
		 * no available blkptrs from the dnode, or it was
		 * inappropriate to hook it in (i.e., nlevels mismatch).
		 */
		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
		ASSERT(db->db_parent == NULL);
		db->db_parent = dn->dn_dbuf;
		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
		DBUF_VERIFY(db);
	} else {
		dmu_buf_impl_t *parent = db->db_parent;
		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

		ASSERT(dn->dn_phys->dn_nlevels > 1);
		if (parent == NULL) {
			mutex_exit(&db->db_mtx);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			(void) dbuf_hold_impl(dn, db->db_level+1,
			    db->db_blkid >> epbs, FALSE, db, &parent);
			rw_exit(&dn->dn_struct_rwlock);
			mutex_enter(&db->db_mtx);
			db->db_parent = parent;
		}
		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
		DBUF_VERIFY(db);
	}
}

static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = db->db_dnode;
	zio_t *zio;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);
	DBUF_VERIFY(db);

	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	ASSERT(db->db_buf != NULL);

	dbuf_check_blkptr(dn, db);

	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);
	dbuf_write(dr, db->db_buf, tx);

	zio = dr->dr_zio;
	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);
	zio_nowait(zio);
}
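
/*
 * Write out a level-0 dirty record.  Bonus buffers are simply copied into
 * the dnode phys.  For everything else, wait out any dmu_sync() write that
 * is still overriding this block, copy the data if open-context holders
 * can still see it, and then issue the write via dbuf_write().
 */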
static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	arc_buf_t **datap = &dr->dt.dl.dr_data;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = db->db_dnode;
	objset_t *os = dn->dn_objset;
	uint64_t txg = tx->tx_txg;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied.  But we
	 * might have been freed after the dirty.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
	} else {
		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
	}
	DBUF_VERIFY(db);

	/*
	 * If this is a bonus buffer, simply copy the bonus data into the
	 * dnode.  It will be written out when the dnode is synced (and it
	 * will be synced, since it must have been dirty for dbuf_sync to
	 * be called).
	 */
	if (db->db_blkid == DB_BONUS_BLKID) {
		dbuf_dirty_record_t **drp;

		ASSERT(*datap != NULL);
		ASSERT3U(db->db_level, ==, 0);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
		if (*datap != db->db.db_data) {
			zio_buf_free(*datap, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db_data_pending = NULL;
		drp = &db->db_last_dirty;
		while (*drp != dr)
			drp = &(*drp)->dr_next;
		ASSERT(dr->dr_next == NULL);
		ASSERT(dr->dr_dbuf == db);
		*drp = dr->dr_next;
		kmem_free(dr, sizeof (dbuf_dirty_record_t));
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
		return;
	}

	/*
	 * This function may have dropped the db_mtx lock allowing a dmu_sync
	 * operation to sneak in.  As a result, we need to ensure that we
	 * don't check the dr_override_state until we have returned from
	 * dbuf_check_blkptr.
	 */
	dbuf_check_blkptr(dn, db);

	/*
	 * If this buffer is in the middle of an immediate write,
	 * wait for the synchronous IO to complete.
	 */
	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		cv_wait(&db->db_changed, &db->db_mtx);
		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
	}

	if (db->db_state != DB_NOFILL &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    refcount_count(&db->db_holds) > 1 &&
	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
	    *datap == db->db_buf) {
		/*
		 * If this buffer is currently "in use" (i.e., there
		 * are active holds and db_data still references it),
		 * then make a copy before we start the write so that
		 * any modifications from the open txg will not leak
		 * into this write.
		 *
		 * NOTE: this copy does not need to be made for
		 * objects only modified in the syncing context (e.g.
		 * dnode blocks).
		 */
		int blksz = arc_buf_size(*datap);
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
		bcopy(db->db.db_data, (*datap)->b_data, blksz);
	}
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	dbuf_write(dr, *datap, tx);

	ASSERT(!list_link_active(&dr->dr_dirty_node));
	if (dn->dn_object == DMU_META_DNODE_OBJECT)
		list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr);
	else
		zio_nowait(dr->dr_zio);
}
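
/*
 * Sync every dirty record on the given list: indirect blocks go through
 * dbuf_sync_indirect() and level-0 blocks through dbuf_sync_leaf().  An
 * entry that already has a zio belongs to the meta-dnode and marks the
 * point at which the list has been fully processed.
 */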
void
dbuf_sync_list(list_t *list, dmu_tx_t *tx)
{
	dbuf_dirty_record_t *dr;

	while ((dr = list_head(list)) != NULL) {
		if (dr->dr_zio != NULL) {
			/*
			 * If we find an already initialized zio then we
			 * are processing the meta-dnode, and we have finished.
			 * The dbufs for all dnodes are put back on the list
			 * during processing, so that we can zio_wait()
			 * these IOs after initiating all child IOs.
			 */
			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
			    DMU_META_DNODE_OBJECT);
			break;
		}
		list_remove(list, dr);
		if (dr->dr_dbuf->db_level > 0)
			dbuf_sync_indirect(dr, tx);
		else
			dbuf_sync_leaf(dr, tx);
	}
}

/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	dnode_t *dn = db->db_dnode;
	spa_t *spa = zio->io_spa;
	int64_t delta;
	uint64_t fill = 0;
	int i;

	ASSERT(db->db_blkptr == bp);

	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
	zio->io_prev_space_delta = delta;

	if (BP_IS_HOLE(bp)) {
		ASSERT(bp->blk_fill == 0);
		return;
	}

	ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
	ASSERT(BP_GET_LEVEL(bp) == db->db_level);

	mutex_enter(&db->db_mtx);

	if (db->db_level == 0) {
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			fill = 1;
		}
	} else {
		blkptr_t *ibp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
			if (BP_IS_HOLE(ibp))
				continue;
			fill += ibp->blk_fill;
		}
	}

	bp->blk_fill = fill;

	mutex_exit(&db->db_mtx);
}
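
/*
 * Completion callback for a dbuf's write zio: update the dataset's block
 * accounting (unless this was an identical rewrite of the same block
 * pointer), unlink and free the dirty record, and drop the hold that was
 * tagged with this txg.
 */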
/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	dnode_t *dn = db->db_dnode;
	objset_t *os = dn->dn_objset;
	uint64_t txg = zio->io_txg;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT3U(zio->io_error, ==, 0);
	ASSERT(db->db_blkptr == bp);

	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		dmu_tx_t *tx = os->os_synctx;

		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}

	mutex_enter(&db->db_mtx);

	DBUF_VERIFY(db);

	drp = &db->db_last_dirty;
	while ((dr = *drp) != db->db_data_pending)
		drp = &dr->dr_next;
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_dbuf == db);
	ASSERT(dr->dr_next == NULL);
	*drp = dr->dr_next;

	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DB_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
		if (db->db_state != DB_NOFILL) {
			if (dr->dt.dl.dr_data != db->db_buf)
				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
				    db) == 1);
			else if (!arc_released(db->db_buf))
				arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
	} else {
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			ASSERT3U(dn->dn_phys->dn_maxblkid
			    >> (db->db_level * epbs), >=, db->db_blkid);
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
}

static void
dbuf_write_nofill_ready(zio_t *zio)
{
	dbuf_write_ready(zio, NULL, zio->io_private);
}

static void
dbuf_write_nofill_done(zio_t *zio)
{
	dbuf_write_done(zio, NULL, zio->io_private);
}

static void
dbuf_write_override_ready(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;

	dbuf_write_ready(zio, NULL, db);
}

static void
dbuf_write_override_done(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;

	mutex_enter(&db->db_mtx);
	if (!BP_EQUAL(zio->io_bp, obp)) {
		if (!BP_IS_HOLE(obp))
			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
		arc_release(dr->dt.dl.dr_data, db);
	}
	mutex_exit(&db->db_mtx);

	dbuf_write_done(zio, NULL, db);
}

static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = db->db_dnode;
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	zbookmark_t zb;
	zio_prop_t zp;
	zio_t *zio;

	if (db->db_state != DB_NOFILL) {
		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
			/*
			 * Private object buffers are released here rather
			 * than in dbuf_dirty() since they are only modified
			 * in the syncing context and we don't want the
			 * overhead of making multiple copies of the data.
			 */
			if (BP_IS_HOLE(db->db_blkptr)) {
				arc_buf_thaw(data);
			} else {
				arc_release(data, db);
			}
		}
	}

	if (parent != dn->dn_dbuf) {
		ASSERT(parent && parent->db_data_pending);
		ASSERT(db->db_level == parent->db_level-1);
		ASSERT(arc_released(parent->db_buf));
		zio = parent->db_data_pending->dr_zio;
	} else {
		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
		ASSERT3P(db->db_blkptr, ==,
		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		zio = dn->dn_zio;
	}

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
	ASSERT(zio);

	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	dmu_write_policy(os, dn, db->db_level,
	    db->db_state == DB_NOFILL ? WP_NOFILL : 0, &zp);
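
	/*
	 * Issue the write.  A level-0 buffer whose data was already written
	 * by dmu_sync() gets a zio_write() whose block pointer will simply
	 * be overridden; a NOFILL buffer gets a data-less zio_write(); and
	 * everything else is written from its ARC buffer via arc_write().
	 */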
	if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		ASSERT(db->db_state != DB_NOFILL);
		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
		    dbuf_write_override_ready, dbuf_write_override_done, dr,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
		mutex_enter(&db->db_mtx);
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
		    dr->dt.dl.dr_copies);
		mutex_exit(&db->db_mtx);
	} else if (db->db_state == DB_NOFILL) {
		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    db->db_blkptr, NULL, db->db.db_size, &zp,
		    dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
	} else {
		ASSERT(arc_released(data));
		dr->dr_zio = arc_write(zio, os->os_spa, txg,
		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
		    dbuf_write_ready, dbuf_write_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
	}
}