1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2016 by Delphix. All rights reserved. 24 * Copyright (c) 2023, Klara Inc. 25 */ 26 27 #ifndef _SYS_DDT_IMPL_H 28 #define _SYS_DDT_IMPL_H 29 30 #include <sys/ddt.h> 31 #include <sys/bitops.h> 32 33 #ifdef __cplusplus 34 extern "C" { 35 #endif 36 37 /* DDT version numbers */ 38 #define DDT_VERSION_LEGACY (0) 39 #define DDT_VERSION_FDT (1) 40 41 /* Dummy version to signal that configure is still necessary */ 42 #define DDT_VERSION_UNCONFIGURED (UINT64_MAX) 43 44 /* Names of interesting objects in the DDT root dir */ 45 #define DDT_DIR_VERSION "version" 46 #define DDT_DIR_FLAGS "flags" 47 48 /* Fill a lightweight entry from a live entry. */ 49 #define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \ 50 memset((ddlwe), 0, sizeof (*ddlwe)); \ 51 (ddlwe)->ddlwe_key = (dde)->dde_key; \ 52 (ddlwe)->ddlwe_type = (dde)->dde_type; \ 53 (ddlwe)->ddlwe_class = (dde)->dde_class; \ 54 memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \ 55 } while (0) 56 57 #define DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe) do { \ 58 memset((ddlwe), 0, sizeof (*ddlwe)); \ 59 (ddlwe)->ddlwe_key = (ddle)->ddle_key; \ 60 (ddlwe)->ddlwe_type = (ddle)->ddle_type; \ 61 (ddlwe)->ddlwe_class = (ddle)->ddle_class; \ 62 memcpy(&(ddlwe)->ddlwe_phys, (ddle)->ddle_phys, DDT_PHYS_SIZE(ddt)); \ 63 } while (0) 64 65 /* 66 * An entry on the log tree. These are "frozen", and a record of what's in 67 * the on-disk log. They can't be used in place, but can be "loaded" back into 68 * the live tree. 69 */ 70 typedef struct { 71 ddt_key_t ddle_key; /* ddt_log_tree key */ 72 avl_node_t ddle_node; /* ddt_log_tree node */ 73 74 ddt_type_t ddle_type; /* storage type */ 75 ddt_class_t ddle_class; /* storage class */ 76 77 /* extra allocation for flat/trad phys */ 78 ddt_univ_phys_t ddle_phys[]; 79 } ddt_log_entry_t; 80 81 /* On-disk log record types. */ 82 typedef enum { 83 DLR_INVALID = 0, /* end of block marker */ 84 DLR_ENTRY = 1, /* an entry to add or replace in the log tree */ 85 } ddt_log_record_type_t; 86 87 /* On-disk log record header. */ 88 typedef struct { 89 /* 90 * dlr_info is a packed u64, use the DLR_GET/DLR_SET macros below to 91 * access it. 92 * 93 * bits 0-7: record type (ddt_log_record_type_t) 94 * bits 8-15: length of record header+payload 95 * bits 16-47: reserved, all zero 96 * bits 48-55: if type==DLR_ENTRY, storage type (ddt_type) 97 * otherwise all zero 98 * bits 56-63: if type==DLR_ENTRY, storage class (ddt_class) 99 * otherwise all zero 100 */ 101 uint64_t dlr_info; 102 uint8_t dlr_payload[]; 103 } ddt_log_record_t; 104 105 #define DLR_GET_TYPE(dlr) BF64_GET((dlr)->dlr_info, 0, 8) 106 #define DLR_SET_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 0, 8, v) 107 #define DLR_GET_RECLEN(dlr) BF64_GET((dlr)->dlr_info, 8, 16) 108 #define DLR_SET_RECLEN(dlr, v) BF64_SET((dlr)->dlr_info, 8, 16, v) 109 #define DLR_GET_ENTRY_TYPE(dlr) BF64_GET((dlr)->dlr_info, 48, 8) 110 #define DLR_SET_ENTRY_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 48, 8, v) 111 #define DLR_GET_ENTRY_CLASS(dlr) BF64_GET((dlr)->dlr_info, 56, 8) 112 #define DLR_SET_ENTRY_CLASS(dlr, v) BF64_SET((dlr)->dlr_info, 56, 8, v) 113 114 /* Payload for DLR_ENTRY. */ 115 typedef struct { 116 ddt_key_t dlre_key; 117 ddt_univ_phys_t dlre_phys[]; 118 } ddt_log_record_entry_t; 119 120 /* Log flags (ddl_flags, dlh_flags) */ 121 #define DDL_FLAG_FLUSHING (1 << 0) /* this log is being flushed */ 122 #define DDL_FLAG_CHECKPOINT (1 << 1) /* header has a checkpoint */ 123 124 /* On-disk log header, stored in the bonus buffer. */ 125 typedef struct { 126 /* 127 * dlh_info is a packed u64, use the DLH_GET/DLH_SET macros below to 128 * access it. 129 * 130 * bits 0-7: log version 131 * bits 8-15: log flags 132 * bits 16-63: reserved, all zero 133 */ 134 uint64_t dlh_info; 135 136 uint64_t dlh_length; /* log size in bytes */ 137 uint64_t dlh_first_txg; /* txg this log went active */ 138 ddt_key_t dlh_checkpoint; /* last checkpoint */ 139 } ddt_log_header_t; 140 141 #define DLH_GET_VERSION(dlh) BF64_GET((dlh)->dlh_info, 0, 8) 142 #define DLH_SET_VERSION(dlh, v) BF64_SET((dlh)->dlh_info, 0, 8, v) 143 #define DLH_GET_FLAGS(dlh) BF64_GET((dlh)->dlh_info, 8, 8) 144 #define DLH_SET_FLAGS(dlh, v) BF64_SET((dlh)->dlh_info, 8, 8, v) 145 146 /* DDT log update state */ 147 typedef struct { 148 dmu_tx_t *dlu_tx; /* tx the update is being applied to */ 149 dnode_t *dlu_dn; /* log object dnode */ 150 dmu_buf_t **dlu_dbp; /* array of block buffer pointers */ 151 int dlu_ndbp; /* number of block buffer pointers */ 152 uint16_t dlu_reclen; /* cached length of record */ 153 uint64_t dlu_block; /* block for next entry */ 154 uint64_t dlu_offset; /* offset for next entry */ 155 } ddt_log_update_t; 156 157 /* 158 * Ops vector to access a specific DDT object type. 159 */ 160 typedef struct { 161 char ddt_op_name[32]; 162 int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx, 163 boolean_t prehash); 164 int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); 165 int (*ddt_op_lookup)(objset_t *os, uint64_t object, 166 const ddt_key_t *ddk, void *phys, size_t psize); 167 int (*ddt_op_contains)(objset_t *os, uint64_t object, 168 const ddt_key_t *ddk); 169 void (*ddt_op_prefetch)(objset_t *os, uint64_t object, 170 const ddt_key_t *ddk); 171 void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object); 172 int (*ddt_op_update)(objset_t *os, uint64_t object, 173 const ddt_key_t *ddk, const void *phys, size_t psize, 174 dmu_tx_t *tx); 175 int (*ddt_op_remove)(objset_t *os, uint64_t object, 176 const ddt_key_t *ddk, dmu_tx_t *tx); 177 int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk, 178 ddt_key_t *ddk, void *phys, size_t psize); 179 int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count); 180 } ddt_ops_t; 181 182 extern const ddt_ops_t ddt_zap_ops; 183 184 /* Dedup log API */ 185 extern void ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, 186 ddt_log_update_t *dlu); 187 extern void ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *dde, 188 ddt_log_update_t *dlu); 189 extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu); 190 191 extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, 192 ddt_lightweight_entry_t *ddlwe); 193 194 extern boolean_t ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk, 195 ddt_lightweight_entry_t *ddlwe); 196 extern boolean_t ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, 197 const ddt_key_t *ddk); 198 199 extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, 200 dmu_tx_t *tx); 201 extern void ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx); 202 203 extern boolean_t ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx); 204 205 extern void ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx); 206 207 extern int ddt_log_load(ddt_t *ddt); 208 extern void ddt_log_alloc(ddt_t *ddt); 209 extern void ddt_log_free(ddt_t *ddt); 210 211 extern void ddt_log_init(void); 212 extern void ddt_log_fini(void); 213 214 /* 215 * These are only exposed so that zdb can access them. Try not to use them 216 * outside of the DDT implementation proper, and if you do, consider moving 217 * them up. 218 */ 219 220 /* 221 * We use a histogram to convert a percentage request into a 222 * cutoff value where entries older than the cutoff get pruned. 223 * 224 * The histogram bins represent hours in power-of-two increments. 225 * 16 bins covers up to four years. 226 */ 227 #define HIST_BINS 16 228 229 typedef struct ddt_age_histo { 230 uint64_t dah_entries; 231 uint64_t dah_age_histo[HIST_BINS]; 232 } ddt_age_histo_t; 233 234 void ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram); 235 236 #if defined(_KERNEL) || !defined(ZFS_DEBUG) 237 #define ddt_dump_age_histogram(histo, cutoff) ((void)0) 238 #else 239 static inline void 240 ddt_dump_age_histogram(ddt_age_histo_t *histogram, uint64_t cutoff) 241 { 242 if (histogram->dah_entries == 0) 243 return; 244 245 (void) printf("DDT prune unique class age, %llu hour cutoff\n", 246 (u_longlong_t)(gethrestime_sec() - cutoff)/3600); 247 (void) printf("%5s %9s %4s\n", "age", "blocks", "amnt"); 248 (void) printf("%5s %9s %4s\n", "-----", "---------", "----"); 249 for (int i = 0; i < HIST_BINS; i++) { 250 (void) printf("%5d %9llu %4d%%\n", 1<<i, 251 (u_longlong_t)histogram->dah_age_histo[i], 252 (int)((histogram->dah_age_histo[i] * 100) / 253 histogram->dah_entries)); 254 } 255 } 256 #endif 257 258 /* 259 * Enough room to expand DMU_POOL_DDT format for all possible DDT 260 * checksum/class/type combinations. 261 */ 262 #define DDT_NAMELEN 32 263 264 extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, 265 const ddt_univ_phys_t *ddp); 266 267 extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); 268 269 extern void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, 270 char *name); 271 extern int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, 272 uint64_t *walk, ddt_lightweight_entry_t *ddlwe); 273 extern int ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, 274 uint64_t *count); 275 extern int ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, 276 dmu_object_info_t *); 277 278 #ifdef __cplusplus 279 } 280 #endif 281 282 #endif /* _SYS_DDT_H */ 283