1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved. 6 */ 7 8 #ifndef SPDK_BDEV_NVME_H 9 #define SPDK_BDEV_NVME_H 10 11 #include "spdk/stdinc.h" 12 13 #include "spdk/queue.h" 14 #include "spdk/nvme.h" 15 #include "spdk/bdev_module.h" 16 #include "spdk/jsonrpc.h" 17 18 TAILQ_HEAD(nvme_bdev_ctrlrs, nvme_bdev_ctrlr); 19 extern struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs; 20 extern pthread_mutex_t g_bdev_nvme_mutex; 21 extern bool g_bdev_nvme_module_finish; 22 extern struct spdk_thread *g_bdev_nvme_init_thread; 23 24 #define NVME_MAX_CONTROLLERS 1024 25 26 enum bdev_nvme_multipath_policy { 27 BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE, 28 BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE, 29 }; 30 31 enum bdev_nvme_multipath_selector { 32 BDEV_NVME_MP_SELECTOR_ROUND_ROBIN = 1, 33 BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH, 34 }; 35 36 typedef void (*spdk_bdev_create_nvme_fn)(void *ctx, size_t bdev_count, int rc); 37 typedef void (*spdk_bdev_nvme_start_discovery_fn)(void *ctx, int status); 38 typedef void (*spdk_bdev_nvme_stop_discovery_fn)(void *ctx); 39 40 struct nvme_ctrlr_opts { 41 uint32_t prchk_flags; 42 int32_t ctrlr_loss_timeout_sec; 43 uint32_t reconnect_delay_sec; 44 uint32_t fast_io_fail_timeout_sec; 45 bool from_discovery_service; 46 }; 47 48 struct nvme_async_probe_ctx { 49 struct spdk_nvme_probe_ctx *probe_ctx; 50 const char *base_name; 51 const char **names; 52 uint32_t count; 53 struct spdk_poller *poller; 54 struct spdk_nvme_transport_id trid; 55 struct nvme_ctrlr_opts bdev_opts; 56 struct spdk_nvme_ctrlr_opts drv_opts; 57 spdk_bdev_create_nvme_fn cb_fn; 58 void *cb_ctx; 59 uint32_t populates_in_progress; 60 bool ctrlr_attached; 61 bool probe_done; 62 bool namespaces_populated; 63 }; 64 65 struct nvme_ns { 66 uint32_t id; 67 struct spdk_nvme_ns *ns; 68 struct nvme_ctrlr *ctrlr; 69 struct nvme_bdev *bdev; 70 uint32_t ana_group_id; 71 enum spdk_nvme_ana_state ana_state; 72 bool ana_state_updating; 73 bool ana_transition_timedout; 74 struct spdk_poller *anatt_timer; 75 struct nvme_async_probe_ctx *probe_ctx; 76 TAILQ_ENTRY(nvme_ns) tailq; 77 RB_ENTRY(nvme_ns) node; 78 79 /** 80 * record io path stat before destroyed. Allocation of stat is 81 * decided by option io_path_stat of RPC 82 * bdev_nvme_set_options 83 */ 84 struct spdk_bdev_io_stat *stat; 85 }; 86 87 struct nvme_bdev_io; 88 struct nvme_bdev_ctrlr; 89 struct nvme_bdev; 90 struct nvme_io_path; 91 92 struct nvme_path_id { 93 struct spdk_nvme_transport_id trid; 94 struct spdk_nvme_host_id hostid; 95 TAILQ_ENTRY(nvme_path_id) link; 96 bool is_failed; 97 }; 98 99 typedef void (*bdev_nvme_reset_cb)(void *cb_arg, bool success); 100 typedef void (*nvme_ctrlr_disconnected_cb)(struct nvme_ctrlr *nvme_ctrlr); 101 102 struct nvme_ctrlr { 103 /** 104 * points to pinned, physically contiguous memory region; 105 * contains 4KB IDENTIFY structure for controller which is 106 * target for CONTROLLER IDENTIFY command during initialization 107 */ 108 struct spdk_nvme_ctrlr *ctrlr; 109 struct nvme_path_id *active_path_id; 110 int ref; 111 112 uint32_t resetting : 1; 113 uint32_t reconnect_is_delayed : 1; 114 uint32_t fast_io_fail_timedout : 1; 115 uint32_t destruct : 1; 116 uint32_t ana_log_page_updating : 1; 117 uint32_t io_path_cache_clearing : 1; 118 uint32_t dont_retry : 1; 119 120 struct nvme_ctrlr_opts opts; 121 122 RB_HEAD(nvme_ns_tree, nvme_ns) namespaces; 123 124 struct spdk_opal_dev *opal_dev; 125 126 struct spdk_poller *adminq_timer_poller; 127 struct spdk_thread *thread; 128 129 bdev_nvme_reset_cb reset_cb_fn; 130 void *reset_cb_arg; 131 /* Poller used to check for reset/detach completion */ 132 struct spdk_poller *reset_detach_poller; 133 struct spdk_nvme_detach_ctx *detach_ctx; 134 135 uint64_t reset_start_tsc; 136 struct spdk_poller *reconnect_delay_timer; 137 138 nvme_ctrlr_disconnected_cb disconnected_cb; 139 140 /** linked list pointer for device list */ 141 TAILQ_ENTRY(nvme_ctrlr) tailq; 142 struct nvme_bdev_ctrlr *nbdev_ctrlr; 143 144 TAILQ_HEAD(nvme_paths, nvme_path_id) trids; 145 146 uint32_t max_ana_log_page_size; 147 struct spdk_nvme_ana_page *ana_log_page; 148 struct spdk_nvme_ana_group_descriptor *copied_ana_desc; 149 150 struct nvme_async_probe_ctx *probe_ctx; 151 152 pthread_mutex_t mutex; 153 }; 154 155 struct nvme_bdev_ctrlr { 156 char *name; 157 TAILQ_HEAD(, nvme_ctrlr) ctrlrs; 158 TAILQ_HEAD(, nvme_bdev) bdevs; 159 TAILQ_ENTRY(nvme_bdev_ctrlr) tailq; 160 }; 161 162 struct nvme_error_stat { 163 uint32_t status_type[8]; 164 uint32_t status[4][256]; 165 }; 166 167 struct nvme_bdev { 168 struct spdk_bdev disk; 169 uint32_t nsid; 170 struct nvme_bdev_ctrlr *nbdev_ctrlr; 171 pthread_mutex_t mutex; 172 int ref; 173 enum bdev_nvme_multipath_policy mp_policy; 174 enum bdev_nvme_multipath_selector mp_selector; 175 uint32_t rr_min_io; 176 TAILQ_HEAD(, nvme_ns) nvme_ns_list; 177 bool opal; 178 TAILQ_ENTRY(nvme_bdev) tailq; 179 struct nvme_error_stat *err_stat; 180 }; 181 182 struct nvme_qpair { 183 struct nvme_ctrlr *ctrlr; 184 struct spdk_nvme_qpair *qpair; 185 struct nvme_poll_group *group; 186 struct nvme_ctrlr_channel *ctrlr_ch; 187 188 /* The following is used to update io_path cache of nvme_bdev_channels. */ 189 TAILQ_HEAD(, nvme_io_path) io_path_list; 190 191 TAILQ_ENTRY(nvme_qpair) tailq; 192 }; 193 194 struct nvme_ctrlr_channel { 195 struct nvme_qpair *qpair; 196 TAILQ_HEAD(, spdk_bdev_io) pending_resets; 197 198 struct spdk_io_channel_iter *reset_iter; 199 }; 200 201 struct nvme_io_path { 202 struct nvme_ns *nvme_ns; 203 struct nvme_qpair *qpair; 204 STAILQ_ENTRY(nvme_io_path) stailq; 205 206 /* The following are used to update io_path cache of the nvme_bdev_channel. */ 207 struct nvme_bdev_channel *nbdev_ch; 208 TAILQ_ENTRY(nvme_io_path) tailq; 209 210 /* allocation of stat is decided by option io_path_stat of RPC bdev_nvme_set_options */ 211 struct spdk_bdev_io_stat *stat; 212 }; 213 214 struct nvme_bdev_channel { 215 struct nvme_io_path *current_io_path; 216 enum bdev_nvme_multipath_policy mp_policy; 217 enum bdev_nvme_multipath_selector mp_selector; 218 uint32_t rr_min_io; 219 uint32_t rr_counter; 220 STAILQ_HEAD(, nvme_io_path) io_path_list; 221 TAILQ_HEAD(retry_io_head, spdk_bdev_io) retry_io_list; 222 struct spdk_poller *retry_io_poller; 223 }; 224 225 struct nvme_poll_group { 226 struct spdk_nvme_poll_group *group; 227 struct spdk_io_channel *accel_channel; 228 struct spdk_poller *poller; 229 bool collect_spin_stat; 230 uint64_t spin_ticks; 231 uint64_t start_ticks; 232 uint64_t end_ticks; 233 TAILQ_HEAD(, nvme_qpair) qpair_list; 234 }; 235 236 void nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path); 237 238 struct nvme_ctrlr *nvme_ctrlr_get_by_name(const char *name); 239 240 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr_get_by_name(const char *name); 241 242 typedef void (*nvme_bdev_ctrlr_for_each_fn)(struct nvme_bdev_ctrlr *nbdev_ctrlr, void *ctx); 243 244 void nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx); 245 246 void nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, 247 struct spdk_json_write_ctx *w); 248 249 void nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr); 250 251 struct nvme_ns *nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid); 252 struct nvme_ns *nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr); 253 struct nvme_ns *nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns); 254 255 enum spdk_bdev_timeout_action { 256 SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE = 0, 257 SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET, 258 SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT, 259 }; 260 261 struct spdk_bdev_nvme_opts { 262 enum spdk_bdev_timeout_action action_on_timeout; 263 uint64_t timeout_us; 264 uint64_t timeout_admin_us; 265 uint32_t keep_alive_timeout_ms; 266 /* The number of attempts per I/O in the transport layer before an I/O fails. */ 267 uint32_t transport_retry_count; 268 uint32_t arbitration_burst; 269 uint32_t low_priority_weight; 270 uint32_t medium_priority_weight; 271 uint32_t high_priority_weight; 272 uint64_t nvme_adminq_poll_period_us; 273 uint64_t nvme_ioq_poll_period_us; 274 uint32_t io_queue_requests; 275 bool delay_cmd_submit; 276 /* The number of attempts per I/O in the bdev layer before an I/O fails. */ 277 int32_t bdev_retry_count; 278 uint8_t transport_ack_timeout; 279 int32_t ctrlr_loss_timeout_sec; 280 uint32_t reconnect_delay_sec; 281 uint32_t fast_io_fail_timeout_sec; 282 bool disable_auto_failback; 283 bool generate_uuids; 284 /* Type of Service - RDMA only */ 285 uint8_t transport_tos; 286 bool nvme_error_stat; 287 uint32_t rdma_srq_size; 288 bool io_path_stat; 289 }; 290 291 struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch); 292 void bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts); 293 int bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts); 294 int bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx); 295 296 void bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts); 297 298 int bdev_nvme_create(struct spdk_nvme_transport_id *trid, 299 const char *base_name, 300 const char **names, 301 uint32_t count, 302 spdk_bdev_create_nvme_fn cb_fn, 303 void *cb_ctx, 304 struct spdk_nvme_ctrlr_opts *drv_opts, 305 struct nvme_ctrlr_opts *bdev_opts, 306 bool multipath); 307 308 int bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, const char *base_name, 309 struct spdk_nvme_ctrlr_opts *drv_opts, struct nvme_ctrlr_opts *bdev_opts, 310 uint64_t timeout, bool from_mdns, 311 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx); 312 int bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, 313 void *cb_ctx); 314 void bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w); 315 316 int bdev_nvme_start_mdns_discovery(const char *base_name, 317 const char *svcname, 318 struct spdk_nvme_ctrlr_opts *drv_opts, 319 struct nvme_ctrlr_opts *bdev_opts); 320 int bdev_nvme_stop_mdns_discovery(const char *name); 321 void bdev_nvme_get_mdns_discovery_info(struct spdk_jsonrpc_request *request); 322 void bdev_nvme_mdns_discovery_config_json(struct spdk_json_write_ctx *w); 323 324 struct spdk_nvme_ctrlr *bdev_nvme_get_ctrlr(struct spdk_bdev *bdev); 325 326 /** 327 * Delete NVMe controller with all bdevs on top of it, or delete the specified path 328 * if there is any alternative path. Requires to pass name of NVMe controller. 329 * 330 * \param name NVMe controller name 331 * \param path_id The specified path to remove (optional) 332 * \return zero on success, -EINVAL on wrong parameters or -ENODEV if controller is not found 333 */ 334 int bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id); 335 336 /** 337 * Reset NVMe controller. 338 * 339 * \param nvme_ctrlr The specified NVMe controller to reset 340 * \param cb_fn Function to be called back after reset completes 341 * \param cb_arg Argument for callback function 342 * \return zero on success. Negated errno on the following error conditions: 343 * -ENXIO: controller is being destroyed. 344 * -EBUSY: controller is already being reset. 345 */ 346 int bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg); 347 348 typedef void (*bdev_nvme_set_preferred_path_cb)(void *cb_arg, int rc); 349 350 /** 351 * Set the preferred I/O path for an NVMe bdev in multipath mode. 352 * 353 * NOTE: This function does not support NVMe bdevs in failover mode. 354 * 355 * \param name NVMe bdev name 356 * \param cntlid NVMe-oF controller ID 357 * \param cb_fn Function to be called back after completion. 358 * \param cb_arg Argument for callback function. 359 */ 360 void bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 361 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg); 362 363 typedef void (*bdev_nvme_set_multipath_policy_cb)(void *cb_arg, int rc); 364 365 /** 366 * Set multipath policy of the NVMe bdev. 367 * 368 * \param name NVMe bdev name 369 * \param policy Multipath policy (active-passive or active-active) 370 * \param selector Multipath selector (round_robin, queue_depth) 371 * \param rr_min_io Number of IO to route to a path before switching to another for round-robin 372 * \param cb_fn Function to be called back after completion. 373 */ 374 void bdev_nvme_set_multipath_policy(const char *name, 375 enum bdev_nvme_multipath_policy policy, 376 enum bdev_nvme_multipath_selector selector, 377 uint32_t rr_min_io, 378 bdev_nvme_set_multipath_policy_cb cb_fn, 379 void *cb_arg); 380 381 #endif /* SPDK_BDEV_NVME_H */ 382