/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
 */

#ifndef SPDK_BDEV_NVME_H
#define SPDK_BDEV_NVME_H

#include "spdk/stdinc.h"

#include "spdk/queue.h"
#include "spdk/nvme.h"
#include "spdk/bdev_module.h"
#include "spdk/jsonrpc.h"

/* Global list of all NVMe bdev controllers, protected by g_bdev_nvme_mutex. */
TAILQ_HEAD(nvme_bdev_ctrlrs, nvme_bdev_ctrlr);
extern struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs;
extern pthread_mutex_t g_bdev_nvme_mutex;
extern bool g_bdev_nvme_module_finish;
extern struct spdk_thread *g_bdev_nvme_init_thread;

#define NVME_MAX_CONTROLLERS 1024

/* Multipath policy: active-passive routes I/O to one path at a time,
 * active-active distributes I/O across all active paths.
 */
enum bdev_nvme_multipath_policy {
	BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE,
	BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE,
};

/* Path selector used in active-active mode (round_robin or queue_depth). */
enum bdev_nvme_multipath_selector {
	BDEV_NVME_MP_SELECTOR_ROUND_ROBIN = 1,
	BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH,
};

/* Completion callback for bdev_nvme_create(); bdev_count is the number of
 * bdevs created on top of the attached controller.
 */
typedef void (*spdk_bdev_create_nvme_fn)(void *ctx, size_t bdev_count, int rc);
typedef void (*spdk_bdev_nvme_start_discovery_fn)(void *ctx, int status);
typedef void (*spdk_bdev_nvme_stop_discovery_fn)(void *ctx);

/* Per-controller options supplied when attaching (see bdev_nvme_create()
 * and bdev_nvme_get_default_ctrlr_opts()).
 */
struct nvme_ctrlr_opts {
	uint32_t prchk_flags;
	int32_t ctrlr_loss_timeout_sec;
	uint32_t reconnect_delay_sec;
	uint32_t fast_io_fail_timeout_sec;
	bool from_discovery_service;
	/* Name of the PSK or path to the file containing PSK. */
	char psk[PATH_MAX];
	const char *dhchap_key;
	const char *dhchap_ctrlr_key;
};

/* State carried across the asynchronous controller probe/attach sequence
 * started by bdev_nvme_create().
 */
struct nvme_async_probe_ctx {
	struct spdk_nvme_probe_ctx *probe_ctx;
	const char *base_name;
	/* Output array for created bdev names; capacity is max_bdevs,
	 * reported_bdevs counts entries filled in so far.
	 */
	const char **names;
	uint32_t max_bdevs;
	uint32_t reported_bdevs;
	struct spdk_poller *poller;
	struct spdk_nvme_transport_id trid;
	struct nvme_ctrlr_opts bdev_opts;
	struct spdk_nvme_ctrlr_opts drv_opts;
	spdk_bdev_create_nvme_fn cb_fn;
	void *cb_ctx;
	uint32_t populates_in_progress;
	bool ctrlr_attached;
	bool probe_done;
	bool namespaces_populated;
};

/* Per-controller view of one NVMe namespace, including its ANA state. */
struct nvme_ns {
	uint32_t id;
	struct spdk_nvme_ns *ns;
	struct nvme_ctrlr *ctrlr;
	struct nvme_bdev *bdev;
	uint32_t ana_group_id;
	enum spdk_nvme_ana_state ana_state;
	bool ana_state_updating;
	bool ana_transition_timedout;
	/* Timer for the ANA transition (ANATT) period; fires when a
	 * transition takes too long (sets ana_transition_timedout).
	 */
	struct spdk_poller *anatt_timer;
	struct nvme_async_probe_ctx *probe_ctx;
	TAILQ_ENTRY(nvme_ns) tailq;
	RB_ENTRY(nvme_ns) node;

	/**
	 * record io path stat before destroyed. Allocation of stat is
	 * decided by option io_path_stat of RPC
	 * bdev_nvme_set_options
	 */
	struct spdk_bdev_io_stat *stat;
};

struct nvme_bdev_io;
struct nvme_bdev_ctrlr;
struct nvme_bdev;
struct nvme_io_path;

/* Identity of one path to a controller: transport ID plus host ID.
 * last_failed_tsc records when this path last failed (0 presumably means
 * never failed -- TODO confirm against the .c file).
 */
struct nvme_path_id {
	struct spdk_nvme_transport_id trid;
	struct spdk_nvme_host_id hostid;
	TAILQ_ENTRY(nvme_path_id) link;
	uint64_t last_failed_tsc;
};

typedef void (*bdev_nvme_ctrlr_op_cb)(void *cb_arg, int rc);
typedef void (*nvme_ctrlr_disconnected_cb)(struct nvme_ctrlr *nvme_ctrlr);

/* Per-attachment state for one spdk_nvme_ctrlr, including reset/reconnect
 * state machine flags and the ANA log page buffer.
 */
struct nvme_ctrlr {
	/**
	 * points to pinned, physically contiguous memory region;
	 * contains 4KB IDENTIFY structure for controller which is
	 *  target for CONTROLLER IDENTIFY command during initialization
	 */
	struct spdk_nvme_ctrlr *ctrlr;
	struct nvme_path_id *active_path_id;
	int ref;

	/* State flags; NOTE(review): presumably protected by mutex below --
	 * confirm against the .c file before relying on that.
	 */
	uint32_t resetting : 1;
	uint32_t reconnect_is_delayed : 1;
	uint32_t in_failover : 1;
	uint32_t pending_failover : 1;
	uint32_t fast_io_fail_timedout : 1;
	uint32_t destruct : 1;
	uint32_t ana_log_page_updating : 1;
	uint32_t io_path_cache_clearing : 1;
	uint32_t dont_retry : 1;
	uint32_t disabled : 1;

	struct nvme_ctrlr_opts opts;

	/* Namespaces keyed for lookup by nvme_ctrlr_get_ns()/
	 * nvme_ctrlr_get_first_active_ns()/nvme_ctrlr_get_next_active_ns().
	 */
	RB_HEAD(nvme_ns_tree, nvme_ns) namespaces;

	struct spdk_opal_dev *opal_dev;

	struct spdk_poller *adminq_timer_poller;
	struct spdk_thread *thread;

	bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn;
	void *ctrlr_op_cb_arg;
	/* Poller used to check for reset/detach completion */
	struct spdk_poller *reset_detach_poller;
	struct spdk_nvme_detach_ctx *detach_ctx;

	uint64_t reset_start_tsc;
	struct spdk_poller *reconnect_delay_timer;

	nvme_ctrlr_disconnected_cb disconnected_cb;

	/** linked list pointer for device list */
	TAILQ_ENTRY(nvme_ctrlr) tailq;
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	/* All known path IDs for this controller (failover alternatives). */
	TAILQ_HEAD(nvme_paths, nvme_path_id) trids;

	uint32_t max_ana_log_page_size;
	struct spdk_nvme_ana_page *ana_log_page;
	struct spdk_nvme_ana_group_descriptor *copied_ana_desc;

	struct nvme_async_probe_ctx *probe_ctx;
	struct spdk_key *psk;
	struct spdk_key *dhchap_key;
	struct spdk_key *dhchap_ctrlr_key;

	pthread_mutex_t mutex;
};

/* Aggregates the nvme_ctrlr instances (paths) and nvme_bdevs that share
 * one controller name; element of g_nvme_bdev_ctrlrs.
 */
struct nvme_bdev_ctrlr {
	char *name;
	TAILQ_HEAD(, nvme_ctrlr) ctrlrs;
	TAILQ_HEAD(, nvme_bdev) bdevs;
	TAILQ_ENTRY(nvme_bdev_ctrlr) tailq;
};

/* Completion-error counters indexed by NVMe status code type (3 bits, so 8
 * buckets) and by [status code type][status code] (8-bit status code, so
 * 256 buckets); allocated when the nvme_error_stat option is enabled.
 */
struct nvme_error_stat {
	uint32_t status_type[8];
	uint32_t status[4][256];
};

/* One exported bdev, potentially backed by multiple namespaces (paths)
 * in nvme_ns_list when multipath is configured.
 */
struct nvme_bdev {
	struct spdk_bdev disk;
	uint32_t nsid;
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	pthread_mutex_t mutex;
	int ref;
	enum bdev_nvme_multipath_policy mp_policy;
	enum bdev_nvme_multipath_selector mp_selector;
	/* Number of I/Os routed to a path before round-robin switches. */
	uint32_t rr_min_io;
	TAILQ_HEAD(, nvme_ns) nvme_ns_list;
	bool opal;
	TAILQ_ENTRY(nvme_bdev) tailq;
	struct nvme_error_stat *err_stat;
};

/* Per-channel I/O qpair for one controller, owned by a poll group. */
struct nvme_qpair {
	struct nvme_ctrlr *ctrlr;
	struct spdk_nvme_qpair *qpair;
	struct nvme_poll_group *group;
	struct nvme_ctrlr_channel *ctrlr_ch;

	/* The following is used to update io_path cache of nvme_bdev_channels. */
	TAILQ_HEAD(, nvme_io_path) io_path_list;

	TAILQ_ENTRY(nvme_qpair) tailq;
};

/* Per-thread channel for one nvme_ctrlr. */
struct nvme_ctrlr_channel {
	struct nvme_qpair *qpair;
	/* I/Os parked here while the controller is being reset. */
	TAILQ_HEAD(, nvme_bdev_io) pending_resets;

	struct spdk_io_channel_iter *reset_iter;
	struct spdk_poller *connect_poller;
};

/* One (namespace, qpair) pairing an nvme_bdev_channel can submit I/O on. */
struct nvme_io_path {
	struct nvme_ns *nvme_ns;
	struct nvme_qpair *qpair;
	STAILQ_ENTRY(nvme_io_path) stailq;

	/* The following are used to update io_path cache of the nvme_bdev_channel. */
	struct nvme_bdev_channel *nbdev_ch;
	TAILQ_ENTRY(nvme_io_path) tailq;

	/* allocation of stat is decided by option io_path_stat of RPC bdev_nvme_set_options */
	struct spdk_bdev_io_stat *stat;
};

/* Per-thread channel for one nvme_bdev; caches the currently preferred
 * I/O path and holds I/Os waiting to be retried.
 */
struct nvme_bdev_channel {
	struct nvme_io_path *current_io_path;
	enum bdev_nvme_multipath_policy mp_policy;
	enum bdev_nvme_multipath_selector mp_selector;
	uint32_t rr_min_io;
	uint32_t rr_counter;
	STAILQ_HEAD(, nvme_io_path) io_path_list;
	TAILQ_HEAD(retry_io_head, nvme_bdev_io) retry_io_list;
	struct spdk_poller *retry_io_poller;
};

/* Per-thread poll group that polls all member qpairs and optionally
 * tracks spin time statistics.
 */
struct nvme_poll_group {
	struct spdk_nvme_poll_group *group;
	struct spdk_io_channel *accel_channel;
	struct spdk_poller *poller;
	bool collect_spin_stat;
	uint64_t spin_ticks;
	uint64_t start_ticks;
	uint64_t end_ticks;
	TAILQ_HEAD(, nvme_qpair) qpair_list;
};

void nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path);

struct nvme_ctrlr *nvme_ctrlr_get_by_name(const char *name);

struct nvme_ctrlr *nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr,
		uint16_t cntlid);

struct nvme_bdev_ctrlr *nvme_bdev_ctrlr_get_by_name(const char *name);

typedef void (*nvme_bdev_ctrlr_for_each_fn)(struct nvme_bdev_ctrlr *nbdev_ctrlr, void *ctx);

void nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx);

void nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid,
			      struct spdk_json_write_ctx *w);

void nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr);

struct nvme_ns *nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid);
struct nvme_ns *nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr);
struct nvme_ns *nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns);

/* Action taken when an I/O or admin command times out
 * (see spdk_bdev_nvme_opts.action_on_timeout).
 */
enum spdk_bdev_timeout_action {
	SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE = 0,
	SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET,
	SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT,
};

/* Module-wide options set via the bdev_nvme_set_options RPC. */
struct spdk_bdev_nvme_opts {
	enum spdk_bdev_timeout_action action_on_timeout;
	uint64_t timeout_us;
	uint64_t timeout_admin_us;
	uint32_t keep_alive_timeout_ms;
	/* The number of attempts per I/O in the transport layer before an I/O fails. */
	uint32_t transport_retry_count;
	uint32_t arbitration_burst;
	uint32_t low_priority_weight;
	uint32_t medium_priority_weight;
	uint32_t high_priority_weight;
	uint64_t nvme_adminq_poll_period_us;
	uint64_t nvme_ioq_poll_period_us;
	uint32_t io_queue_requests;
	bool delay_cmd_submit;
	/* The number of attempts per I/O in the bdev layer before an I/O fails. */
	int32_t bdev_retry_count;
	uint8_t transport_ack_timeout;
	int32_t ctrlr_loss_timeout_sec;
	uint32_t reconnect_delay_sec;
	uint32_t fast_io_fail_timeout_sec;
	bool disable_auto_failback;
	bool generate_uuids;
	/* Type of Service - RDMA only */
	uint8_t transport_tos;
	bool nvme_error_stat;
	uint32_t rdma_srq_size;
	bool io_path_stat;
	bool allow_accel_sequence;
	uint32_t rdma_max_cq_size;
	uint16_t rdma_cm_event_timeout_ms;
	uint32_t dhchap_digests;
	uint32_t dhchap_dhgroups;
};

struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch);
void bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts);
int bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts);
int bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx);

void bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts);

/* Attach the controller at trid and create bdevs on top of it; up to count
 * created bdev names are stored into names[], and cb_fn is invoked when
 * the asynchronous attach/populate sequence completes.
 */
int bdev_nvme_create(struct spdk_nvme_transport_id *trid,
		     const char *base_name,
		     const char **names,
		     uint32_t count,
		     spdk_bdev_create_nvme_fn cb_fn,
		     void *cb_ctx,
		     struct spdk_nvme_ctrlr_opts *drv_opts,
		     struct nvme_ctrlr_opts *bdev_opts,
		     bool multipath);

int bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, const char *base_name,
			      struct spdk_nvme_ctrlr_opts *drv_opts, struct nvme_ctrlr_opts *bdev_opts,
			      uint64_t timeout, bool from_mdns,
			      spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx);
int bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn,
			     void *cb_ctx);
void bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w);

int bdev_nvme_start_mdns_discovery(const char *base_name,
				   const char *svcname,
				   struct spdk_nvme_ctrlr_opts *drv_opts,
				   struct nvme_ctrlr_opts *bdev_opts);
int bdev_nvme_stop_mdns_discovery(const char *name);
void bdev_nvme_get_mdns_discovery_info(struct spdk_jsonrpc_request *request);
void bdev_nvme_mdns_discovery_config_json(struct spdk_json_write_ctx *w);

struct spdk_nvme_ctrlr *bdev_nvme_get_ctrlr(struct spdk_bdev *bdev);

typedef void (*bdev_nvme_delete_done_fn)(void *ctx, int rc);

/**
 * Delete NVMe controller with all bdevs on top of it, or delete the specified path
 * if there is any alternative path. Requires to pass name of NVMe controller.
 *
 * \param name NVMe controller name
 * \param path_id The specified path to remove (optional)
 * \param delete_done Callback function on delete complete (optional)
 * \param delete_done_ctx Context passed to callback (optional)
 * \return zero on success,
 *	   -EINVAL on wrong parameters or
 *	   -ENODEV if controller is not found or
 *	   -ENOMEM on no memory
 */
int bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id,
		     bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx);

enum nvme_ctrlr_op {
	NVME_CTRLR_OP_RESET = 1,
	NVME_CTRLR_OP_ENABLE,
	NVME_CTRLR_OP_DISABLE,
};

/**
 * Perform specified operation on an NVMe controller.
 *
 * NOTE: The callback function is always called after this function returns except for
 * out of memory cases.
 *
 * \param nvme_ctrlr The specified NVMe controller to operate
 * \param op Operation code
 * \param cb_fn Function to be called back after operation completes
 * \param cb_arg Argument for callback function
 */
void nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op,
		       bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg);

/**
 * Perform specified operation on all NVMe controllers in an NVMe bdev controller.
 *
 * NOTE: The callback function is always called after this function returns except for
 * out of memory cases.
 *
 * \param nbdev_ctrlr The specified NVMe bdev controller to operate
 * \param op Operation code
 * \param cb_fn Function to be called back after operation completes
 * \param cb_arg Argument for callback function
 */
void nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op,
			    bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg);

typedef void (*bdev_nvme_set_preferred_path_cb)(void *cb_arg, int rc);

/**
 * Set the preferred I/O path for an NVMe bdev in multipath mode.
 *
 * NOTE: This function does not support NVMe bdevs in failover mode.
 *
 * \param name NVMe bdev name
 * \param cntlid NVMe-oF controller ID
 * \param cb_fn Function to be called back after completion.
 * \param cb_arg Argument for callback function.
 */
void bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid,
				  bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg);

typedef void (*bdev_nvme_set_multipath_policy_cb)(void *cb_arg, int rc);

/**
 * Set multipath policy of the NVMe bdev.
 *
 * \param name NVMe bdev name
 * \param policy Multipath policy (active-passive or active-active)
 * \param selector Multipath selector (round_robin, queue_depth)
 * \param rr_min_io Number of IO to route to a path before switching to another for round-robin
 * \param cb_fn Function to be called back after completion.
 */
void bdev_nvme_set_multipath_policy(const char *name,
				    enum bdev_nvme_multipath_policy policy,
				    enum bdev_nvme_multipath_selector selector,
				    uint32_t rr_min_io,
				    bdev_nvme_set_multipath_policy_cb cb_fn,
				    void *cb_arg);

#endif /* SPDK_BDEV_NVME_H */