xref: /spdk/module/bdev/nvme/bdev_nvme.h (revision 318515b44ec8b67f83bcc9ca83f0c7d5ea919e62)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2016 Intel Corporation. All rights reserved.
3  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
4  *   Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5  *   Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
6  */
7 
8 #ifndef SPDK_BDEV_NVME_H
9 #define SPDK_BDEV_NVME_H
10 
11 #include "spdk/stdinc.h"
12 
13 #include "spdk/queue.h"
14 #include "spdk/nvme.h"
15 #include "spdk/bdev_module.h"
16 #include "spdk/module/bdev/nvme.h"
17 #include "spdk/jsonrpc.h"
18 
/* Module-global list of all nvme_bdev_ctrlr instances. */
TAILQ_HEAD(nvme_bdev_ctrlrs, nvme_bdev_ctrlr);
extern struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs;
/* NOTE(review): presumably guards g_nvme_bdev_ctrlrs and related module
 * state — confirm locking rules in bdev_nvme.c. */
extern pthread_mutex_t g_bdev_nvme_mutex;
/* Set once the module finish sequence has started. */
extern bool g_bdev_nvme_module_finish;
/* SPDK thread on which this module was initialized. */
extern struct spdk_thread *g_bdev_nvme_init_thread;

/* Upper bound on the number of NVMe controllers handled by this module. */
#define NVME_MAX_CONTROLLERS 1024

/* Completion callback for bdev_nvme_start_discovery(); status is 0 on success,
 * negative errno otherwise. */
typedef void (*spdk_bdev_nvme_start_discovery_fn)(void *ctx, int status);
/* Completion callback for bdev_nvme_stop_discovery(). */
typedef void (*spdk_bdev_nvme_stop_discovery_fn)(void *ctx);
29 
/*
 * Context for an asynchronous controller attach. Tracks the probe state,
 * the options to apply, and the caller's completion callback until both
 * controller attach and namespace population have finished.
 */
struct nvme_async_probe_ctx {
	struct spdk_nvme_probe_ctx *probe_ctx;
	/* Base name for the controller; created bdev names derive from it. */
	const char *base_name;
	/* Caller-provided array receiving created bdev names; capacity is
	 * max_bdevs, reported_bdevs counts entries filled so far. */
	const char **names;
	uint32_t max_bdevs;
	uint32_t reported_bdevs;
	/* Poller driving spdk_nvme_probe_poll_async() — TODO confirm. */
	struct spdk_poller *poller;
	/* Transport ID of the controller being attached. */
	struct spdk_nvme_transport_id trid;
	/* Module-level (bdev) and driver-level (NVMe) options to apply. */
	struct spdk_bdev_nvme_ctrlr_opts bdev_opts;
	struct spdk_nvme_ctrlr_opts drv_opts;
	/* Completion callback and its argument. */
	spdk_bdev_nvme_create_cb cb_fn;
	void *cb_ctx;
	/* Number of namespace populations still outstanding. */
	uint32_t populates_in_progress;
	bool ctrlr_attached;
	bool probe_done;
	bool namespaces_populated;
};
47 
/*
 * Per-namespace state: ties an SPDK NVMe namespace handle to its owning
 * controller and to the nvme_bdev exposed on top of it, and caches the
 * namespace's ANA (Asymmetric Namespace Access) information.
 */
struct nvme_ns {
	/* Namespace ID (NSID). */
	uint32_t			id;
	struct spdk_nvme_ns		*ns;
	/* Owning controller. */
	struct nvme_ctrlr		*ctrlr;
	/* Bdev exposed for this namespace, if one has been created. */
	struct nvme_bdev		*bdev;
	/* ANA group and last known ANA state for this namespace. */
	uint32_t			ana_group_id;
	enum spdk_nvme_ana_state	ana_state;
	bool				ana_state_updating;
	bool				ana_transition_timedout;
	/* NOTE(review): presumably enforces the ANA transition time (ANATT)
	 * limit — confirm in bdev_nvme.c. */
	struct spdk_poller		*anatt_timer;
	/* Probe context while this namespace is being populated. */
	struct nvme_async_probe_ctx	*probe_ctx;
	/* Linkage into nvme_bdev.nvme_ns_list (tailq) and
	 * nvme_ctrlr.namespaces (RB tree keyed by NSID — TODO confirm key). */
	TAILQ_ENTRY(nvme_ns)		tailq;
	RB_ENTRY(nvme_ns)		node;

	/**
	 * record io path stat before destroyed. Allocation of stat is
	 * decided by option io_path_stat of RPC
	 * bdev_nvme_set_options
	 */
	struct spdk_bdev_io_stat	*stat;
};
69 
/* Forward declarations; full definitions live below or in bdev_nvme.c. */
struct nvme_bdev_io;
struct nvme_bdev_ctrlr;
struct nvme_bdev;
struct nvme_io_path;
struct nvme_ctrlr_channel_iter;
struct nvme_bdev_channel_iter;

/*
 * One path (transport address pair) to a controller. An nvme_ctrlr keeps a
 * list of these for failover (nvme_ctrlr.trids).
 */
struct nvme_path_id {
	struct spdk_nvme_transport_id		trid;
	struct spdk_nvme_host_id		hostid;
	TAILQ_ENTRY(nvme_path_id)		link;
	/* Tick count of the last failure on this path; 0 if never failed —
	 * TODO confirm the zero convention. */
	uint64_t				last_failed_tsc;
};

/* Completion callback for controller operations (reset/enable/disable);
 * rc is 0 on success. */
typedef void (*bdev_nvme_ctrlr_op_cb)(void *cb_arg, int rc);
/* Invoked when a controller finishes disconnecting. */
typedef void (*nvme_ctrlr_disconnected_cb)(struct nvme_ctrlr *nvme_ctrlr);
86 
/*
 * State for one attached NVMe controller (one path). Multiple nvme_ctrlr
 * instances that expose the same subsystem are grouped under an
 * nvme_bdev_ctrlr. Fields are protected by the embedded mutex unless
 * otherwise noted — NOTE(review): confirm per-field locking in bdev_nvme.c.
 */
struct nvme_ctrlr {
	/**
	 * points to pinned, physically contiguous memory region;
	 * contains 4KB IDENTIFY structure for controller which is
	 *  target for CONTROLLER IDENTIFY command during initialization
	 */
	struct spdk_nvme_ctrlr			*ctrlr;
	/* Currently active entry of the trids list. */
	struct nvme_path_id			*active_path_id;
	/* Reference count; the controller is destructed when it drops to 0 —
	 * TODO confirm. */
	int					ref;

	/* State flags packed as bitfields. */
	uint32_t				resetting : 1;
	uint32_t				reconnect_is_delayed : 1;
	uint32_t				in_failover : 1;
	uint32_t				pending_failover : 1;
	uint32_t				fast_io_fail_timedout : 1;
	uint32_t				destruct : 1;
	uint32_t				ana_log_page_updating : 1;
	uint32_t				io_path_cache_clearing : 1;
	uint32_t				dont_retry : 1;
	uint32_t				disabled : 1;

	struct spdk_bdev_nvme_ctrlr_opts	opts;

	/* Active namespaces, keyed by NSID — TODO confirm comparator. */
	RB_HEAD(nvme_ns_tree, nvme_ns)		namespaces;

	/* Opal (self-encrypting drive) device handle, if supported. */
	struct spdk_opal_dev			*opal_dev;

	/* Poller that processes admin queue completions. */
	struct spdk_poller			*adminq_timer_poller;
	/* Thread that owns this controller's state. */
	struct spdk_thread			*thread;
	struct spdk_interrupt			*intr;

	/* Callback for the in-flight controller operation (see nvme_ctrlr_op). */
	bdev_nvme_ctrlr_op_cb			ctrlr_op_cb_fn;
	void					*ctrlr_op_cb_arg;
	/* Poller used to check for reset/detach completion */
	struct spdk_poller			*reset_detach_poller;
	struct spdk_nvme_detach_ctx		*detach_ctx;

	/* Tick when the current reset started. */
	uint64_t				reset_start_tsc;
	/* Timer delaying reconnect attempts (reconnect_delay_sec). */
	struct spdk_poller			*reconnect_delay_timer;

	nvme_ctrlr_disconnected_cb		disconnected_cb;

	/* I/Os queued to be resubmitted after the controller resets. */
	TAILQ_HEAD(, nvme_bdev_io)		pending_resets;

	/** linked list pointer for device list */
	TAILQ_ENTRY(nvme_ctrlr)			tailq;
	/* Parent grouping of controllers for the same subsystem. */
	struct nvme_bdev_ctrlr			*nbdev_ctrlr;

	/* Alternate paths for failover; active_path_id points into this list. */
	TAILQ_HEAD(nvme_paths, nvme_path_id)	trids;

	/* ANA log page buffer and a scratch copy of one group descriptor. */
	uint32_t				max_ana_log_page_size;
	struct spdk_nvme_ana_page		*ana_log_page;
	struct spdk_nvme_ana_group_descriptor	*copied_ana_desc;

	struct nvme_async_probe_ctx		*probe_ctx;
	/* TLS PSK and DH-HMAC-CHAP authentication keys. */
	struct spdk_key				*psk;
	struct spdk_key				*dhchap_key;
	struct spdk_key				*dhchap_ctrlr_key;

	pthread_mutex_t				mutex;
};
148 
/*
 * Groups the nvme_ctrlr instances (multipath members) that expose the same
 * NVM subsystem, together with the bdevs built on their namespaces.
 * Listed in g_nvme_bdev_ctrlrs.
 */
struct nvme_bdev_ctrlr {
	/* Unique name; used for lookup via nvme_bdev_ctrlr_get_by_name(). */
	char				*name;
	TAILQ_HEAD(, nvme_ctrlr)	ctrlrs;
	TAILQ_HEAD(, nvme_bdev)		bdevs;
	TAILQ_ENTRY(nvme_bdev_ctrlr)	tailq;
};
155 
/*
 * Counters of NVMe error completions. Allocated per bdev only when the
 * nvme_error_stat option is enabled — NOTE(review): confirm against
 * spdk_bdev_nvme_opts.nvme_error_stat handling in bdev_nvme.c.
 */
struct nvme_error_stat {
	/* Indexed by NVMe status code type (SCT, 3 bits -> 8 values). */
	uint32_t status_type[8];
	/* Per-status-code counters; first index presumably covers SCTs 0-3
	 * only — TODO confirm why it is 4, not 8. */
	uint32_t status[4][256];
};
160 
/*
 * The bdev exposed for a namespace. In multipath mode it aggregates one
 * nvme_ns per controller path (nvme_ns_list).
 */
struct nvme_bdev {
	/* Embedded generic bdev; must stay first for container_of-style
	 * casts — TODO confirm that convention is relied upon. */
	struct spdk_bdev			disk;
	/* Namespace ID this bdev represents. */
	uint32_t				nsid;
	struct nvme_bdev_ctrlr			*nbdev_ctrlr;
	pthread_mutex_t				mutex;
	int					ref;
	/* Multipath policy/selector and round-robin batch size. */
	enum spdk_bdev_nvme_multipath_policy	mp_policy;
	enum spdk_bdev_nvme_multipath_selector	mp_selector;
	uint32_t				rr_min_io;
	/* One nvme_ns per path providing this namespace. */
	TAILQ_HEAD(, nvme_ns)			nvme_ns_list;
	/* True if the underlying device supports Opal. */
	bool					opal;
	/* Linkage into nvme_bdev_ctrlr.bdevs. */
	TAILQ_ENTRY(nvme_bdev)			tailq;
	/* Error counters; NULL unless nvme_error_stat is enabled. */
	struct nvme_error_stat			*err_stat;
};
175 
/*
 * Wraps one I/O queue pair of a controller and ties it to its poll group
 * and per-channel state.
 */
struct nvme_qpair {
	struct nvme_ctrlr		*ctrlr;
	struct spdk_nvme_qpair		*qpair;
	struct nvme_poll_group		*group;
	struct nvme_ctrlr_channel	*ctrlr_ch;

	/* The following is used to update io_path cache of nvme_bdev_channels. */
	TAILQ_HEAD(, nvme_io_path)	io_path_list;

	/* Linkage into nvme_poll_group.qpair_list. */
	TAILQ_ENTRY(nvme_qpair)		tailq;
};
187 
/* Per-thread channel state for an nvme_ctrlr. */
struct nvme_ctrlr_channel {
	struct nvme_qpair		*qpair;

	/* Iterator paused on this channel during a reset, if any —
	 * TODO confirm resume semantics. */
	struct nvme_ctrlr_channel_iter	*reset_iter;
	/* Poller waiting for the qpair to finish connecting. */
	struct spdk_poller		*connect_poller;
};
194 
/*
 * One selectable I/O path: the pairing of a namespace with a qpair on a
 * particular channel. nvme_bdev_channels choose among these per I/O.
 */
struct nvme_io_path {
	struct nvme_ns			*nvme_ns;
	struct nvme_qpair		*qpair;
	/* Linkage into nvme_bdev_channel.io_path_list. */
	STAILQ_ENTRY(nvme_io_path)	stailq;

	/* The following are used to update io_path cache of the nvme_bdev_channel. */
	struct nvme_bdev_channel	*nbdev_ch;
	/* Linkage into nvme_qpair.io_path_list. */
	TAILQ_ENTRY(nvme_io_path)	tailq;

	/* allocation of stat is decided by option io_path_stat of RPC bdev_nvme_set_options */
	struct spdk_bdev_io_stat	*stat;
};
207 
/*
 * Per-thread channel state for an nvme_bdev: the candidate I/O paths,
 * the multipath selection state, and the retry queue.
 */
struct nvme_bdev_channel {
	/* Cached preferred path; invalidated when path states change —
	 * TODO confirm invalidation points. */
	struct nvme_io_path			*current_io_path;
	/* Multipath policy/selector and round-robin counters (rr_min_io is
	 * the per-path batch size, rr_counter the running count). */
	enum spdk_bdev_nvme_multipath_policy	mp_policy;
	enum spdk_bdev_nvme_multipath_selector	mp_selector;
	uint32_t				rr_min_io;
	uint32_t				rr_counter;
	STAILQ_HEAD(, nvme_io_path)		io_path_list;
	/* I/Os waiting to be retried, drained by retry_io_poller. */
	TAILQ_HEAD(retry_io_head, nvme_bdev_io)	retry_io_list;
	struct spdk_poller			*retry_io_poller;
	bool					resetting;
};
219 
/*
 * Per-thread poll group: polls the qpairs of all controllers used on this
 * thread and optionally accounts spin time.
 */
struct nvme_poll_group {
	struct spdk_nvme_poll_group		*group;
	/* Accel framework channel used for offloads (e.g. copy/CRC) —
	 * TODO confirm which operations. */
	struct spdk_io_channel			*accel_channel;
	struct spdk_poller			*poller;
	/* Spin-time accounting; ticks accumulated between start/end marks. */
	bool					collect_spin_stat;
	uint64_t				spin_ticks;
	uint64_t				start_ticks;
	uint64_t				end_ticks;
	TAILQ_HEAD(, nvme_qpair)		qpair_list;
	/* Interrupt handle when running in interrupt mode. */
	struct spdk_interrupt			*intr;
};
231 
/* Write a JSON description of the given I/O path. */
void nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path);

/* Look up a controller by name; presumably returns NULL if not found —
 * TODO confirm. */
struct nvme_ctrlr *nvme_ctrlr_get_by_name(const char *name);

/*
 * Channel iteration over an nvme_ctrlr. NOTE(review): this mirrors
 * spdk_for_each_channel() — fn runs once per channel and is expected to call
 * nvme_ctrlr_for_each_channel_continue() to advance; cpl runs when iteration
 * completes. Confirm exact threading semantics in bdev_nvme.c.
 */
typedef void (*nvme_ctrlr_for_each_channel_msg)(struct nvme_ctrlr_channel_iter *iter,
		struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_ctrlr_channel *ctrlr_ch,
		void *ctx);

typedef void (*nvme_ctrlr_for_each_channel_done)(struct nvme_ctrlr *nvme_ctrlr,
		void *ctx, int status);

void nvme_ctrlr_for_each_channel(struct nvme_ctrlr *nvme_ctrlr,
				 nvme_ctrlr_for_each_channel_msg fn, void *ctx,
				 nvme_ctrlr_for_each_channel_done cpl);

void nvme_ctrlr_for_each_channel_continue(struct nvme_ctrlr_channel_iter *iter,
		int status);


/*
 * Channel iteration over an nvme_bdev; same pattern as the nvme_ctrlr
 * variant above.
 */
typedef void (*nvme_bdev_for_each_channel_msg)(struct nvme_bdev_channel_iter *iter,
		struct nvme_bdev *nbdev,
		struct nvme_bdev_channel *nbdev_ch,
		void *ctx);

typedef void (*nvme_bdev_for_each_channel_done)(struct nvme_bdev *nbdev,
		void *ctx, int status);

void nvme_bdev_for_each_channel(struct nvme_bdev *nbdev,
				nvme_bdev_for_each_channel_msg fn, void *ctx,
				nvme_bdev_for_each_channel_done cpl);

void nvme_bdev_for_each_channel_continue(struct nvme_bdev_channel_iter *iter,
		int status);

/* Find the member controller with the given controller ID (CNTLID). */
struct nvme_ctrlr *nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr,
		uint16_t cntlid);

/* Look up an nvme_bdev_ctrlr by name. */
struct nvme_bdev_ctrlr *nvme_bdev_ctrlr_get_by_name(const char *name);

/* Invoke fn for every nvme_bdev_ctrlr in g_nvme_bdev_ctrlrs. */
typedef void (*nvme_bdev_ctrlr_for_each_fn)(struct nvme_bdev_ctrlr *nbdev_ctrlr, void *ctx);

void nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx);

/* JSON dump helpers for transport IDs and controller state. */
void nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid,
			      struct spdk_json_write_ctx *w);

void nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr);

/* Namespace lookup and active-namespace iteration on a controller. */
struct nvme_ns *nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid);
struct nvme_ns *nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr);
struct nvme_ns *nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns);
284 
/* What to do when an I/O exceeds timeout_us / timeout_admin_us. */
enum spdk_bdev_timeout_action {
	SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE = 0,
	SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET,
	SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT,
};
290 
/*
 * Module-wide options, set via the bdev_nvme_set_options RPC before any
 * controller is attached. Field units are encoded in the names
 * (_us/_ms/_sec).
 */
struct spdk_bdev_nvme_opts {
	enum spdk_bdev_timeout_action action_on_timeout;
	/* I/O and admin command timeouts; 0 presumably disables — TODO confirm. */
	uint64_t timeout_us;
	uint64_t timeout_admin_us;
	uint32_t keep_alive_timeout_ms;
	/* The number of attempts per I/O in the transport layer before an I/O fails. */
	uint32_t transport_retry_count;
	/* Arbitration / weighted round robin settings. */
	uint32_t arbitration_burst;
	uint32_t low_priority_weight;
	uint32_t medium_priority_weight;
	uint32_t high_priority_weight;
	/* Polling periods for the admin and I/O queues. */
	uint64_t nvme_adminq_poll_period_us;
	uint64_t nvme_ioq_poll_period_us;
	uint32_t io_queue_requests;
	bool delay_cmd_submit;
	/* The number of attempts per I/O in the bdev layer before an I/O fails. */
	int32_t bdev_retry_count;
	uint8_t transport_ack_timeout;
	/* Reconnect/failover tuning; see also nvme_ctrlr flags above. */
	int32_t ctrlr_loss_timeout_sec;
	uint32_t reconnect_delay_sec;
	uint32_t fast_io_fail_timeout_sec;
	bool disable_auto_failback;
	/* Generate bdev UUIDs when the device does not report one —
	 * TODO confirm. */
	bool generate_uuids;
	/* Type of Service - RDMA only */
	uint8_t transport_tos;
	/* Enable allocation of nvme_error_stat counters per bdev. */
	bool nvme_error_stat;
	/* RDMA shared receive queue / completion queue sizing. */
	uint32_t rdma_srq_size;
	/* Enable per-path spdk_bdev_io_stat allocation (see struct nvme_ns). */
	bool io_path_stat;
	bool allow_accel_sequence;
	uint32_t rdma_max_cq_size;
	uint16_t rdma_cm_event_timeout_ms;
	/* Allowed DH-HMAC-CHAP digest/DH-group bitmasks — TODO confirm
	 * bit meanings. */
	uint32_t dhchap_digests;
	uint32_t dhchap_dhgroups;
};
325 
/* Return the NVMe qpair backing the given controller I/O channel. */
struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch);
/* Get/set the module-wide options; set must happen before controllers are
 * attached — TODO confirm enforcement. */
void bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts);
int bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts);
/* Enable/disable PCIe hotplug monitoring with the given poll period. */
int bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx);

/*
 * Start/stop attaching controllers found via an NVMe-oF discovery service.
 * cb_fn is invoked when the initial discovery completes (status 0 on
 * success); from_mdns marks discovery entries created by the mDNS layer.
 */
int bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, const char *base_name,
			      struct spdk_nvme_ctrlr_opts *drv_opts, struct spdk_bdev_nvme_ctrlr_opts *bdev_opts,
			      uint64_t timeout, bool from_mdns,
			      spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx);
int bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn,
			     void *cb_ctx);
void bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w);

/* mDNS-based automatic discovery of discovery services. */
int bdev_nvme_start_mdns_discovery(const char *base_name,
				   const char *svcname,
				   struct spdk_nvme_ctrlr_opts *drv_opts,
				   struct spdk_bdev_nvme_ctrlr_opts *bdev_opts);
int bdev_nvme_stop_mdns_discovery(const char *name);
void bdev_nvme_get_mdns_discovery_info(struct spdk_jsonrpc_request *request);
void bdev_nvme_mdns_discovery_config_json(struct spdk_json_write_ctx *w);

/* Completion callback for bdev_nvme_set_keys(); status is 0 on success. */
typedef void (*bdev_nvme_set_keys_cb)(void *ctx, int status);

/* Update the DH-HMAC-CHAP keys of the named controller; NULL key names
 * presumably clear the corresponding key — TODO confirm. */
int bdev_nvme_set_keys(const char *name, const char *dhchap_key, const char *dhchap_ctrlr_key,
		       bdev_nvme_set_keys_cb cb_fn, void *cb_ctx);

/* Return the spdk_nvme_ctrlr underlying the given bdev. */
struct spdk_nvme_ctrlr *bdev_nvme_get_ctrlr(struct spdk_bdev *bdev);

/* Completion callback for bdev_nvme_delete(); rc is 0 on success. */
typedef void (*bdev_nvme_delete_done_fn)(void *ctx, int rc);
355 
/**
 * Delete NVMe controller with all bdevs on top of it, or delete the specified path
 * if there is any alternative path. Requires to pass name of NVMe controller.
 *
 * \param name NVMe controller name
 * \param path_id The specified path to remove (optional)
 * \param delete_done Callback function on delete complete (optional)
 * \param delete_done_ctx Context passed to callback (optional)
 * \return zero on success,
 *		-EINVAL on wrong parameters or
 *		-ENODEV if controller is not found or
 *		-ENOMEM on no memory
 */
int bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id,
		     bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx);

/* Operations accepted by nvme_ctrlr_op_rpc()/nvme_bdev_ctrlr_op_rpc(). */
enum nvme_ctrlr_op {
	NVME_CTRLR_OP_RESET = 1,
	NVME_CTRLR_OP_ENABLE,
	NVME_CTRLR_OP_DISABLE,
};
377 
/**
 * Perform specified operation on an NVMe controller.
 *
 * NOTE: The callback function is always called after this function returns except for
 * out of memory cases.
 *
 * \param nvme_ctrlr The specified NVMe controller to operate
 * \param op Operation code
 * \param cb_fn Function to be called back after operation completes
 * \param cb_arg Argument for callback function
 */
void nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op,
		       bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg);

/**
 * Perform specified operation on all NVMe controllers in an NVMe bdev controller.
 *
 * NOTE: The callback function is always called after this function returns except for
 * out of memory cases.
 *
 * \param nbdev_ctrlr The specified NVMe bdev controller to operate
 * \param op Operation code
 * \param cb_fn Function to be called back after operation completes
 * \param cb_arg Argument for callback function
 */
void nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op,
			    bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg);

/* Completion callback for bdev_nvme_set_preferred_path(); rc is 0 on success. */
typedef void (*bdev_nvme_set_preferred_path_cb)(void *cb_arg, int rc);

/**
 * Set the preferred I/O path for an NVMe bdev in multipath mode.
 *
 * NOTE: This function does not support NVMe bdevs in failover mode.
 *
 * \param name NVMe bdev name
 * \param cntlid NVMe-oF controller ID
 * \param cb_fn Function to be called back after completion.
 * \param cb_arg Argument for callback function.
 */
void bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid,
				  bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg);
420 
421 #endif /* SPDK_BDEV_NVME_H */
422