/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2016 Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *   Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *   Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
 */

#ifndef SPDK_BDEV_NVME_H
#define SPDK_BDEV_NVME_H

#include "spdk/stdinc.h"

#include "spdk/queue.h"
#include "spdk/nvme.h"
#include "spdk/bdev_module.h"
#include "spdk/jsonrpc.h"

TAILQ_HEAD(nvme_bdev_ctrlrs, nvme_bdev_ctrlr);
extern struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs;
extern pthread_mutex_t g_bdev_nvme_mutex;
extern bool g_bdev_nvme_module_finish;
extern struct spdk_thread *g_bdev_nvme_init_thread;

#define NVME_MAX_CONTROLLERS 1024

enum bdev_nvme_multipath_policy {
	BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE,
	BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE,
};

enum bdev_nvme_multipath_selector {
	BDEV_NVME_MP_SELECTOR_ROUND_ROBIN = 1,
	BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH,
};

typedef void (*spdk_bdev_create_nvme_fn)(void *ctx, size_t bdev_count, int rc);
typedef void (*spdk_bdev_nvme_start_discovery_fn)(void *ctx, int status);
typedef void (*spdk_bdev_nvme_stop_discovery_fn)(void *ctx);

struct nvme_ctrlr_opts {
	uint32_t prchk_flags;
	int32_t ctrlr_loss_timeout_sec;
	uint32_t reconnect_delay_sec;
	uint32_t fast_io_fail_timeout_sec;
	bool from_discovery_service;
	/* Name of the PSK or path to the file containing the PSK. */
	char psk[PATH_MAX];
	const char *dhchap_key;
	const char *dhchap_ctrlr_key;
};
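
/*
 * Example (illustrative sketch only): a typical way to populate nvme_ctrlr_opts
 * is to start from the defaults and override individual fields. The timeout
 * values below are arbitrary illustration values.
 *
 *   struct nvme_ctrlr_opts opts;
 *
 *   bdev_nvme_get_default_ctrlr_opts(&opts);
 *   opts.ctrlr_loss_timeout_sec = 30;
 *   opts.reconnect_delay_sec = 5;
 *   opts.fast_io_fail_timeout_sec = 10;
 */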

struct nvme_async_probe_ctx {
	struct spdk_nvme_probe_ctx *probe_ctx;
	const char *base_name;
	const char **names;
	uint32_t max_bdevs;
	uint32_t reported_bdevs;
	struct spdk_poller *poller;
	struct spdk_nvme_transport_id trid;
	struct nvme_ctrlr_opts bdev_opts;
	struct spdk_nvme_ctrlr_opts drv_opts;
	spdk_bdev_create_nvme_fn cb_fn;
	void *cb_ctx;
	uint32_t populates_in_progress;
	bool ctrlr_attached;
	bool probe_done;
	bool namespaces_populated;
};

struct nvme_ns {
	uint32_t			id;
	struct spdk_nvme_ns		*ns;
	struct nvme_ctrlr		*ctrlr;
	struct nvme_bdev		*bdev;
	uint32_t			ana_group_id;
	enum spdk_nvme_ana_state	ana_state;
	bool				ana_state_updating;
	bool				ana_transition_timedout;
	struct spdk_poller		*anatt_timer;
	struct nvme_async_probe_ctx	*probe_ctx;
	TAILQ_ENTRY(nvme_ns)		tailq;
	RB_ENTRY(nvme_ns)		node;

	/**
	 * Records I/O path statistics for this namespace before it is
	 * destroyed. Whether stat is allocated is controlled by the
	 * io_path_stat option of the bdev_nvme_set_options RPC.
	 */
	struct spdk_bdev_io_stat	*stat;
};

struct nvme_bdev_io;
struct nvme_bdev_ctrlr;
struct nvme_bdev;
struct nvme_io_path;

struct nvme_path_id {
	struct spdk_nvme_transport_id		trid;
	struct spdk_nvme_host_id		hostid;
	TAILQ_ENTRY(nvme_path_id)		link;
	uint64_t				last_failed_tsc;
};

typedef void (*bdev_nvme_ctrlr_op_cb)(void *cb_arg, int rc);
typedef void (*nvme_ctrlr_disconnected_cb)(struct nvme_ctrlr *nvme_ctrlr);

struct nvme_ctrlr {
	/**
	 * Points to a pinned, physically contiguous memory region that
	 * contains the 4KB IDENTIFY structure for the controller that is
	 * the target of the CONTROLLER IDENTIFY command during initialization.
	 */
	struct spdk_nvme_ctrlr			*ctrlr;
	struct nvme_path_id			*active_path_id;
	int					ref;

	uint32_t				resetting : 1;
	uint32_t				reconnect_is_delayed : 1;
	uint32_t				in_failover : 1;
	uint32_t				pending_failover : 1;
	uint32_t				fast_io_fail_timedout : 1;
	uint32_t				destruct : 1;
	uint32_t				ana_log_page_updating : 1;
	uint32_t				io_path_cache_clearing : 1;
	uint32_t				dont_retry : 1;
	uint32_t				disabled : 1;

	struct nvme_ctrlr_opts			opts;

	RB_HEAD(nvme_ns_tree, nvme_ns)		namespaces;

	struct spdk_opal_dev			*opal_dev;

	struct spdk_poller			*adminq_timer_poller;
	struct spdk_thread			*thread;

	bdev_nvme_ctrlr_op_cb			ctrlr_op_cb_fn;
	void					*ctrlr_op_cb_arg;
	/* Poller used to check for reset/detach completion */
	struct spdk_poller			*reset_detach_poller;
	struct spdk_nvme_detach_ctx		*detach_ctx;

	uint64_t				reset_start_tsc;
	struct spdk_poller			*reconnect_delay_timer;

	nvme_ctrlr_disconnected_cb		disconnected_cb;

	/** linked list pointer for device list */
	TAILQ_ENTRY(nvme_ctrlr)			tailq;
	struct nvme_bdev_ctrlr			*nbdev_ctrlr;

	TAILQ_HEAD(nvme_paths, nvme_path_id)	trids;

	uint32_t				max_ana_log_page_size;
	struct spdk_nvme_ana_page		*ana_log_page;
	struct spdk_nvme_ana_group_descriptor	*copied_ana_desc;

	struct nvme_async_probe_ctx		*probe_ctx;
	struct spdk_key				*psk;
	struct spdk_key				*dhchap_key;
	struct spdk_key				*dhchap_ctrlr_key;

	pthread_mutex_t				mutex;
};

struct nvme_bdev_ctrlr {
	char				*name;
	TAILQ_HEAD(, nvme_ctrlr)	ctrlrs;
	TAILQ_HEAD(, nvme_bdev)		bdevs;
	TAILQ_ENTRY(nvme_bdev_ctrlr)	tailq;
};

/*
 * Per-bdev NVMe error counters. status_type is indexed by the 3-bit NVMe
 * Status Code Type (SCT); status is indexed by [SCT][SC] (Status Code) for
 * SCT values 0-3.
 */
struct nvme_error_stat {
	uint32_t status_type[8];
	uint32_t status[4][256];
};

struct nvme_bdev {
	struct spdk_bdev		disk;
	uint32_t			nsid;
	struct nvme_bdev_ctrlr		*nbdev_ctrlr;
	pthread_mutex_t			mutex;
	int				ref;
	enum bdev_nvme_multipath_policy	mp_policy;
	enum bdev_nvme_multipath_selector mp_selector;
	uint32_t			rr_min_io;
	TAILQ_HEAD(, nvme_ns)		nvme_ns_list;
	bool				opal;
	TAILQ_ENTRY(nvme_bdev)		tailq;
	struct nvme_error_stat		*err_stat;
};

struct nvme_qpair {
	struct nvme_ctrlr		*ctrlr;
	struct spdk_nvme_qpair		*qpair;
	struct nvme_poll_group		*group;
	struct nvme_ctrlr_channel	*ctrlr_ch;

	/* The following is used to update io_path cache of nvme_bdev_channels. */
	TAILQ_HEAD(, nvme_io_path)	io_path_list;

	TAILQ_ENTRY(nvme_qpair)		tailq;
};

struct nvme_ctrlr_channel {
	struct nvme_qpair		*qpair;
	TAILQ_HEAD(, nvme_bdev_io)	pending_resets;

	struct spdk_io_channel_iter	*reset_iter;
	struct spdk_poller		*connect_poller;
};

struct nvme_io_path {
	struct nvme_ns			*nvme_ns;
	struct nvme_qpair		*qpair;
	STAILQ_ENTRY(nvme_io_path)	stailq;

	/* The following are used to update io_path cache of the nvme_bdev_channel. */
	struct nvme_bdev_channel	*nbdev_ch;
	TAILQ_ENTRY(nvme_io_path)	tailq;

	/* Whether stat is allocated is controlled by the io_path_stat option
	 * of the bdev_nvme_set_options RPC. */
	struct spdk_bdev_io_stat	*stat;
};

struct nvme_bdev_channel {
	struct nvme_io_path			*current_io_path;
	enum bdev_nvme_multipath_policy		mp_policy;
	enum bdev_nvme_multipath_selector	mp_selector;
	uint32_t				rr_min_io;
	uint32_t				rr_counter;
	STAILQ_HEAD(, nvme_io_path)		io_path_list;
	TAILQ_HEAD(retry_io_head, nvme_bdev_io)	retry_io_list;
	struct spdk_poller			*retry_io_poller;
};

struct nvme_poll_group {
	struct spdk_nvme_poll_group		*group;
	struct spdk_io_channel			*accel_channel;
	struct spdk_poller			*poller;
	bool					collect_spin_stat;
	uint64_t				spin_ticks;
	uint64_t				start_ticks;
	uint64_t				end_ticks;
	TAILQ_HEAD(, nvme_qpair)		qpair_list;
};

void nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path);

struct nvme_ctrlr *nvme_ctrlr_get_by_name(const char *name);

struct nvme_ctrlr *nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr,
		uint16_t cntlid);

struct nvme_bdev_ctrlr *nvme_bdev_ctrlr_get_by_name(const char *name);

typedef void (*nvme_bdev_ctrlr_for_each_fn)(struct nvme_bdev_ctrlr *nbdev_ctrlr, void *ctx);

void nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx);
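
/*
 * Example (illustrative sketch only): visiting every NVMe bdev controller with
 * nvme_bdev_ctrlr_for_each(). The callback name and its body are hypothetical.
 *
 *   static void
 *   print_ctrlr_name(struct nvme_bdev_ctrlr *nbdev_ctrlr, void *ctx)
 *   {
 *       printf("nbdev_ctrlr: %s\n", nbdev_ctrlr->name);
 *   }
 *
 *   nvme_bdev_ctrlr_for_each(print_ctrlr_name, NULL);
 */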

void nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid,
			      struct spdk_json_write_ctx *w);

void nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr);

struct nvme_ns *nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid);
struct nvme_ns *nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr);
struct nvme_ns *nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns);
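
/*
 * Example (illustrative sketch only): iterating the active namespaces of a
 * controller with the getters above. Assumes nvme_ctrlr is a valid pointer,
 * e.g. obtained via nvme_ctrlr_get_by_name().
 *
 *   struct nvme_ns *ns;
 *
 *   for (ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); ns != NULL;
 *        ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, ns)) {
 *       printf("active nsid: %u\n", ns->id);
 *   }
 */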

enum spdk_bdev_timeout_action {
	SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE = 0,
	SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET,
	SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT,
};

struct spdk_bdev_nvme_opts {
	enum spdk_bdev_timeout_action action_on_timeout;
	uint64_t timeout_us;
	uint64_t timeout_admin_us;
	uint32_t keep_alive_timeout_ms;
	/* The number of attempts per I/O in the transport layer before an I/O fails. */
	uint32_t transport_retry_count;
	uint32_t arbitration_burst;
	uint32_t low_priority_weight;
	uint32_t medium_priority_weight;
	uint32_t high_priority_weight;
	uint64_t nvme_adminq_poll_period_us;
	uint64_t nvme_ioq_poll_period_us;
	uint32_t io_queue_requests;
	bool delay_cmd_submit;
	/* The number of attempts per I/O in the bdev layer before an I/O fails. */
	int32_t bdev_retry_count;
	uint8_t transport_ack_timeout;
	int32_t ctrlr_loss_timeout_sec;
	uint32_t reconnect_delay_sec;
	uint32_t fast_io_fail_timeout_sec;
	bool disable_auto_failback;
	bool generate_uuids;
	/* Type of Service - RDMA only */
	uint8_t transport_tos;
	bool nvme_error_stat;
	uint32_t rdma_srq_size;
	bool io_path_stat;
	bool allow_accel_sequence;
	uint32_t rdma_max_cq_size;
	uint16_t rdma_cm_event_timeout_ms;
	uint32_t dhchap_digests;
	uint32_t dhchap_dhgroups;
};

struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch);
void bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts);
int bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts);
int bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx);
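
/*
 * Example (illustrative sketch only): the usual read-modify-write pattern for
 * the module-level options. Reading the current options first preserves the
 * fields that are not being changed. The 10-second I/O timeout and the reset
 * action are arbitrary illustration values; rc should be checked by the caller.
 *
 *   struct spdk_bdev_nvme_opts opts;
 *   int rc;
 *
 *   bdev_nvme_get_opts(&opts);
 *   opts.timeout_us = 10 * 1000 * 1000;
 *   opts.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET;
 *   rc = bdev_nvme_set_opts(&opts);
 */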

void bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts);

int bdev_nvme_create(struct spdk_nvme_transport_id *trid,
		     const char *base_name,
		     const char **names,
		     uint32_t count,
		     spdk_bdev_create_nvme_fn cb_fn,
		     void *cb_ctx,
		     struct spdk_nvme_ctrlr_opts *drv_opts,
		     struct nvme_ctrlr_opts *bdev_opts,
		     bool multipath);
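
/*
 * Example (illustrative sketch only): attaching an NVMe-oF TCP controller and
 * creating bdevs for its namespaces. The address, subsystem NQN, names array
 * size, and attach_done callback are hypothetical; created bdev names are
 * reported to cb_fn.
 *
 *   static void
 *   attach_done(void *ctx, size_t bdev_count, int rc)
 *   {
 *       printf("created %zu bdevs, rc=%d\n", bdev_count, rc);
 *   }
 *
 *   struct spdk_nvme_transport_id trid = {};
 *   struct spdk_nvme_ctrlr_opts drv_opts;
 *   struct nvme_ctrlr_opts bdev_opts;
 *   const char *names[32];
 *
 *   spdk_nvme_ctrlr_get_default_ctrlr_opts(&drv_opts, sizeof(drv_opts));
 *   bdev_nvme_get_default_ctrlr_opts(&bdev_opts);
 *   spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_TCP);
 *   trid.adrfam = SPDK_NVMF_ADRFAM_IPV4;
 *   snprintf(trid.traddr, sizeof(trid.traddr), "127.0.0.1");
 *   snprintf(trid.trsvcid, sizeof(trid.trsvcid), "4420");
 *   snprintf(trid.subnqn, sizeof(trid.subnqn), "nqn.2016-06.io.spdk:cnode1");
 *
 *   bdev_nvme_create(&trid, "Nvme0", names, SPDK_COUNTOF(names), attach_done,
 *                    NULL, &drv_opts, &bdev_opts, true);
 */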

int bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, const char *base_name,
			      struct spdk_nvme_ctrlr_opts *drv_opts, struct nvme_ctrlr_opts *bdev_opts,
			      uint64_t timeout, bool from_mdns,
			      spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx);
int bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn,
			     void *cb_ctx);
void bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w);
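
/*
 * Example (illustrative sketch only): starting a discovery service named
 * "disc0" (hypothetical) against a discovery controller addressed by trid.
 * Controllers found in the discovery log page are attached and their bdevs
 * are prefixed with the base name. timeout 0 and from_mdns false are assumed
 * plausible defaults; trid, drv_opts, and bdev_opts are set up as in the
 * bdev_nvme_create() example above, and discovery_started_cb is user-supplied.
 *
 *   bdev_nvme_start_discovery(&trid, "disc0", &drv_opts, &bdev_opts,
 *                             0, false, discovery_started_cb, NULL);
 *
 * The service can later be torn down with
 * bdev_nvme_stop_discovery("disc0", discovery_stopped_cb, NULL).
 */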

int bdev_nvme_start_mdns_discovery(const char *base_name,
				   const char *svcname,
				   struct spdk_nvme_ctrlr_opts *drv_opts,
				   struct nvme_ctrlr_opts *bdev_opts);
int bdev_nvme_stop_mdns_discovery(const char *name);
void bdev_nvme_get_mdns_discovery_info(struct spdk_jsonrpc_request *request);
void bdev_nvme_mdns_discovery_config_json(struct spdk_json_write_ctx *w);

struct spdk_nvme_ctrlr *bdev_nvme_get_ctrlr(struct spdk_bdev *bdev);

typedef void (*bdev_nvme_delete_done_fn)(void *ctx, int rc);

/**
 * Delete an NVMe controller together with all bdevs on top of it, or delete
 * only the specified path if an alternative path exists. The name of the
 * NVMe controller must be provided.
 *
 * \param name NVMe controller name
 * \param path_id The specified path to remove (optional)
 * \param delete_done Callback function invoked when the delete completes (optional)
 * \param delete_done_ctx Context passed to the callback (optional)
 * \return zero on success,
 *		-EINVAL on invalid parameters,
 *		-ENODEV if the controller is not found, or
 *		-ENOMEM on memory allocation failure
 */
int bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id,
		     bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx);
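
/*
 * Example (illustrative sketch only): deleting controller "Nvme0" entirely.
 * A zeroed nvme_path_id is assumed here to act as a wildcard matching every
 * path; passing NULL callbacks makes the call fire-and-forget, though the
 * return code should still be checked.
 *
 *   struct nvme_path_id path_id = {};
 *   int rc;
 *
 *   rc = bdev_nvme_delete("Nvme0", &path_id, NULL, NULL);
 */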

enum nvme_ctrlr_op {
	NVME_CTRLR_OP_RESET = 1,
	NVME_CTRLR_OP_ENABLE,
	NVME_CTRLR_OP_DISABLE,
};

/**
 * Perform the specified operation on an NVMe controller.
 *
 * NOTE: The callback function is always called after this function returns,
 * except in out-of-memory cases.
 *
 * \param nvme_ctrlr The NVMe controller to operate on
 * \param op Operation code
 * \param cb_fn Function called back after the operation completes
 * \param cb_arg Argument for the callback function
 */
void nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op,
		       bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg);
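
/*
 * Example (illustrative sketch only): resetting a controller by name and
 * reporting the outcome. The callback name and body are hypothetical.
 *
 *   static void
 *   reset_done(void *cb_arg, int rc)
 *   {
 *       printf("reset completed, rc=%d\n", rc);
 *   }
 *
 *   struct nvme_ctrlr *nvme_ctrlr = nvme_ctrlr_get_by_name("Nvme0");
 *
 *   if (nvme_ctrlr != NULL) {
 *       nvme_ctrlr_op_rpc(nvme_ctrlr, NVME_CTRLR_OP_RESET, reset_done, NULL);
 *   }
 */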

/**
 * Perform the specified operation on all NVMe controllers in an NVMe bdev controller.
 *
 * NOTE: The callback function is always called after this function returns,
 * except in out-of-memory cases.
 *
 * \param nbdev_ctrlr The NVMe bdev controller to operate on
 * \param op Operation code
 * \param cb_fn Function called back after the operation completes
 * \param cb_arg Argument for the callback function
 */
void nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op,
			    bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg);

typedef void (*bdev_nvme_set_preferred_path_cb)(void *cb_arg, int rc);

/**
 * Set the preferred I/O path for an NVMe bdev in multipath mode.
 *
 * NOTE: This function does not support NVMe bdevs in failover mode.
 *
 * \param name NVMe bdev name
 * \param cntlid NVMe-oF controller ID
 * \param cb_fn Function to be called back after completion
 * \param cb_arg Argument for the callback function
 */
void bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid,
				  bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg);
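
/*
 * Example (illustrative sketch only): preferring the path through the
 * controller with cntlid 1 for bdev "Nvme0n1". The bdev name and cntlid are
 * hypothetical; set_path_done is a user-supplied callback matching
 * bdev_nvme_set_preferred_path_cb.
 *
 *   bdev_nvme_set_preferred_path("Nvme0n1", 1, set_path_done, NULL);
 */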

typedef void (*bdev_nvme_set_multipath_policy_cb)(void *cb_arg, int rc);

/**
 * Set the multipath policy of an NVMe bdev.
 *
 * \param name NVMe bdev name
 * \param policy Multipath policy (active-passive or active-active)
 * \param selector Multipath selector (round_robin or queue_depth)
 * \param rr_min_io Number of I/Os to route to a path before switching to another, for round-robin
 * \param cb_fn Function to be called back after completion
 * \param cb_arg Argument for the callback function
 */
void bdev_nvme_set_multipath_policy(const char *name,
				    enum bdev_nvme_multipath_policy policy,
				    enum bdev_nvme_multipath_selector selector,
				    uint32_t rr_min_io,
				    bdev_nvme_set_multipath_policy_cb cb_fn,
				    void *cb_arg);
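
/*
 * Example (illustrative sketch only): enabling active-active multipathing with
 * round-robin path selection for bdev "Nvme0n1", switching paths after every
 * 8 I/Os. The bdev name, rr_min_io value, and policy_done callback are
 * hypothetical.
 *
 *   bdev_nvme_set_multipath_policy("Nvme0n1",
 *                                  BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE,
 *                                  BDEV_NVME_MP_SELECTOR_ROUND_ROBIN,
 *                                  8, policy_done, NULL);
 */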

#endif /* SPDK_BDEV_NVME_H */