xref: /spdk/module/bdev/nvme/bdev_nvme.h (revision 838e61c3772fdefb17e1a0b8f9880e2bcb9c4c0d)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2016 Intel Corporation. All rights reserved.
3  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
4  *   Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5  *   Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
6  */
7 
8 #ifndef SPDK_BDEV_NVME_H
9 #define SPDK_BDEV_NVME_H
10 
11 #include "spdk/stdinc.h"
12 
13 #include "spdk/queue.h"
14 #include "spdk/nvme.h"
15 #include "spdk/bdev_module.h"
16 #include "spdk/jsonrpc.h"
17 
/* Global list of all nvme_bdev_ctrlr objects known to the module.
 * NOTE(review): presumably guarded by g_bdev_nvme_mutex — confirm in bdev_nvme.c. */
TAILQ_HEAD(nvme_bdev_ctrlrs, nvme_bdev_ctrlr);
extern struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs;
/* Mutex shared across the module for protecting global bdev_nvme state. */
extern pthread_mutex_t g_bdev_nvme_mutex;
/* Set when module finish has been requested; used to reject late operations. */
extern bool g_bdev_nvme_module_finish;
/* The SPDK thread on which the bdev_nvme module was initialized. */
extern struct spdk_thread *g_bdev_nvme_init_thread;

/* Upper bound on the number of NVMe controllers this module manages. */
#define NVME_MAX_CONTROLLERS 1024
25 
/* Multipath policy applied to an NVMe bdev that has multiple I/O paths. */
enum bdev_nvme_multipath_policy {
	/* I/O is sent to a single active path; the others are standby. */
	BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE,
	/* I/O is distributed across all active paths (see selector below). */
	BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE,
};
30 
/* Path selection algorithm used when the multipath policy is active-active.
 * Values start at 1 so that 0 can mean "unset". */
enum bdev_nvme_multipath_selector {
	/* Rotate between paths after rr_min_io I/Os on each. */
	BDEV_NVME_MP_SELECTOR_ROUND_ROBIN = 1,
	/* Prefer the path with the smallest outstanding queue depth. */
	BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH,
};
35 
/* Completion callback for bdev_nvme_create(): bdev_count is the number of
 * bdevs created, rc is 0 on success or a negated errno. */
typedef void (*spdk_bdev_create_nvme_fn)(void *ctx, size_t bdev_count, int rc);
/* Completion callback for bdev_nvme_start_discovery(). */
typedef void (*spdk_bdev_nvme_start_discovery_fn)(void *ctx, int status);
/* Completion callback for bdev_nvme_stop_discovery(). */
typedef void (*spdk_bdev_nvme_stop_discovery_fn)(void *ctx);
39 
/* Per-controller options owned by the bdev layer (as opposed to
 * spdk_nvme_ctrlr_opts, which belongs to the NVMe driver). */
struct nvme_ctrlr_opts {
	/* Protection information check flags (DIF/DIX). */
	uint32_t prchk_flags;
	/* Seconds to keep trying to reconnect before deleting the controller;
	 * NOTE(review): presumably negative means retry forever — confirm. */
	int32_t ctrlr_loss_timeout_sec;
	/* Delay in seconds between reconnect attempts. */
	uint32_t reconnect_delay_sec;
	/* Seconds after which queued I/O fails fast while disconnected. */
	uint32_t fast_io_fail_timeout_sec;
	/* True if this controller was attached via the discovery service. */
	bool from_discovery_service;
};
47 
/* Context tracking one asynchronous controller attach (bdev_nvme_create).
 * Lives until the controller is attached and all namespaces are populated,
 * at which point cb_fn is invoked. */
struct nvme_async_probe_ctx {
	struct spdk_nvme_probe_ctx *probe_ctx;
	/* Base name used to derive bdev names for discovered namespaces. */
	const char *base_name;
	/* Output array receiving the created bdev names; count is its capacity. */
	const char **names;
	uint32_t count;
	/* Poller that drives the async probe state machine. */
	struct spdk_poller *poller;
	struct spdk_nvme_transport_id trid;
	/* bdev-layer options (see struct nvme_ctrlr_opts). */
	struct nvme_ctrlr_opts bdev_opts;
	/* NVMe driver options passed to the attach. */
	struct spdk_nvme_ctrlr_opts drv_opts;
	spdk_bdev_create_nvme_fn cb_fn;
	void *cb_ctx;
	/* Number of namespace populations still outstanding. */
	uint32_t populates_in_progress;
	bool ctrlr_attached;
	bool probe_done;
	bool namespaces_populated;
};
64 
/* Per-namespace state for one namespace of one controller. A namespace is
 * linked both into its controller's RB-tree and, when exposed, to an
 * nvme_bdev (which may aggregate the same namespace from multiple
 * controllers in multipath mode). */
struct nvme_ns {
	uint32_t			id;		/* Namespace ID (NSID). */
	struct spdk_nvme_ns		*ns;		/* Driver-level namespace handle. */
	struct nvme_ctrlr		*ctrlr;		/* Owning controller. */
	struct nvme_bdev		*bdev;		/* Bdev exposing this namespace, if any. */
	uint32_t			ana_group_id;	/* ANA group this namespace belongs to. */
	enum spdk_nvme_ana_state	ana_state;	/* Last known ANA state. */
	bool				ana_state_updating;
	/* Set when an ANA transition exceeded the ANATT limit. */
	bool				ana_transition_timedout;
	struct spdk_poller		*anatt_timer;	/* ANA transition timeout timer. */
	struct nvme_async_probe_ctx	*probe_ctx;
	TAILQ_ENTRY(nvme_ns)		tailq;
	RB_ENTRY(nvme_ns)		node;		/* Entry in nvme_ctrlr::namespaces. */

	/**
	 * Records I/O path statistics before the path is destroyed.
	 * Allocation of stat is controlled by the io_path_stat option of the
	 * bdev_nvme_set_options RPC.
	 */
	struct spdk_bdev_io_stat	*stat;
};
86 
/* Forward declarations; full definitions live in bdev_nvme.c or below. */
struct nvme_bdev_io;
struct nvme_bdev_ctrlr;
struct nvme_bdev;
struct nvme_io_path;
91 
/* One transport path (transport ID + host ID) to a controller. A controller
 * keeps a list of these for failover; the head of nvme_ctrlr::trids is the
 * path currently in use. */
struct nvme_path_id {
	struct spdk_nvme_transport_id		trid;
	struct spdk_nvme_host_id		hostid;
	TAILQ_ENTRY(nvme_path_id)		link;
	/* Set when connecting through this path failed. */
	bool					is_failed;
};
98 
/* Callback invoked when a controller reset sequence completes. */
typedef void (*bdev_nvme_reset_cb)(void *cb_arg, bool success);
/* Callback invoked once the controller has been fully disconnected. */
typedef void (*nvme_ctrlr_disconnected_cb)(struct nvme_ctrlr *nvme_ctrlr);
101 
/* bdev-layer state wrapping one attached spdk_nvme_ctrlr. Tracks the
 * namespaces it exposes, its failover paths, and the reset/reconnect state
 * machine. Reference counted; freed when ref drops to zero and destruct is
 * set. */
struct nvme_ctrlr {
	/* Handle to the underlying NVMe driver controller object.
	 * NOTE(review): the previous comment here described a pinned 4KB
	 * IDENTIFY buffer, which does not match a driver-handle pointer —
	 * it appears to be stale. */
	struct spdk_nvme_ctrlr			*ctrlr;
	/* Path currently used to reach the controller (head of trids). */
	struct nvme_path_id			*active_path_id;
	/* Reference count; presumably protected by mutex — confirm in .c. */
	int					ref;

	/* State-machine flags. */
	uint32_t				resetting : 1;
	uint32_t				reconnect_is_delayed : 1;
	uint32_t				fast_io_fail_timedout : 1;
	uint32_t				destruct : 1;
	uint32_t				ana_log_page_updating : 1;
	uint32_t				io_path_cache_clearing : 1;
	uint32_t				dont_retry : 1;

	struct nvme_ctrlr_opts			opts;

	/* Namespaces keyed by NSID. */
	RB_HEAD(nvme_ns_tree, nvme_ns)		namespaces;

	/* Opal (TCG self-encrypting drive) handle, if the drive supports it. */
	struct spdk_opal_dev			*opal_dev;

	struct spdk_poller			*adminq_timer_poller;
	/* Thread that owns this controller's state machine. */
	struct spdk_thread			*thread;

	bdev_nvme_reset_cb			reset_cb_fn;
	void					*reset_cb_arg;
	/* Poller used to check for reset/detach completion */
	struct spdk_poller			*reset_detach_poller;
	struct spdk_nvme_detach_ctx		*detach_ctx;

	/* Timestamp (ticks) when the current reset started. */
	uint64_t				reset_start_tsc;
	struct spdk_poller			*reconnect_delay_timer;

	nvme_ctrlr_disconnected_cb		disconnected_cb;

	/** linked list pointer for device list */
	TAILQ_ENTRY(nvme_ctrlr)			tailq;
	/* Aggregating parent when multiple controllers expose the same subsystem. */
	struct nvme_bdev_ctrlr			*nbdev_ctrlr;

	/* All known transport paths; the head is the active one. */
	TAILQ_HEAD(nvme_paths, nvme_path_id)	trids;

	/* ANA log page buffer and a scratch copy of one group descriptor. */
	uint32_t				max_ana_log_page_size;
	struct spdk_nvme_ana_page		*ana_log_page;
	struct spdk_nvme_ana_group_descriptor	*copied_ana_desc;

	struct nvme_async_probe_ctx		*probe_ctx;

	/* Protects this structure's mutable state. */
	pthread_mutex_t				mutex;
};
154 
/* Aggregates one or more nvme_ctrlr objects that reach the same NVM
 * subsystem (multipath), together with the bdevs built on top of them. */
struct nvme_bdev_ctrlr {
	char				*name;	/* User-visible controller name. */
	TAILQ_HEAD(, nvme_ctrlr)	ctrlrs;
	TAILQ_HEAD(, nvme_bdev)		bdevs;
	TAILQ_ENTRY(nvme_bdev_ctrlr)	tailq;	/* Entry in g_nvme_bdev_ctrlrs. */
};
161 
/* Counters of NVMe completion errors, allocated only when the
 * nvme_error_stat option is enabled.
 * NOTE(review): presumably indexed by NVMe Status Code Type and Status Code
 * ([sct][sc]); confirm against the update sites in bdev_nvme.c, since the
 * first array has 8 entries but the second only 4 rows. */
struct nvme_error_stat {
	uint32_t status_type[8];
	uint32_t status[4][256];
};
166 
/* One exposed block device. In multipath mode it aggregates the same
 * namespace (nsid) from every controller under nbdev_ctrlr via
 * nvme_ns_list. */
struct nvme_bdev {
	struct spdk_bdev		disk;		/* Embedded generic bdev; must stay first for containerof-style use — confirm. */
	uint32_t			nsid;		/* Namespace ID this bdev represents. */
	struct nvme_bdev_ctrlr		*nbdev_ctrlr;
	pthread_mutex_t			mutex;		/* Protects ref and nvme_ns_list. NOTE(review): confirm coverage in .c. */
	int				ref;
	enum bdev_nvme_multipath_policy	mp_policy;
	enum bdev_nvme_multipath_selector mp_selector;
	/* I/Os per path before switching, for the round-robin selector. */
	uint32_t			rr_min_io;
	TAILQ_HEAD(, nvme_ns)		nvme_ns_list;
	bool				opal;		/* True if backed by an Opal-capable drive. */
	TAILQ_ENTRY(nvme_bdev)		tailq;		/* Entry in nvme_bdev_ctrlr::bdevs. */
	/* Allocated only when the nvme_error_stat option is enabled. */
	struct nvme_error_stat		*err_stat;
};
181 
/* Per-channel I/O queue pair for one controller, owned by an
 * nvme_ctrlr_channel and polled as part of an nvme_poll_group. */
struct nvme_qpair {
	struct nvme_ctrlr		*ctrlr;
	struct spdk_nvme_qpair		*qpair;		/* Driver-level queue pair. */
	struct nvme_poll_group		*group;
	struct nvme_ctrlr_channel	*ctrlr_ch;

	/* The following is used to update io_path cache of nvme_bdev_channels. */
	TAILQ_HEAD(, nvme_io_path)	io_path_list;

	TAILQ_ENTRY(nvme_qpair)		tailq;		/* Entry in nvme_poll_group::qpair_list. */
};
193 
/* Per-thread I/O channel state for one nvme_ctrlr. */
struct nvme_ctrlr_channel {
	struct nvme_qpair		*qpair;
	/* I/Os parked here while the controller is being reset. */
	TAILQ_HEAD(, spdk_bdev_io)	pending_resets;

	/* In-progress for_each_channel iteration driving a reset, if any. */
	struct spdk_io_channel_iter	*reset_iter;
};
200 
/* One (namespace, qpair) pair a bdev channel can submit I/O through.
 * A multipath bdev channel holds several of these in io_path_list. */
struct nvme_io_path {
	struct nvme_ns			*nvme_ns;
	struct nvme_qpair		*qpair;
	STAILQ_ENTRY(nvme_io_path)	stailq;		/* Entry in nvme_bdev_channel::io_path_list. */

	/* The following are used to update io_path cache of the nvme_bdev_channel. */
	struct nvme_bdev_channel	*nbdev_ch;
	TAILQ_ENTRY(nvme_io_path)	tailq;		/* Entry in nvme_qpair::io_path_list. */

	/* allocation of stat is decided by option io_path_stat of RPC bdev_nvme_set_options */
	struct spdk_bdev_io_stat	*stat;
};
213 
/* Per-thread I/O channel state for one nvme_bdev; selects among its
 * io_path_list according to mp_policy/mp_selector. */
struct nvme_bdev_channel {
	/* Cached preferred path; cleared when paths change. */
	struct nvme_io_path			*current_io_path;
	enum bdev_nvme_multipath_policy		mp_policy;
	enum bdev_nvme_multipath_selector	mp_selector;
	uint32_t				rr_min_io;
	/* I/Os submitted on the current path since the last round-robin switch. */
	uint32_t				rr_counter;
	STAILQ_HEAD(, nvme_io_path)		io_path_list;
	/* I/Os queued for retry, drained by retry_io_poller. */
	TAILQ_HEAD(retry_io_head, spdk_bdev_io)	retry_io_list;
	struct spdk_poller			*retry_io_poller;
};
224 
/* Per-thread poll group aggregating all nvme_qpairs polled on that thread. */
struct nvme_poll_group {
	struct spdk_nvme_poll_group		*group;
	/* Channel to the accel framework (e.g. for data digest offload) —
	 * NOTE(review): confirm exact use in bdev_nvme.c. */
	struct spdk_io_channel			*accel_channel;
	struct spdk_poller			*poller;
	/* Spin-time accounting, enabled by collect_spin_stat. */
	bool					collect_spin_stat;
	uint64_t				spin_ticks;
	uint64_t				start_ticks;
	uint64_t				end_ticks;
	TAILQ_HEAD(, nvme_qpair)		qpair_list;
};
235 
/* Write JSON describing one I/O path. */
void nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path);

/* Look up an attached controller by name; NULL if not found — confirm return convention in .c. */
struct nvme_ctrlr *nvme_ctrlr_get_by_name(const char *name);

/* Look up an aggregated (multipath) controller by name. */
struct nvme_bdev_ctrlr *nvme_bdev_ctrlr_get_by_name(const char *name);

typedef void (*nvme_bdev_ctrlr_for_each_fn)(struct nvme_bdev_ctrlr *nbdev_ctrlr, void *ctx);

/* Invoke fn for every nvme_bdev_ctrlr in the global list. */
void nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx);

/* Write a transport ID as JSON. */
void nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid,
			      struct spdk_json_write_ctx *w);

/* Write JSON describing one controller. */
void nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr);

/* Namespace lookup and iteration over a controller's active namespaces. */
struct nvme_ns *nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid);
struct nvme_ns *nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr);
struct nvme_ns *nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns);
254 
/* Action taken when an I/O exceeds timeout_us / timeout_admin_us. */
enum spdk_bdev_timeout_action {
	SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE = 0,
	SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET,
	SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT,
};
260 
/* Module-wide options, set via the bdev_nvme_set_options RPC before any
 * controller is attached. */
struct spdk_bdev_nvme_opts {
	enum spdk_bdev_timeout_action action_on_timeout;
	uint64_t timeout_us;		/* I/O timeout; 0 presumably disables — confirm. */
	uint64_t timeout_admin_us;	/* Admin command timeout. */
	uint32_t keep_alive_timeout_ms;
	/* The number of attempts per I/O in the transport layer before an I/O fails. */
	uint32_t transport_retry_count;
	/* Arbitration/priority weights passed to the controller. */
	uint32_t arbitration_burst;
	uint32_t low_priority_weight;
	uint32_t medium_priority_weight;
	uint32_t high_priority_weight;
	/* Polling periods for the admin queue and I/O queues. */
	uint64_t nvme_adminq_poll_period_us;
	uint64_t nvme_ioq_poll_period_us;
	uint32_t io_queue_requests;
	bool delay_cmd_submit;		/* Batch doorbell writes for efficiency. */
	/* The number of attempts per I/O in the bdev layer before an I/O fails. */
	int32_t bdev_retry_count;
	uint8_t transport_ack_timeout;
	/* Defaults for the per-controller reconnect parameters (see nvme_ctrlr_opts). */
	int32_t ctrlr_loss_timeout_sec;
	uint32_t reconnect_delay_sec;
	uint32_t fast_io_fail_timeout_sec;
	bool disable_auto_failback;
	/* Generate UUIDs for namespaces that do not report one. */
	bool generate_uuids;
	/* Type of Service - RDMA only */
	uint8_t transport_tos;
	bool nvme_error_stat;		/* Enable nvme_bdev::err_stat allocation. */
	uint32_t rdma_srq_size;		/* RDMA shared receive queue size. */
	bool io_path_stat;		/* Enable per-I/O-path statistics. */
};
290 
/* Return the driver qpair backing a controller I/O channel. */
struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch);
/* Copy the current module-wide options into *opts. */
void bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts);
/* Replace the module-wide options; returns 0 or a negated errno. */
int bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts);
/* Enable/disable PCIe hotplug monitoring with the given poll period. */
int bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx);

/* Fill *opts with default per-controller bdev-layer options. */
void bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts);
297 
/**
 * Attach the controller at trid and create bdevs for its namespaces.
 * Asynchronous: cb_fn is called with the number of bdevs created (their
 * names are written into names, up to count entries).
 *
 * \param trid Transport ID of the controller to attach.
 * \param base_name Base for the generated bdev names.
 * \param names Output array for created bdev names; count is its capacity.
 * \param cb_fn/cb_ctx Completion callback and its argument.
 * \param drv_opts NVMe driver options (may be NULL — confirm in .c).
 * \param bdev_opts bdev-layer options (may be NULL — confirm in .c).
 * \param multipath If true, add as a new path when the subsystem is already attached.
 * \return zero if the attach was started, negated errno otherwise.
 */
int bdev_nvme_create(struct spdk_nvme_transport_id *trid,
		     const char *base_name,
		     const char **names,
		     uint32_t count,
		     spdk_bdev_create_nvme_fn cb_fn,
		     void *cb_ctx,
		     struct spdk_nvme_ctrlr_opts *drv_opts,
		     struct nvme_ctrlr_opts *bdev_opts,
		     bool multipath);
307 
/* Start a discovery service connected to trid; controllers found in the
 * discovery log are attached automatically. timeout bounds the initial
 * attach; from_mdns marks services created by the mDNS responder. */
int bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, const char *base_name,
			      struct spdk_nvme_ctrlr_opts *drv_opts, struct nvme_ctrlr_opts *bdev_opts,
			      uint64_t timeout, bool from_mdns,
			      spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx);
/* Stop the named discovery service; cb_fn fires when teardown completes. */
int bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn,
			     void *cb_ctx);
/* Write JSON describing all running discovery services. */
void bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w);

/* mDNS-based automatic discovery of NVMe-oF subsystems advertising svcname. */
int bdev_nvme_start_mdns_discovery(const char *base_name,
				   const char *svcname,
				   struct spdk_nvme_ctrlr_opts *drv_opts,
				   struct nvme_ctrlr_opts *bdev_opts);
int bdev_nvme_stop_mdns_discovery(const char *name);
void bdev_nvme_get_mdns_discovery_info(struct spdk_jsonrpc_request *request);
void bdev_nvme_mdns_discovery_config_json(struct spdk_json_write_ctx *w);
323 
324 struct spdk_nvme_ctrlr *bdev_nvme_get_ctrlr(struct spdk_bdev *bdev);
325 
/**
 * Delete an NVMe controller together with all bdevs on top of it, or delete
 * only the specified path if an alternative path exists. The NVMe controller
 * is identified by name.
 *
 * \param name NVMe controller name
 * \param path_id The specified path to remove (optional)
 * \return zero on success, -EINVAL on wrong parameters or -ENODEV if controller is not found
 */
int bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id);
335 
/**
 * Reset NVMe controller.
 *
 * \param nvme_ctrlr The specified NVMe controller to reset
 * \param cb_fn Function to be called back after reset completes
 * \param cb_arg Argument for callback function
 * \return zero on success. Negated errno on the following error conditions:
 * -ENXIO: controller is being destroyed.
 * -EBUSY: controller is already being reset.
 */
int bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg);
347 
/* Completion callback for bdev_nvme_set_preferred_path(); rc is 0 on success
 * or a negated errno. */
typedef void (*bdev_nvme_set_preferred_path_cb)(void *cb_arg, int rc);

/**
 * Set the preferred I/O path for an NVMe bdev in multipath mode.
 *
 * NOTE: This function does not support NVMe bdevs in failover mode.
 *
 * \param name NVMe bdev name
 * \param cntlid NVMe-oF controller ID
 * \param cb_fn Function to be called back after completion.
 * \param cb_arg Argument for callback function.
 */
void bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid,
				  bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg);
362 
/* Completion callback for bdev_nvme_set_multipath_policy(); rc is 0 on
 * success or a negated errno. */
typedef void (*bdev_nvme_set_multipath_policy_cb)(void *cb_arg, int rc);

/**
 * Set multipath policy of the NVMe bdev.
 *
 * \param name NVMe bdev name
 * \param policy Multipath policy (active-passive or active-active)
 * \param selector Multipath selector (round_robin, queue_depth)
 * \param rr_min_io Number of IO to route to a path before switching to another for round-robin
 * \param cb_fn Function to be called back after completion.
 * \param cb_arg Argument for callback function.
 */
void bdev_nvme_set_multipath_policy(const char *name,
				    enum bdev_nvme_multipath_policy policy,
				    enum bdev_nvme_multipath_selector selector,
				    uint32_t rr_min_io,
				    bdev_nvme_set_multipath_policy_cb cb_fn,
				    void *cb_arg);
380 
381 #endif /* SPDK_BDEV_NVME_H */
382