/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2024 Oxide Computer Company
 */

/*
 * This implements the general locking routines. See the big theory statement
 * section 'ioctls, Errors, and Exclusive Access' for more information.
 */

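/*
 * As a rough illustration of how these routines are driven, a userland
 * consumer fills in an nvme_ioctl_lock_t and issues the lock ioctl against
 * an open minor, for example to ask for a non-blocking controller write
 * lock:
 *
 *	nvme_ioctl_lock_t lock = { 0 };
 *	lock.nil_ent = NVME_LOCK_E_CTRL;
 *	lock.nil_level = NVME_LOCK_L_WRITE;
 *	lock.nil_flags = NVME_LOCK_F_DONT_BLOCK;
 *	(void) ioctl(fd, NVME_IOC_LOCK, &lock);
 *	if (lock.nil_common.nioc_drv_err != NVME_IOCTL_E_OK) {
 *		... e.g. NVME_IOCTL_E_LOCK_WOULD_BLOCK ...
 *	}
 *
 * This is only a sketch: the ioctl command name (NVME_IOC_LOCK), the open
 * file descriptor, and the error handling live outside of this file and are
 * assumptions here. The structure members, entities, levels, and flags shown
 * are the ones that nvme_rwlock() below actually processes.
 */
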
#include <sys/stddef.h>
#include <sys/nvme.h>

#include "nvme_reg.h"
#include "nvme_var.h"

/*
 * Do we have a writer or someone pending? Note, some cases require checking
 * both of these and others do not. Please see each individual check for the
 * nuance here. As a general rule of thumb, when locking, the pending writers
 * are important. However, when passing the lock on to the next owner (the
 * handoff functions below), pending writers are not checked.
 */
static boolean_t
nvme_rwlock_wr_or_pend(nvme_lock_t *lock)
{
	return (lock->nl_writer != NULL ||
	    list_is_empty(&lock->nl_pend_writers) == 0);
}

/*
 * Taking a namespace read lock requires that there is no writer (or pending
 * writer) on either the controller or the namespace.
 */
static boolean_t
nvme_rwlock_block_ns_rdlock(nvme_t *nvme, nvme_namespace_t *ns)
{
	return (nvme_rwlock_wr_or_pend(&nvme->n_lock) ||
	    nvme_rwlock_wr_or_pend(&ns->ns_lock));
}

/*
 * The following entities all block a namespace write lock from being taken:
 *
 * 1) Any active or pending writer on the controller lock. They block and
 *    starve namespace writers respectively.
 * 2) Any active or pending writers on the namespace lock. We must wait in
 *    line.
 * 3) Any active readers on the namespace lock. We ignore pending namespace
 *    readers because their presence implies that one of the conditions above
 *    already holds.
 */
static boolean_t
nvme_rwlock_block_ns_wrlock(nvme_t *nvme, nvme_namespace_t *ns)
{
	return (nvme_rwlock_wr_or_pend(&nvme->n_lock) ||
	    nvme_rwlock_wr_or_pend(&ns->ns_lock) ||
	    list_is_empty(&ns->ns_lock.nl_readers) == 0);
}

/*
 * The only thing that blocks acquisition of a controller read lock is if
 * there are outstanding or pending writers on the controller lock. We can
 * ignore the state of all namespaces here.
 */
static boolean_t
nvme_rwlock_block_ctrl_rdlock(nvme_t *nvme)
{
	return (nvme_rwlock_wr_or_pend(&nvme->n_lock));
}

/*
 * Taking the controller write lock is the most challenging of all, but it
 * also takes priority. The following all block a controller write lock from
 * being taken:
 *
 * 1) Any controller write lock holder or pending controller writer.
 * 2) Any controller read lock. We skip pending reads because if they exist,
 *    one of the other conditions that blocks us must already hold.
 * 3) Any namespace holding a write lock. We ignore pending namespace writes
 *    because by definition some other blocking condition causes them to be
 *    pending.
 * 4) Any read lock on a namespace. We ignore pending reads as in the
 *    controller case.
 */
static boolean_t
nvme_rwlock_block_ctrl_wrlock(nvme_t *nvme)
{
	if (nvme_rwlock_wr_or_pend(&nvme->n_lock) ||
	    list_is_empty(&nvme->n_lock.nl_readers) == 0) {
		return (B_TRUE);
	}

	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
		nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
		if (ns->ns_lock.nl_writer != NULL ||
		    list_is_empty(&ns->ns_lock.nl_readers) == 0) {
			return (B_TRUE);
		}
	}

	return (B_FALSE);
}

/*
 * Answer whether we can hand the world off to a pending controller write
 * lock. This has similar rules to the above; however, we critically _ignore_
 * pending controller write lock holds, as the assumption is that they are the
 * reason we are here. The only considerations from above are controller
 * reader locks and namespace locks.
 */
static boolean_t
nvme_rwlock_handoff_ctrl_wrlock(nvme_t *nvme)
{
	/* See nvme_rwlock_wakeup() for why this assertion holds. */
	ASSERT3P(nvme->n_lock.nl_writer, ==, NULL);

	if (list_is_empty(&nvme->n_lock.nl_readers) == 0) {
		return (B_FALSE);
	}

	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
		nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
		if (ns->ns_lock.nl_writer != NULL ||
		    list_is_empty(&ns->ns_lock.nl_readers) == 0) {
			return (B_FALSE);
		}
	}

	return (B_TRUE);
}

/*
 * Namespace handoff variant. It skips pending writers on the namespace lock,
 * but fully considers them on the controller due to their priority. Otherwise
 * this follows the same rules as the normal blocking check.
 */
static boolean_t
nvme_rwlock_handoff_ns_wrlock(nvme_t *nvme, nvme_namespace_t *ns)
{
	if (nvme_rwlock_wr_or_pend(&nvme->n_lock) ||
	    list_is_empty(&nvme->n_lock.nl_readers) == 0) {
		return (B_FALSE);
	}

	if (ns->ns_lock.nl_writer != NULL ||
	    list_is_empty(&ns->ns_lock.nl_readers) == 0) {
		return (B_FALSE);
	}

	return (B_TRUE);
}

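/*
 * Grant a read lock to the minor described by 'info'. The caller must already
 * have determined that nothing blocks the read lock; the assertions below
 * verify that there is no active or pending writer before we add this minor
 * to the lock's list of readers.
 */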
static void
nvme_rwlock_rdlock(nvme_minor_lock_info_t *info, nvme_lock_t *lock)
{
	ASSERT3U(list_is_empty(&lock->nl_pend_writers), !=, 0);
	ASSERT3P(lock->nl_writer, ==, NULL);
	ASSERT3U(info->nli_state, ==, NVME_LOCK_STATE_UNLOCKED);
	ASSERT3U(list_link_active(&info->nli_node), ==, 0);
	ASSERT3P(info->nli_minor, !=, NULL);
	ASSERT3P(info->nli_nvme, !=, NULL);
	ASSERT3U(info->nli_curlevel, ==, NVME_LOCK_L_READ);

	info->nli_state = NVME_LOCK_STATE_ACQUIRED;
	info->nli_last_change = gethrtime();
	info->nli_acq_kthread = (uintptr_t)curthread;
	info->nli_acq_pid = (uint32_t)curproc->p_pid;

	list_insert_tail(&lock->nl_readers, info);
	lock->nl_nread_locks++;
}

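/*
 * Grant the write lock to the minor described by 'info'. As with the read
 * path, the caller is responsible for having checked that the lock can be
 * handed out; here we simply record the new owner and update the minor's
 * tracking state.
 */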
static void
nvme_rwlock_wrlock(nvme_minor_lock_info_t *info, nvme_lock_t *lock)
{
	ASSERT3P(lock->nl_writer, ==, NULL);
	ASSERT3U(info->nli_state, ==, NVME_LOCK_STATE_UNLOCKED);
	ASSERT3U(list_link_active(&info->nli_node), ==, 0);
	ASSERT3P(info->nli_minor, !=, NULL);
	ASSERT3P(info->nli_nvme, !=, NULL);

	info->nli_state = NVME_LOCK_STATE_ACQUIRED;
	info->nli_curlevel = NVME_LOCK_L_WRITE;
	info->nli_last_change = gethrtime();
	info->nli_acq_kthread = (uintptr_t)curthread;
	info->nli_acq_pid = (uint32_t)curproc->p_pid;

	lock->nl_writer = info;
	lock->nl_nwrite_locks++;
}

#ifdef	DEBUG
/*
 * This is just a sanity check for our lock logic.
 */
static boolean_t
nvme_rwlock_is_reader(nvme_lock_t *lock, const nvme_minor_lock_info_t *info)
{
	for (nvme_minor_lock_info_t *i = list_head(&lock->nl_readers);
	    i != NULL; i = list_next(&lock->nl_readers, i)) {
		if (i == info) {
			return (B_TRUE);
		}
	}
	return (B_FALSE);
}
#endif

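/*
 * Wake up a single minor that was blocked waiting for a lock. Before
 * signalling it, we fill in the ioctl result it will return to userland:
 * success if it was granted the lock, otherwise the error describing why it
 * is being woken up.
 */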
static void
nvme_rwlock_signal_one(nvme_minor_lock_info_t *info, nvme_ioctl_errno_t err)
{
	ASSERT3P(info->nli_ioc, !=, NULL);
	ASSERT3P(info->nli_minor, !=, NULL);
	ASSERT3P(info->nli_state, !=, NVME_LOCK_STATE_BLOCKED);

	if (err == NVME_IOCTL_E_OK) {
		nvme_ioctl_success(info->nli_ioc);
	} else {
		(void) nvme_ioctl_error(info->nli_ioc, err, 0, 0);
	}

	cv_signal(&info->nli_minor->nm_cv);
}

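/*
 * Grant the lock to every pending reader, if there are any. The assertions
 * encode the conditions under which this may be called: no readers currently
 * hold the lock, there is no writer, and there are no pending writers.
 */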
static void
nvme_rwlock_wakeup_readers(nvme_lock_t *lock)
{
	nvme_minor_lock_info_t *info;

	if (list_is_empty(&lock->nl_pend_readers) != 0) {
		return;
	}

	ASSERT3U(list_is_empty(&lock->nl_readers), !=, 0);
	ASSERT3P(lock->nl_writer, ==, NULL);
	ASSERT3U(list_is_empty(&lock->nl_pend_writers), !=, 0);
	while ((info = list_remove_head(&lock->nl_pend_readers)) != NULL) {
		info->nli_state = NVME_LOCK_STATE_UNLOCKED;
		nvme_rwlock_rdlock(info, lock);
		nvme_rwlock_signal_one(info, NVME_IOCTL_E_OK);
	}
}

/*
 * An unlock occurred somewhere. We need to evaluate the total state of the
 * world. An unlock of a namespace can allow a controller lock to proceed. On
 * the other hand, dropping the controller write lock allows every namespace
 * to proceed. While we know the context of where the unlock occurred, it's
 * simpler right now to just allow everything to continue. This is somewhat
 * expensive, but it can be sped up with more cached information when that's
 * justified. We process things in the following order:
 *
 * 1) Evaluate if someone can now take a controller write lock. If so, wake up
 *    the head of the list and all subsequent processing is done.
 * 2) Evaluate if there are pending readers for the controller. If so, wake up
 *    each and every waiter. Always continue on to the namespaces in this
 *    case.
 *
 * For each namespace:
 *
 * 1) Evaluate if there are pending writers and one of them can take the write
 *    lock. If so, wake up the head of the list and continue to the next
 *    namespace.
 * 2) Otherwise, if there are pending readers, wake up each and every reader.
 *    Continue on to the next namespace.
 */
static void
nvme_rwlock_wakeup(nvme_t *nvme)
{
	nvme_lock_t *ctrl_lock = &nvme->n_lock;

	/*
	 * This assertion may seem weird, but it's actually a bit of an
	 * invariant. When the controller's write lock is taken, by definition
	 * there are no other locks that can be taken. Therefore, if something
	 * was just unlocked while the controller write lock was still held,
	 * we'd be violating our rules.
	 */
	VERIFY3P(ctrl_lock->nl_writer, ==, NULL);

	/*
	 * If there are pending writers, either one of them will be woken up or
	 * no one will. Writers trump readers, but it's possible that we may
	 * not be able to wake up a waiting writer yet. If we take this arm, we
	 * should not process anything else. The same logic applies in the
	 * namespace case as well.
	 */
	if (list_is_empty(&ctrl_lock->nl_pend_writers) == 0) {
		nvme_minor_lock_info_t *info;

		if (!nvme_rwlock_handoff_ctrl_wrlock(nvme))
			return;

		/*
		 * We opt to indicate that this is unlocked ahead of
		 * taking the lock for state tracking purposes.
		 */
		info = list_remove_head(&ctrl_lock->nl_pend_writers);
		info->nli_state = NVME_LOCK_STATE_UNLOCKED;
		nvme_rwlock_wrlock(info, ctrl_lock);
		nvme_rwlock_signal_one(info, NVME_IOCTL_E_OK);
		return;
	}

	nvme_rwlock_wakeup_readers(ctrl_lock);
	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
		nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
		nvme_lock_t *ns_lock = &ns->ns_lock;

		if (list_is_empty(&ns_lock->nl_pend_writers) == 0) {
			nvme_minor_lock_info_t *info;

			if (!nvme_rwlock_handoff_ns_wrlock(nvme, ns))
				continue;

			info = list_remove_head(&ns_lock->nl_pend_writers);
			info->nli_state = NVME_LOCK_STATE_UNLOCKED;
			nvme_rwlock_wrlock(info, ns_lock);
			nvme_rwlock_signal_one(info, NVME_IOCTL_E_OK);
		} else {
			nvme_rwlock_wakeup_readers(ns_lock);
		}
	}
}

/*
 * This cleans up all the state in the minor for returning without a lock
 * held.
 */
static void
nvme_rwunlock_cleanup_minor(nvme_minor_lock_info_t *info)
{
	info->nli_lock = NULL;
	info->nli_state = NVME_LOCK_STATE_UNLOCKED;
	info->nli_curlevel = 0;
	info->nli_ns = NULL;
}

/*
 * We've been asked to unlock a lock. Not only must we remove our hold from
 * this lock, we must go through and wake up the next waiter. The waiters that
 * we have to wake up vary depending on our lock. See the section 'ioctls,
 * Errors, and Exclusive Access' in the theory statement for more information.
 */
void
nvme_rwunlock(nvme_minor_lock_info_t *info, nvme_lock_t *lock)
{
	nvme_t *const nvme = info->nli_nvme;
	boolean_t is_read;

	VERIFY(MUTEX_HELD(&nvme->n_minor_mutex));
	VERIFY3P(info->nli_lock, ==, lock);
	VERIFY(info->nli_curlevel == NVME_LOCK_L_READ ||
	    info->nli_curlevel == NVME_LOCK_L_WRITE);
	is_read = info->nli_curlevel == NVME_LOCK_L_READ;

	/*
	 * First we need to remove this minor from the lock and clean up all of
	 * the state for this lock in the info structure.
	 */
	info->nli_last_change = gethrtime();
	if (is_read) {
		VERIFY3U(list_link_active(&info->nli_node), !=, 0);
		ASSERT3U(nvme_rwlock_is_reader(lock, info), ==, B_TRUE);
		list_remove(&lock->nl_readers, info);
	} else {
		VERIFY3U(list_link_active(&info->nli_node), ==, 0);
		VERIFY3P(lock->nl_writer, ==, info);
		lock->nl_writer = NULL;
	}

	nvme_rwunlock_cleanup_minor(info);
	nvme_rwlock_wakeup(nvme);
}

/*
 * We were just interrupted due to a signal. However, just because our block
 * was interrupted due to a signal doesn't mean that other activity didn't
 * occur. In particular, the signal wake up could race with a subsequent wake
 * up that was due to the device being removed or to actually acquiring the
 * lock. Depending on which state we were in, we need to perform the
 * appropriate clean up. In all cases, the signal trumps all, which may mean
 * actually unlocking!
 */
static void
nvme_rwlock_signal(nvme_minor_lock_info_t *info, nvme_lock_t *lock,
    boolean_t is_read)
{
	ASSERT3P(info->nli_ioc, !=, NULL);

	/*
	 * We're changing the state here, so update the minor's last change
	 * time.
	 */
	info->nli_last_change = gethrtime();
	lock->nl_nsignals++;

	/*
	 * This is the simplest case. We've already been removed from the lock
	 * that we're on. All we need to do is change the error to indicate
	 * that we received a signal.
	 */
	if (info->nli_state == NVME_LOCK_STATE_UNLOCKED) {
		ASSERT3P(info->nli_lock, ==, NULL);
		(void) nvme_ioctl_error(info->nli_ioc,
		    NVME_IOCTL_E_LOCK_WAIT_SIGNAL, 0, 0);
		lock->nl_nsig_unlock++;
		return;
	}

	/*
	 * For all others, the lock should be set here.
	 */
	ASSERT3P(info->nli_lock, ==, lock);

	/*
	 * For someone that was blocked, we need to remove them from the
	 * pending lists.
	 */
	if (info->nli_state == NVME_LOCK_STATE_BLOCKED) {
		ASSERT3S(list_link_active(&info->nli_node), !=, 0);
		if (is_read) {
			list_remove(&lock->nl_pend_readers, info);
		} else {
			list_remove(&lock->nl_pend_writers, info);
		}

		nvme_rwunlock_cleanup_minor(info);
		(void) nvme_ioctl_error(info->nli_ioc,
		    NVME_IOCTL_E_LOCK_WAIT_SIGNAL, 0, 0);
		lock->nl_nsig_blocks++;
		return;
	}

	/*
	 * Now, the most nuanced thing that we need to do. We need to unlock
	 * this node. We synthesize an unlock request and submit that.
	 */
	lock->nl_nsig_acq++;
	nvme_rwunlock(info, lock);
}

/*
 * Here we need to implement our read-write lock policy. Refer to the big
 * theory statement for more information. Here's a summary of the priority
 * rules that are relevant here:
 *
 * 1) Waiting writers starve waiting readers.
 * 2) Waiting writers for the controller starve all namespace writers and
 *    readers.
 * 3) A read lock can be taken if there are no pending or active writers on
 *    the lock (and on the controller lock for a namespace).
 */
void
nvme_rwlock(nvme_minor_t *minor, nvme_ioctl_lock_t *req)
{
	nvme_t *const nvme = minor->nm_ctrl;
	const boolean_t is_nonblock = (req->nil_flags &
	    NVME_LOCK_F_DONT_BLOCK) != 0;
	const boolean_t is_read = req->nil_level == NVME_LOCK_L_READ;
	const boolean_t is_ctrl = req->nil_ent == NVME_LOCK_E_CTRL;
	nvme_minor_lock_info_t *info;
	nvme_lock_t *lock;
	boolean_t waiters;
	hrtime_t sleep_time;

	VERIFY(MUTEX_HELD(&nvme->n_minor_mutex));

	if (is_ctrl) {
		info = &minor->nm_ctrl_lock;
		lock = &nvme->n_lock;

		if (is_read) {
			waiters = nvme_rwlock_block_ctrl_rdlock(nvme);
		} else {
			waiters = nvme_rwlock_block_ctrl_wrlock(nvme);
		}
	} else {
		nvme_namespace_t *ns;
		const uint32_t nsid = req->nil_common.nioc_nsid;
		info = &minor->nm_ns_lock;

		VERIFY3U(req->nil_ent, ==, NVME_LOCK_E_NS);
		ns = nvme_nsid2ns(nvme, nsid);
		minor->nm_ns_lock.nli_ns = ns;
		lock = &ns->ns_lock;

		if (is_read) {
			waiters = nvme_rwlock_block_ns_rdlock(nvme, ns);
		} else {
			waiters = nvme_rwlock_block_ns_wrlock(nvme, ns);
		}
	}

	/*
	 * Set the information that indicates what kind of lock we're
	 * attempting to acquire and what we're operating on.
	 */
	info->nli_curlevel = is_read ? NVME_LOCK_L_READ : NVME_LOCK_L_WRITE;
	info->nli_lock = lock;

	/*
	 * We think we can get the lock, hurrah.
	 */
	if (!waiters) {
		if (is_read) {
			nvme_rwlock_rdlock(info, lock);
		} else {
			nvme_rwlock_wrlock(info, lock);
		}
		(void) nvme_ioctl_success(&req->nil_common);
		return;
	}

	/*
	 * We failed to get the lock. At this point we will set ourselves up to
	 * block. Once we go to sleep on the CV, our assumption is that anyone
	 * who has woken us up will have filled in the information about the
	 * status of this operation and therefore, after this point, all we
	 * have to do is return.
	 */
	if (is_nonblock) {
		nvme_rwunlock_cleanup_minor(info);
		lock->nl_nnonblock++;
		(void) nvme_ioctl_error(&req->nil_common,
		    NVME_IOCTL_E_LOCK_WOULD_BLOCK, 0, 0);
		return;
	}

	ASSERT3P(info->nli_ioc, ==, NULL);
	info->nli_ioc = &req->nil_common;
	if (is_read) {
		list_insert_tail(&lock->nl_pend_readers, info);
		lock->nl_npend_reads++;
	} else {
		list_insert_tail(&lock->nl_pend_writers, info);
		lock->nl_npend_writes++;
	}

	ASSERT3U(info->nli_state, ==, NVME_LOCK_STATE_UNLOCKED);
	info->nli_state = NVME_LOCK_STATE_BLOCKED;
	sleep_time = gethrtime();
	info->nli_last_change = sleep_time;
	while (info->nli_state == NVME_LOCK_STATE_BLOCKED) {
		/*
		 * Block until we are woken up. Note, a signal trumps all other
		 * processing. We may be woken up here because we acquired a
		 * lock; we may also end up woken up here if the controller is
		 * marked as dead.
		 */
		if (cv_wait_sig(&minor->nm_cv, &nvme->n_minor_mutex) == 0) {
			nvme_rwlock_signal(info, lock, is_read);
			break;
		}
	}

	/*
	 * Before we return, clean up and sanity check our state.
	 */
	info->nli_ioc = NULL;
#ifdef	DEBUG
	ASSERT3S(info->nli_last_change, !=, sleep_time);
	if (info->nli_state == NVME_LOCK_STATE_UNLOCKED) {
		ASSERT3S(list_link_active(&info->nli_node), ==, 0);
		ASSERT3P(info->nli_ns, ==, NULL);
		ASSERT3U(req->nil_common.nioc_drv_err, !=, NVME_IOCTL_E_OK);
	} else {
		ASSERT3U(info->nli_state, ==, NVME_LOCK_STATE_ACQUIRED);
		ASSERT3U(req->nil_common.nioc_drv_err, ==, NVME_IOCTL_E_OK);
		if (is_read) {
			ASSERT3S(list_link_active(&info->nli_node), !=, 0);
		} else {
			ASSERT3P(lock->nl_writer, ==, info);
		}
	}
	ASSERT3P(info->nli_minor, ==, minor);
	ASSERT3P(info->nli_nvme, ==, minor->nm_ctrl);
#endif
}

/*
 * This is used to clean up a single minor that was blocked trying to get a
 * lock when the controller was marked dead. In particular, the key here is
 * that we need to change its state to unlocked by cleaning it up and then
 * signal it to wake up and process things. The clean up also helps deal with
 * the case of a racing signal, though it does leave the state a little
 * awkward in this intermediate moment; however, since the minor has been
 * removed from the pending list, that is really the proper action and no one
 * can issue new lock ioctls at this point.
 */
static void
nvme_rwlock_ctrl_dead_cleanup_one(nvme_t *nvme, nvme_minor_lock_info_t *info)
{
	ASSERT3U(info->nli_state, ==, NVME_LOCK_STATE_BLOCKED);
	ASSERT3P(info->nli_ioc, !=, NULL);

	/*
	 * Update the last time this has changed for our sanity checks.
	 */
	info->nli_last_change = gethrtime();
	nvme_rwunlock_cleanup_minor(info);
	nvme_rwlock_signal_one(info, nvme->n_dead_status);
}

/*
 * We've just been informed that this controller has set n_dead. This is most
 * unfortunate for anyone trying to actively use it right now and we must
 * notify them. Anyone who has successfully obtained a lock gets to keep it
 * until they drop it (hopefully soon). Anyone who is asleep waiting for a
 * lock should be woken up and told that they are not getting it.
 *
 * The moment we grab n_minor_mutex, no other state here can change. So we can
 * go ahead and wake up all waiters with impunity. This is being called from
 * the nvme_dead_taskq.
 */
void
nvme_rwlock_ctrl_dead(void *arg)
{
	nvme_t *nvme = arg;
	nvme_lock_t *ctrl_lock = &nvme->n_lock;
	nvme_minor_lock_info_t *info;

	mutex_enter(&nvme->n_minor_mutex);
	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
		nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
		nvme_lock_t *ns_lock = &ns->ns_lock;

		while ((info = list_remove_head(&ns_lock->nl_pend_readers)) !=
		    NULL) {
			nvme_rwlock_ctrl_dead_cleanup_one(nvme, info);
		}

		while ((info = list_remove_head(&ns_lock->nl_pend_writers)) !=
		    NULL) {
			nvme_rwlock_ctrl_dead_cleanup_one(nvme, info);
		}
	}

	while ((info = list_remove_head(&ctrl_lock->nl_pend_readers)) !=
	    NULL) {
		nvme_rwlock_ctrl_dead_cleanup_one(nvme, info);
	}

	while ((info = list_remove_head(&ctrl_lock->nl_pend_writers)) !=
	    NULL) {
		nvme_rwlock_ctrl_dead_cleanup_one(nvme, info);
	}
	mutex_exit(&nvme->n_minor_mutex);
}

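/*
 * Tear down a lock's tracking lists. By the time this is called there must
 * be no writer and all of the lists are expected to be empty.
 */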
void
nvme_lock_fini(nvme_lock_t *lock)
{
	VERIFY3P(lock->nl_writer, ==, NULL);
	list_destroy(&lock->nl_pend_writers);
	list_destroy(&lock->nl_pend_readers);
	list_destroy(&lock->nl_readers);
}

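/*
 * Initialize the tracking lists for a controller or namespace lock. All three
 * lists link through a minor's nli_node, so a given minor's lock info can
 * only sit on one of these lists at a time.
 */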
void
nvme_lock_init(nvme_lock_t *lock)
{
	list_create(&lock->nl_readers, sizeof (nvme_minor_lock_info_t),
	    offsetof(nvme_minor_lock_info_t, nli_node));
	list_create(&lock->nl_pend_readers, sizeof (nvme_minor_lock_info_t),
	    offsetof(nvme_minor_lock_info_t, nli_node));
	list_create(&lock->nl_pend_writers, sizeof (nvme_minor_lock_info_t),
	    offsetof(nvme_minor_lock_info_t, nli_node));
}