1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /* ONC_PLUS EXTRACT START */
22
23 /*
24 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
25 * Use is subject to license terms.
26 */
27
28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
29 /* All Rights Reserved */
30
31 #pragma ident "%Z%%M% %I% %E% SMI"
32
33 #include <sys/flock_impl.h>
34 #include <sys/vfs.h>
35 #include <sys/t_lock.h> /* for <sys/callb.h> */
36 #include <sys/callb.h>
37 #include <sys/clconf.h>
38 #include <sys/cladm.h>
39 #include <sys/nbmlock.h>
40 #include <sys/cred.h>
41 #include <sys/policy.h>
42
43 /*
44 * The following variables are for statistics purposes and they are
45 * not protected by locks. They may not be accurate but will at least be
46 * close to the actual value.
47 */
48
49 int flk_lock_allocs;
50 int flk_lock_frees;
51 int edge_allocs;
52 int edge_frees;
53 int flk_proc_vertex_allocs;
54 int flk_proc_edge_allocs;
55 int flk_proc_vertex_frees;
56 int flk_proc_edge_frees;
57
58 static kmutex_t flock_lock;
59
60 #ifdef DEBUG
61 int check_debug = 0;
62 #define CHECK_ACTIVE_LOCKS(gp) if (check_debug) \
63 check_active_locks(gp);
64 #define CHECK_SLEEPING_LOCKS(gp) if (check_debug) \
65 check_sleeping_locks(gp);
66 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp) \
67 if (check_debug) \
68 check_owner_locks(gp, pid, sysid, vp);
69 #define CHECK_LOCK_TRANSITION(old_state, new_state) \
70 { \
71 if (check_lock_transition(old_state, new_state)) { \
72 cmn_err(CE_PANIC, "Illegal lock transition \
73 from %d to %d", old_state, new_state); \
74 } \
75 }
76 #else
77
78 #define CHECK_ACTIVE_LOCKS(gp)
79 #define CHECK_SLEEPING_LOCKS(gp)
80 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
81 #define CHECK_LOCK_TRANSITION(old_state, new_state)
82
83 #endif /* DEBUG */
84
85 struct kmem_cache *flk_edge_cache;
86
87 graph_t *lock_graph[HASH_SIZE];
88 proc_graph_t pgraph;
89
90 /*
91 * Clustering.
92 *
93 * NLM REGISTRY TYPE IMPLEMENTATION
94 *
95 * Assumptions:
96 * 1. Nodes in a cluster are numbered starting at 1; always non-negative
97 * integers; maximum node id is returned by clconf_maximum_nodeid().
98 * 2. We use this node id to identify the node an NLM server runs on.
99 */
100
101 /*
102 * NLM registry object keeps track of NLM servers via their
103 * nlmids (which are the node ids of the node in the cluster they run on)
104 * that have requested locks at this LLM with which this registry is
105 * associated.
106 *
107 * Representation of abstraction:
108 * rep = record[ states: array[nlm_state],
109 * lock: mutex]
110 *
111 * Representation invariants:
112 * 1. index i of rep.states is between 0 and n - 1 where n is the number
113 * of elements in the array, which happens to be the maximum number
114 * of nodes in the cluster configuration + 1.
115 * 2. map nlmid to index i of rep.states
116 * 0 -> 0
117 * 1 -> 1
118 * 2 -> 2
119 * n-1 -> n-1 (i.e., clconf_maximum_nodeid())
120 * 3. This 1-1 mapping is quite convenient and it avoids errors resulting
121 * from forgetting to subtract 1 from the index.
122 * 4. The reason we keep the 0th index is the following. A legitimate
123 * cluster configuration includes making a UFS file system NFS
124 * exportable. The code is structured so that if you're in a cluster
125 * you do one thing; otherwise, you do something else. The problem
126 * is what to do if you think you're in a cluster with PXFS loaded,
127 * but you're using UFS not PXFS? The upper two bytes of the sysid
128 * encode the node id of the node where NLM server runs; these bytes
129 * are zero for UFS. Since the nodeid is used to index into the
130 * registry, we can record the NLM server state information at index
131 * 0 using the same mechanism used for PXFS file locks!
132 */
133 static flk_nlm_status_t *nlm_reg_status = NULL; /* state array 0..N-1 */
134 static kmutex_t nlm_reg_lock; /* lock to protect array */
135 static uint_t nlm_status_size; /* size of state array */
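
/*
 * Editor's sketch (kept out of the build with #if 0): how a lock's sysid
 * maps onto the registry described above.  GETNLMID() is the macro used
 * later in this file; a UFS sysid has zero in its upper bytes and so
 * lands at index 0.  The helper name is hypothetical.
 */
#if 0
static flk_nlm_status_t
flk_nlm_state_of_sysid(int sysid)
{
	int nlmid = GETNLMID(sysid);	/* node id of the NLM server; 0 for UFS */
	flk_nlm_status_t state;

	ASSERT(nlmid >= 0 && nlmid < nlm_status_size);
	mutex_enter(&nlm_reg_lock);
	state = nlm_reg_status[nlmid];
	mutex_exit(&nlm_reg_lock);
	return (state);
}
#endif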
136
137 /*
138 * Although we need a global lock dependency graph (and associated data
139 * structures), we also need a per-zone notion of whether the lock manager is
140 * running, and so whether to allow lock manager requests or not.
141 *
142 * Thus, on a per-zone basis we maintain a ``global'' variable
143 * (flk_lockmgr_status), protected by flock_lock, and set when the lock
144 * manager is determined to be changing state (starting or stopping).
145 *
146 * Each graph/zone pair also has a copy of this variable, which is protected by
147 * the graph's mutex.
148 *
149 * The per-graph copies are used to synchronize lock requests with shutdown
150 * requests. The global copy is used to initialize the per-graph field when a
151 * new graph is created.
152 */
153 struct flock_globals {
154 flk_lockmgr_status_t flk_lockmgr_status;
155 flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
156 };
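
/*
 * Editor's sketch (#if 0, not part of the original code): how the per-zone
 * ``global'' copy and the per-graph copies of the lockmgr status are meant
 * to be updated, per the comment above -- the global copy under flock_lock,
 * each per-graph copy under that graph's mutex.  The helper name is
 * hypothetical.
 */
#if 0
static void
flk_set_lockmgr_status_sketch(struct flock_globals *fg,
    flk_lockmgr_status_t status)
{
	int i;
	graph_t *gp;

	mutex_enter(&flock_lock);
	fg->flk_lockmgr_status = status;	/* global copy */
	mutex_exit(&flock_lock);

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL)
			continue;
		mutex_enter(&gp->gp_mutex);
		fg->lockmgr_status[i] = status;	/* per-graph copy */
		mutex_exit(&gp->gp_mutex);
	}
}
#endif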
157
158 zone_key_t flock_zone_key;
159
160 static void create_flock(lock_descriptor_t *, flock64_t *);
161 static lock_descriptor_t *flk_get_lock(void);
162 static void flk_free_lock(lock_descriptor_t *lock);
163 static void flk_get_first_blocking_lock(lock_descriptor_t *request);
164 static int flk_process_request(lock_descriptor_t *);
165 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
166 static edge_t *flk_get_edge(void);
167 static int flk_wait_execute_request(lock_descriptor_t *);
168 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
169 static void flk_insert_active_lock(lock_descriptor_t *);
170 static void flk_delete_active_lock(lock_descriptor_t *, int);
171 static void flk_insert_sleeping_lock(lock_descriptor_t *);
172 static void flk_graph_uncolor(graph_t *);
173 static void flk_wakeup(lock_descriptor_t *, int);
174 static void flk_free_edge(edge_t *);
175 static void flk_recompute_dependencies(lock_descriptor_t *,
176 lock_descriptor_t **, int, int);
177 static int flk_find_barriers(lock_descriptor_t *);
178 static void flk_update_barriers(lock_descriptor_t *);
179 static int flk_color_reachables(lock_descriptor_t *);
180 static int flk_canceled(lock_descriptor_t *);
181 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
182 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
183 static void wait_for_lock(lock_descriptor_t *);
184 static void unlock_lockmgr_granted(struct flock_globals *);
185 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
186
187 /* Clustering hooks */
188 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
189 static void cl_flk_wakeup_sleeping_nlm_locks(int);
190 static void cl_flk_unlock_nlm_granted(int);
191
192 #ifdef DEBUG
193 static int check_lock_transition(int, int);
194 static void check_sleeping_locks(graph_t *);
195 static void check_active_locks(graph_t *);
196 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
197 static void path(lock_descriptor_t *, lock_descriptor_t *);
198 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
199 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
200 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
201 #endif
202
203 /* proc_graph function definitions */
204 static int flk_check_deadlock(lock_descriptor_t *);
205 static void flk_proc_graph_uncolor(void);
206 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
207 static proc_edge_t *flk_get_proc_edge(void);
208 static void flk_proc_release(proc_vertex_t *);
209 static void flk_free_proc_edge(proc_edge_t *);
210 static void flk_update_proc_graph(edge_t *, int);
211
212 /* Non-blocking mandatory locking */
213 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
214 u_offset_t);
215
216 static struct flock_globals *
217 flk_get_globals(void)
218 {
219 /*
220 * The KLM module had better be loaded if we're attempting to handle
221 * lockmgr requests.
222 */
223 ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
224 return (zone_getspecific(flock_zone_key, curproc->p_zone));
225 }
226
227 static flk_lockmgr_status_t
228 flk_get_lockmgr_status(void)
229 {
230 struct flock_globals *fg;
231
232 ASSERT(MUTEX_HELD(&flock_lock));
233
234 if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
235 /*
236 * KLM module not loaded; lock manager definitely not running.
237 */
238 return (FLK_LOCKMGR_DOWN);
239 }
240 fg = flk_get_globals();
241 return (fg->flk_lockmgr_status);
242 }
243
244 /*
245 * Routine called from fs_frlock in fs/fs_subr.c
246 */
247
248 int
249 reclock(vnode_t *vp,
250 flock64_t *lckdat,
251 int cmd,
252 int flag,
253 u_offset_t offset,
254 flk_callback_t *flk_cbp)
255 {
256 lock_descriptor_t stack_lock_request;
257 lock_descriptor_t *lock_request;
258 int error = 0;
259 graph_t *gp;
260 int nlmid;
261
262 /*
263 * Check access permissions
264 */
265 if ((cmd & SETFLCK) &&
266 ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
267 (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
268 return (EBADF);
269
270 /*
271 * for query and unlock we use the stack_lock_request
272 */
273
274 if ((lckdat->l_type == F_UNLCK) ||
275 !((cmd & INOFLCK) || (cmd & SETFLCK))) {
276 lock_request = &stack_lock_request;
277 (void) bzero((caddr_t)lock_request,
278 sizeof (lock_descriptor_t));
279
280 /*
281 * The following is needed so that the assertions in
282 * flk_execute_request() hold for this stack-allocated request.
283 */
284
285 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
286 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
287 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
288 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
289 lock_request->l_status = FLK_INITIAL_STATE;
290 } else {
291 lock_request = flk_get_lock();
292 }
293 lock_request->l_state = 0;
294 lock_request->l_vnode = vp;
295 lock_request->l_zoneid = getzoneid();
296
297 /*
298 * Convert the request range into the canonical start and end
299 * values. The NLM protocol supports locking over the entire
300 * 32-bit range, so there's no range checking for remote requests,
301 * but we still need to verify that local requests obey the rules.
302 */
303 /* Clustering */
304 if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
305 ASSERT(lckdat->l_whence == 0);
306 lock_request->l_start = lckdat->l_start;
307 lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
308 lckdat->l_start + (lckdat->l_len - 1);
309 } else {
310 /* check the validity of the lock range */
311 error = flk_convert_lock_data(vp, lckdat,
312 &lock_request->l_start, &lock_request->l_end,
313 offset);
314 if (error) {
315 goto done;
316 }
317 error = flk_check_lock_data(lock_request->l_start,
318 lock_request->l_end, MAXEND);
319 if (error) {
320 goto done;
321 }
322 }
323
324 ASSERT(lock_request->l_end >= lock_request->l_start);
325
326 lock_request->l_type = lckdat->l_type;
327 if (cmd & INOFLCK)
328 lock_request->l_state |= IO_LOCK;
329 if (cmd & SLPFLCK)
330 lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
331 if (cmd & RCMDLCK)
332 lock_request->l_state |= LOCKMGR_LOCK;
333 if (cmd & NBMLCK)
334 lock_request->l_state |= NBMAND_LOCK;
335 /*
336 * Clustering: set flag for PXFS locks
337 * We do not _only_ check for the PCMDLCK flag because PXFS locks could
338 * also be of type 'RCMDLCK'.
339 * We do not _only_ check the GETPXFSID() macro because local PXFS
340 * clients use a pxfsid of zero to permit deadlock detection in the LLM.
341 */
342
343 if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
344 lock_request->l_state |= PXFS_LOCK;
345 }
346 if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
347 if (lock_request->l_type == F_RDLCK ||
348 lock_request->l_type == F_WRLCK)
349 lock_request->l_state |= QUERY_LOCK;
350 }
351 lock_request->l_flock = (*lckdat);
352 lock_request->l_callbacks = flk_cbp;
353
354 /*
355 * We are ready for processing the request
356 */
357 if (IS_LOCKMGR(lock_request)) {
358 /*
359 * If the lock request is an NLM server request ....
360 */
361 if (nlm_status_size == 0) { /* not booted as cluster */
362 mutex_enter(&flock_lock);
363 /*
364 * Bail out if this is a lock manager request and the
365 * lock manager is not supposed to be running.
366 */
367 if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
368 mutex_exit(&flock_lock);
369 error = ENOLCK;
370 goto done;
371 }
372 mutex_exit(&flock_lock);
373 } else { /* booted as a cluster */
374 nlmid = GETNLMID(lock_request->l_flock.l_sysid);
375 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
376
377 mutex_enter(&nlm_reg_lock);
378 /*
379 * If the NLM registry does not know about this
380 * NLM server making the request, add its nlmid
381 * to the registry.
382 */
383 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
384 nlmid)) {
385 FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
386 } else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
387 nlmid)) {
388 /*
389 * If the NLM server is already known (has made
390 * previous lock requests) and its state is
391 * not NLM_UP (meaning the NLM server is
392 * shutting down), then bail out with an
393 * error to deny the lock request.
394 */
395 mutex_exit(&nlm_reg_lock);
396 error = ENOLCK;
397 goto done;
398 }
399 mutex_exit(&nlm_reg_lock);
400 }
401 }
402
403 /* Now get the lock graph for a particular vnode */
404 gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
405
406 /*
407 * We drop the rwlock here; otherwise this might end up causing a
408 * deadlock if this IO_LOCK sleeps. (bugid # 1183392).
409 */
410
411 if (IS_IO_LOCK(lock_request)) {
412 VOP_RWUNLOCK(vp,
413 (lock_request->l_type == F_RDLCK) ?
414 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
415 }
416 mutex_enter(&gp->gp_mutex);
417
418 lock_request->l_state |= REFERENCED_LOCK;
419 lock_request->l_graph = gp;
420
421 switch (lock_request->l_type) {
422 case F_RDLCK:
423 case F_WRLCK:
424 if (IS_QUERY_LOCK(lock_request)) {
425 flk_get_first_blocking_lock(lock_request);
426 (*lckdat) = lock_request->l_flock;
427 break;
428 }
429
430 /* process the request now */
431
432 error = flk_process_request(lock_request);
433 break;
434
435 case F_UNLCK:
436 /* unlock request will not block so execute it immediately */
437
438 if (IS_LOCKMGR(lock_request) &&
439 flk_canceled(lock_request)) {
440 error = 0;
441 } else {
442 error = flk_execute_request(lock_request);
443 }
444 break;
445
446 case F_UNLKSYS:
447 /*
448 * Recovery mechanism to release lock manager locks when
449 * an NFS client crashes and restarts. The NFS server will clear
450 * the old locks and grant new locks.
451 */
452
453 if (lock_request->l_flock.l_sysid == 0) {
454 mutex_exit(&gp->gp_mutex);
455 return (EINVAL);
456 }
457 if (secpolicy_nfs(CRED()) != 0) {
458 mutex_exit(&gp->gp_mutex);
459 return (EPERM);
460 }
461 flk_delete_locks_by_sysid(lock_request);
462 lock_request->l_state &= ~REFERENCED_LOCK;
463 flk_set_state(lock_request, FLK_DEAD_STATE);
464 flk_free_lock(lock_request);
465 mutex_exit(&gp->gp_mutex);
466 return (0);
467
468 default:
469 error = EINVAL;
470 break;
471 }
472
473 /* Clustering: For blocked PXFS locks, return */
474 if (error == PXFS_LOCK_BLOCKED) {
475 lock_request->l_state &= ~REFERENCED_LOCK;
476 mutex_exit(&gp->gp_mutex);
477 return (error);
478 }
479
480 /*
481 * Now that we have seen the status of locks in the system for
482 * this vnode we acquire the rwlock if it is an IO_LOCK.
483 */
484
485 if (IS_IO_LOCK(lock_request)) {
486 (void) VOP_RWLOCK(vp,
487 (lock_request->l_type == F_RDLCK) ?
488 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
489 if (!error) {
490 lckdat->l_type = F_UNLCK;
491
492 /*
493 * This wakeup is needed; otherwise, if the
494 * IO_LOCK has slept, its dependents would
495 * never be woken up (bugid # 1185482).
496 */
497
498 flk_wakeup(lock_request, 1);
499 flk_set_state(lock_request, FLK_DEAD_STATE);
500 flk_free_lock(lock_request);
501 }
502 /*
503 * Otherwise, if an error occurred, either flk_process_request()
504 * returned EDEADLK, in which case there are no
505 * dependents for this lock, or flk_wait_execute_request()
506 * returned EINTR, in which case flk_cancel_sleeping_lock()
507 * has already been done. The same is true for EBADF.
508 */
509 }
510
511 if (lock_request == &stack_lock_request) {
512 flk_set_state(lock_request, FLK_DEAD_STATE);
513 } else {
514 lock_request->l_state &= ~REFERENCED_LOCK;
515 if ((error != 0) || IS_DELETED(lock_request)) {
516 flk_set_state(lock_request, FLK_DEAD_STATE);
517 flk_free_lock(lock_request);
518 }
519 }
520
521 mutex_exit(&gp->gp_mutex);
522 return (error);
523
524 done:
525 flk_set_state(lock_request, FLK_DEAD_STATE);
526 if (lock_request != &stack_lock_request)
527 flk_free_lock(lock_request);
528 return (error);
529 }
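
/*
 * Editor's sketch (#if 0): what a typical blocking write-lock call into
 * reclock() looks like from a file system (cf. fs_frlock() in
 * fs/fs_subr.c).  The function name is hypothetical and the callback
 * argument is omitted for brevity.
 */
#if 0
static int
sample_setlkw(vnode_t *vp, u_offset_t offset)
{
	flock64_t bf;

	bf.l_type = F_WRLCK;		/* exclusive lock */
	bf.l_whence = 0;		/* l_start is an absolute offset */
	bf.l_start = 0;
	bf.l_len = 0;			/* zero length means to end of file */
	bf.l_sysid = 0;			/* local (non-NLM, non-PXFS) request */
	bf.l_pid = curproc->p_pid;

	/* SETFLCK: set a lock; SLPFLCK: willing to sleep if it blocks */
	return (reclock(vp, &bf, SETFLCK | SLPFLCK, FREAD | FWRITE,
	    offset, NULL));
}
#endif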
530
531 /*
532 * Invoke the callbacks in the given list. If before sleeping, invoke in
533 * list order. If after sleeping, invoke in reverse order.
534 *
535 * CPR (suspend/resume) support: if one of the callbacks returns a
536 * callb_cpr_t, return it. This will be used to make the thread CPR-safe
537 * while it is sleeping. There should be at most one callb_cpr_t for the
538 * thread.
539 * XXX This is unnecessarily complicated. The CPR information should just
540 * get passed in directly through VOP_FRLOCK and reclock, rather than
541 * sneaking it in via a callback.
542 */
543
544 callb_cpr_t *
545 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
546 {
547 callb_cpr_t *cpr_callbackp = NULL;
548 callb_cpr_t *one_result;
549 flk_callback_t *cb;
550
551 if (cblist == NULL)
552 return (NULL);
553
554 if (when == FLK_BEFORE_SLEEP) {
555 cb = cblist;
556 do {
557 one_result = (*cb->cb_callback)(when, cb->cb_data);
558 if (one_result != NULL) {
559 ASSERT(cpr_callbackp == NULL);
560 cpr_callbackp = one_result;
561 }
562 cb = cb->cb_next;
563 } while (cb != cblist);
564 } else {
565 cb = cblist->cb_prev;
566 do {
567 one_result = (*cb->cb_callback)(when, cb->cb_data);
568 if (one_result != NULL) {
569 cpr_callbackp = one_result;
570 }
571 cb = cb->cb_prev;
572 } while (cb != cblist->cb_prev);
573 }
574
575 return (cpr_callbackp);
576 }
577
578 /*
579 * Initialize a flk_callback_t to hold the given callback.
580 */
581
582 void
583 flk_init_callback(flk_callback_t *flk_cb,
584 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
585 {
586 flk_cb->cb_next = flk_cb;
587 flk_cb->cb_prev = flk_cb;
588 flk_cb->cb_callback = cb_fcn;
589 flk_cb->cb_data = cbdata;
590 }
591
592 /*
593 * Initialize an flk_callback_t and then link it into the head of an
594 * existing list (which may be NULL).
595 */
596
597 void
598 flk_add_callback(flk_callback_t *newcb,
599 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
600 void *cbdata, flk_callback_t *cblist)
601 {
602 flk_init_callback(newcb, cb_fcn, cbdata);
603
604 if (cblist == NULL)
605 return;
606
607 newcb->cb_prev = cblist->cb_prev;
608 newcb->cb_next = cblist;
609 cblist->cb_prev->cb_next = newcb;
610 cblist->cb_prev = newcb;
611 }
612 /* ONC_PLUS EXTRACT END */
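
/*
 * Editor's sketch (#if 0): the shape of a caller-supplied callback used
 * with flk_init_callback()/flk_invoke_callbacks() above.  Returning the
 * callb_cpr_t (initialized elsewhere with CALLB_CPR_INIT) from the
 * FLK_BEFORE_SLEEP invocation makes the sleeping thread CPR-safe; the
 * FLK_AFTER_SLEEP invocation returns NULL.  Names are hypothetical.
 */
#if 0
static callb_cpr_t sample_cpr;

/* ARGSUSED */
static callb_cpr_t *
sample_frlock_callback(flk_cb_when_t when, void *data)
{
	if (when == FLK_BEFORE_SLEEP)
		return (&sample_cpr);
	return (NULL);
}
#endif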
613
614 /*
615 * Initialize the flk_edge_cache data structure and create the
616 * nlm_reg_status array.
617 */
618
619 void
620 flk_init(void)
621 {
622 uint_t i;
623
624 flk_edge_cache = kmem_cache_create("flk_edges",
625 sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
626 if (flk_edge_cache == NULL) {
627 cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
628 }
629 /*
630 * Create the NLM registry object.
631 */
632
633 if (cluster_bootflags & CLUSTER_BOOTED) {
634 /*
635 * This routine tells you the maximum node id that will be used
636 * in the cluster. This number, plus 1, will be the size of the nlm
637 * registry status array. We add 1 because we will be using
638 * all entries indexed from 0 to maxnodeid; e.g., from 0
639 * to 64, for a total of 65 entries.
640 */
641 nlm_status_size = clconf_maximum_nodeid() + 1;
642 } else {
643 nlm_status_size = 0;
644 }
645
646 if (nlm_status_size != 0) { /* booted as a cluster */
647 nlm_reg_status = (flk_nlm_status_t *)
648 kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
649 KM_SLEEP);
650
651 /* initialize all NLM states in array to NLM_UNKNOWN */
652 for (i = 0; i < nlm_status_size; i++) {
653 nlm_reg_status[i] = FLK_NLM_UNKNOWN;
654 }
655 }
656 }
657
658 /*
659 * Zone constructor/destructor callbacks to be executed when a zone is
660 * created/destroyed.
661 */
662 /* ARGSUSED */
663 void *
664 flk_zone_init(zoneid_t zoneid)
665 {
666 struct flock_globals *fg;
667 uint_t i;
668
669 fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
670 fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
671 for (i = 0; i < HASH_SIZE; i++)
672 fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
673 return (fg);
674 }
675
676 /* ARGSUSED */
677 void
678 flk_zone_fini(zoneid_t zoneid, void *data)
679 {
680 struct flock_globals *fg = data;
681
682 kmem_free(fg, sizeof (*fg));
683 }
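
/*
 * Editor's note: the per-zone hooks above are expected to be registered
 * with the zone subsystem elsewhere, roughly as sketched below (#if 0);
 * the actual registration site is outside this file.
 */
#if 0
	zone_key_create(&flock_zone_key, flk_zone_init, NULL, flk_zone_fini);
#endif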
684
685 /*
686 * Get a lock_descriptor structure with initialization of edge lists.
687 */
688
689 static lock_descriptor_t *
690 flk_get_lock(void)
691 {
692 lock_descriptor_t *l;
693
694 l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
695
696 cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
697 l->l_edge.edge_in_next = &l->l_edge;
698 l->l_edge.edge_in_prev = &l->l_edge;
699 l->l_edge.edge_adj_next = &l->l_edge;
700 l->l_edge.edge_adj_prev = &l->l_edge;
701 l->pvertex = -1;
702 l->l_status = FLK_INITIAL_STATE;
703 flk_lock_allocs++;
704 return (l);
705 }
706
707 /*
708 * Free a lock_descriptor structure. If some thread still holds a
709 * reference to it (as in reclock()), just set the DELETED_LOCK flag.
710 */
711
712 void
713 flk_free_lock(lock_descriptor_t *lock)
714 {
715 ASSERT(IS_DEAD(lock));
716 if (IS_REFERENCED(lock)) {
717 lock->l_state |= DELETED_LOCK;
718 return;
719 }
720 flk_lock_frees++;
721 kmem_free((void *)lock, sizeof (lock_descriptor_t));
722 }
723
724 void
725 flk_set_state(lock_descriptor_t *lock, int new_state)
726 {
727 /*
728 * Locks in the sleeping list may be woken up in a number of ways,
729 * and more than once. If a sleeping lock is signaled awake more
730 * than once, then it may or may not change state depending on its
731 * current state.
732 * Also note that NLM locks that are sleeping could be moved to an
733 * interrupted state more than once if the unlock request is
734 * retransmitted by the NLM client - the second time around, this is
735 * just a nop.
736 * The ordering of being signaled awake is:
737 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
738 * The checks below implement this ordering.
739 */
740 if (IS_INTERRUPTED(lock)) {
741 if ((new_state == FLK_CANCELLED_STATE) ||
742 (new_state == FLK_GRANTED_STATE) ||
743 (new_state == FLK_INTERRUPTED_STATE)) {
744 return;
745 }
746 }
747 if (IS_CANCELLED(lock)) {
748 if ((new_state == FLK_GRANTED_STATE) ||
749 (new_state == FLK_CANCELLED_STATE)) {
750 return;
751 }
752 }
753 CHECK_LOCK_TRANSITION(lock->l_status, new_state);
754 if (IS_PXFS(lock)) {
755 cl_flk_state_transition_notify(lock, lock->l_status, new_state);
756 }
757 lock->l_status = new_state;
758 }
759
760 /*
761 * Routine that checks whether there are any blocking locks in the system.
762 *
763 * The policy followed is: if a write lock is sleeping, we don't allow read
764 * locks before this write lock, even though there may not be any active
765 * locks corresponding to the read locks' region.
766 *
767 * The flk_add_edge() function adds an edge between l1 and l2 iff there
768 * is no path between l1 and l2. This is done to have a "minimum
769 * storage representation" of the dependency graph.
770 *
771 * Another property of the graph: since only the new request throws
772 * edges to the existing locks in the graph, the graph is always topologically
773 * ordered.
774 */
775
776 static int
777 flk_process_request(lock_descriptor_t *request)
778 {
779 graph_t *gp = request->l_graph;
780 lock_descriptor_t *lock;
781 int request_blocked_by_active = 0;
782 int request_blocked_by_granted = 0;
783 int request_blocked_by_sleeping = 0;
784 vnode_t *vp = request->l_vnode;
785 int error = 0;
786 int request_will_wait = 0;
787 int found_covering_lock = 0;
788 lock_descriptor_t *covered_by = NULL;
789
790 ASSERT(MUTEX_HELD(&gp->gp_mutex));
791 request_will_wait = IS_WILLING_TO_SLEEP(request);
792
793 /*
794 * check active locks
795 */
796
797 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
798
799
800 if (lock) {
801 do {
802 if (BLOCKS(lock, request)) {
803 if (!request_will_wait)
804 return (EAGAIN);
805 request_blocked_by_active = 1;
806 break;
807 }
808 /*
809 * Grant the request if the same owner already holds an active
810 * lock that covers it (and the request is a read lock).
811 */
812
813 if (SAME_OWNER(lock, request) &&
814 COVERS(lock, request) &&
815 (request->l_type == F_RDLCK))
816 return (flk_execute_request(request));
817 lock = lock->l_next;
818 } while (lock->l_vnode == vp);
819 }
820
821 if (!request_blocked_by_active) {
822 lock_descriptor_t *lk[1];
823 lock_descriptor_t *first_glock = NULL;
824 /*
825 * Shall we grant this?! NO!!
826 * What about those locks that were just granted and still
827 * in sleep queue. Those threads are woken up and so locks
828 * are almost active.
829 */
830 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
831 if (lock) {
832 do {
833 if (BLOCKS(lock, request)) {
834 if (IS_GRANTED(lock)) {
835 request_blocked_by_granted = 1;
836 } else {
837 request_blocked_by_sleeping = 1;
838 }
839 }
840
841 lock = lock->l_next;
842 } while ((lock->l_vnode == vp));
843 first_glock = lock->l_prev;
844 ASSERT(first_glock->l_vnode == vp);
845 }
846
847 if (request_blocked_by_granted)
848 goto block;
849
850 if (!request_blocked_by_sleeping) {
851 /*
852 * If the request isn't going to be blocked by a
853 * sleeping request, we know that it isn't going to
854 * be blocked; we can just execute the request --
855 * without performing costly deadlock detection.
856 */
857 ASSERT(!request_blocked_by_active);
858 return (flk_execute_request(request));
859 } else if (request->l_type == F_RDLCK) {
860 /*
861 * If we have a sleeping writer in the requested
862 * lock's range, block.
863 */
864 goto block;
865 }
866
867 lk[0] = request;
868 request->l_state |= RECOMPUTE_LOCK;
869 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
870 if (lock) {
871 do {
872 flk_recompute_dependencies(lock, lk, 1, 0);
873 lock = lock->l_next;
874 } while (lock->l_vnode == vp);
875 }
876 lock = first_glock;
877 if (lock) {
878 do {
879 if (IS_GRANTED(lock)) {
880 flk_recompute_dependencies(lock, lk, 1, 0);
881 }
882 lock = lock->l_prev;
883 } while ((lock->l_vnode == vp));
884 }
885 request->l_state &= ~RECOMPUTE_LOCK;
886 if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
887 return (EDEADLK);
888 return (flk_execute_request(request));
889 }
890
891 block:
892 if (request_will_wait)
893 flk_graph_uncolor(gp);
894
895 /* check sleeping locks */
896
897 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
898
899 /*
900 * If we find a sleeping write lock that is a superset of the
901 * region wanted by the request, we can be assured that by adding an
902 * edge to this write lock we have paths to all locks in the
903 * graph that block the request, except in one case, which is why
904 * there is another check for SAME_OWNER in the loop below. The
905 * exception is when the process that owns the sleeping write lock 'l1'
906 * has other locks l2, l3, l4 in the system that arrived
907 * before l1. l1 does not have a path to these locks, as they are from
908 * the same process. We break when we find a second covering sleeping
909 * lock l5 owned by a process different from the one owning l1, because
910 * none of l2, l3, l4, etc. can have arrived before l5; if one had,
911 * l1 would already have produced a deadlock.
912 */
913
914 if (lock) {
915 do {
916 if (BLOCKS(lock, request)) {
917 if (!request_will_wait)
918 return (EAGAIN);
919 if (COVERS(lock, request) &&
920 lock->l_type == F_WRLCK) {
921 if (found_covering_lock &&
922 !SAME_OWNER(lock, covered_by)) {
923 found_covering_lock++;
924 break;
925 }
926 found_covering_lock = 1;
927 covered_by = lock;
928 }
929 if (found_covering_lock &&
930 !SAME_OWNER(lock, covered_by)) {
931 lock = lock->l_next;
932 continue;
933 }
934 if ((error = flk_add_edge(request, lock,
935 !found_covering_lock, 0)))
936 return (error);
937 }
938 lock = lock->l_next;
939 } while (lock->l_vnode == vp);
940 }
941
942 /*
943 * found_covering_lock == 2 iff at this point 'request' has paths
944 * to all locks that block 'request'. found_covering_lock == 1 iff at this
945 * point 'request' has paths to all locks that block 'request' whose owners
946 * are not the same as the one that covers 'request' (covered_by above), and
947 * the active list can contain locks whose owner is the same as covered_by.
948 */
949
950 if (request_blocked_by_active && found_covering_lock != 2) {
951 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
952 ASSERT(lock != NULL);
953 do {
954 if (BLOCKS(lock, request)) {
955 if (found_covering_lock &&
956 !SAME_OWNER(lock, covered_by)) {
957 lock = lock->l_next;
958 continue;
959 }
960 if ((error = flk_add_edge(request, lock,
961 CHECK_CYCLE, 0)))
962 return (error);
963 }
964 lock = lock->l_next;
965 } while (lock->l_vnode == vp);
966 }
967
968 if (NOT_BLOCKED(request)) {
969 /*
970 * request not dependent on any other locks
971 * so execute this request
972 */
973 return (flk_execute_request(request));
974 } else {
975 /*
976 * check for deadlock
977 */
978 if (flk_check_deadlock(request))
979 return (EDEADLK);
980 /*
981 * this thread has to sleep
982 */
983 return (flk_wait_execute_request(request));
984 }
985 }
986
987 /* ONC_PLUS EXTRACT START */
988 /*
989 * The actual execution of the request in the simple case is only to
990 * insert the 'request' in the list of active locks if it is not an
991 * UNLOCK.
992 * We have to consider the existing active locks' relation to
993 * this 'request' if they are owned by the same process. flk_relation()
994 * does this job and ensures that the dependency graph information is
995 * maintained properly.
996 */
997
998 int
999 flk_execute_request(lock_descriptor_t *request)
1000 {
1001 graph_t *gp = request->l_graph;
1002 vnode_t *vp = request->l_vnode;
1003 lock_descriptor_t *lock, *lock1;
1004 int done_searching = 0;
1005
1006 CHECK_SLEEPING_LOCKS(gp);
1007 CHECK_ACTIVE_LOCKS(gp);
1008
1009 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1010
1011 flk_set_state(request, FLK_START_STATE);
1012
1013 ASSERT(NOT_BLOCKED(request));
1014
1015 /* IO_LOCK requests are only to check status */
1016
1017 if (IS_IO_LOCK(request))
1018 return (0);
1019
1020 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1021
1022 if (lock == NULL && request->l_type == F_UNLCK)
1023 return (0);
1024 if (lock == NULL) {
1025 flk_insert_active_lock(request);
1026 return (0);
1027 }
1028
1029 do {
1030 lock1 = lock->l_next;
1031 if (SAME_OWNER(request, lock)) {
1032 done_searching = flk_relation(lock, request);
1033 }
1034 lock = lock1;
1035 } while (lock->l_vnode == vp && !done_searching);
1036
1037 /*
1038 * insert in active queue
1039 */
1040
1041 if (request->l_type != F_UNLCK)
1042 flk_insert_active_lock(request);
1043
1044 return (0);
1045 }
1046 /* ONC_PLUS EXTRACT END */
1047
1048 /*
1049 * 'request' is blocked by someone, so we put it on the sleep queue.
1050 */
1051 static int
1052 flk_wait_execute_request(lock_descriptor_t *request)
1053 {
1054 graph_t *gp = request->l_graph;
1055 callb_cpr_t *cprp; /* CPR info from callback */
1056 struct flock_globals *fg;
1057 int index;
1058
1059 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1060 ASSERT(IS_WILLING_TO_SLEEP(request));
1061
1062 flk_insert_sleeping_lock(request);
1063
1064 if (IS_LOCKMGR(request)) {
1065 index = HASH_INDEX(request->l_vnode);
1066 fg = flk_get_globals();
1067
1068 if (nlm_status_size == 0) { /* not booted as a cluster */
1069 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1070 flk_cancel_sleeping_lock(request, 1);
1071 return (ENOLCK);
1072 }
1073 } else { /* booted as a cluster */
1074 /*
1075 * If the request is an NLM server lock request,
1076 * and the NLM state of the lock request is not
1077 * NLM_UP (because the NLM server is shutting
1078 * down), then cancel the sleeping lock and
1079 * return error ENOLCK that will encourage the
1080 * client to retransmit.
1081 */
1082 if (!IS_NLM_UP(request)) {
1083 flk_cancel_sleeping_lock(request, 1);
1084 return (ENOLCK);
1085 }
1086 }
1087 }
1088
1089 /* Clustering: For blocking PXFS locks, return */
1090 if (IS_PXFS(request)) {
1091 /*
1092 * PXFS locks sleep on the client side.
1093 * The callback argument is used to wake up the sleeper
1094 * when the lock is granted.
1095 * We return -1 (rather than an errno value) to indicate
1096 * the client side should sleep
1097 */
1098 return (PXFS_LOCK_BLOCKED);
1099 }
1100
1101 if (request->l_callbacks != NULL) {
1102 /*
1103 * To make sure the shutdown code works correctly, either
1104 * the callback must happen after putting the lock on the
1105 * sleep list, or we must check the shutdown status after
1106 * returning from the callback (and before sleeping). At
1107 * least for now, we'll use the first option. If a
1108 * shutdown or signal or whatever happened while the graph
1109 * mutex was dropped, that will be detected by
1110 * wait_for_lock().
1111 */
1112 mutex_exit(&gp->gp_mutex);
1113
1114 cprp = flk_invoke_callbacks(request->l_callbacks,
1115 FLK_BEFORE_SLEEP);
1116
1117 mutex_enter(&gp->gp_mutex);
1118
1119 if (cprp == NULL) {
1120 wait_for_lock(request);
1121 } else {
1122 mutex_enter(cprp->cc_lockp);
1123 CALLB_CPR_SAFE_BEGIN(cprp);
1124 mutex_exit(cprp->cc_lockp);
1125 wait_for_lock(request);
1126 mutex_enter(cprp->cc_lockp);
1127 CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1128 mutex_exit(cprp->cc_lockp);
1129 }
1130
1131 mutex_exit(&gp->gp_mutex);
1132 (void) flk_invoke_callbacks(request->l_callbacks,
1133 FLK_AFTER_SLEEP);
1134 mutex_enter(&gp->gp_mutex);
1135 } else {
1136 wait_for_lock(request);
1137 }
1138
1139 if (IS_LOCKMGR(request)) {
1140 /*
1141 * If the lock manager is shutting down, return an
1142 * error that will encourage the client to retransmit.
1143 */
1144 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1145 !IS_GRANTED(request)) {
1146 flk_cancel_sleeping_lock(request, 1);
1147 return (ENOLCK);
1148 }
1149 }
1150
1151 if (IS_INTERRUPTED(request)) {
1152 /* we got a signal, or act like we did */
1153 flk_cancel_sleeping_lock(request, 1);
1154 return (EINTR);
1155 }
1156
1157 /* Cancelled if some other thread has closed the file */
1158
1159 if (IS_CANCELLED(request)) {
1160 flk_cancel_sleeping_lock(request, 1);
1161 return (EBADF);
1162 }
1163
1164 request->l_state &= ~GRANTED_LOCK;
1165 REMOVE_SLEEP_QUEUE(request);
1166 return (flk_execute_request(request));
1167 }
1168
1169 /*
1170 * This routine adds an edge between from and to because from depends
1171 * on to. If asked to check for deadlock, it checks whether there are any
1172 * locks reachable from "from_lock" that are owned by the same process
1173 * as "from_lock".
1174 * NOTE: It is the caller's responsibility to make sure that the color
1175 * of the graph is consistent between the calls to flk_add_edge as done
1176 * in flk_process_request. This routine does not color and check for
1177 * deadlock explicitly.
1178 */
1179
1180 static int
1181 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1182 int check_cycle, int update_graph)
1183 {
1184 edge_t *edge;
1185 edge_t *ep;
1186 lock_descriptor_t *vertex;
1187 lock_descriptor_t *vertex_stack;
1188
1189 STACK_INIT(vertex_stack);
1190
1191 /*
1192 * If the to vertex is already colored, just return without
1193 * adding an edge, since it is already reachable from the
1194 * from vertex.
1195 */
1196
1197 if (COLORED(to_lock))
1198 return (0);
1199
1200 edge = flk_get_edge();
1201
1202 /*
1203 * set the from and to vertex
1204 */
1205
1206 edge->from_vertex = from_lock;
1207 edge->to_vertex = to_lock;
1208
1209 /*
1210 * put in adjacency list of from vertex
1211 */
1212
1213 from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1214 edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1215 edge->edge_adj_prev = &from_lock->l_edge;
1216 from_lock->l_edge.edge_adj_next = edge;
1217
1218 /*
1219 * put in the in-list of the to vertex
1220 */
1221
1222 to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1223 edge->edge_in_next = to_lock->l_edge.edge_in_next;
1224 to_lock->l_edge.edge_in_next = edge;
1225 edge->edge_in_prev = &to_lock->l_edge;
1226
1227
1228 if (update_graph) {
1229 flk_update_proc_graph(edge, 0);
1230 return (0);
1231 }
1232 if (!check_cycle) {
1233 return (0);
1234 }
1235
1236 STACK_PUSH(vertex_stack, from_lock, l_stack);
1237
1238 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1239
1240 STACK_POP(vertex_stack, l_stack);
1241
1242 for (ep = FIRST_ADJ(vertex);
1243 ep != HEAD(vertex);
1244 ep = NEXT_ADJ(ep)) {
1245 if (COLORED(ep->to_vertex))
1246 continue;
1247 COLOR(ep->to_vertex);
1248 if (SAME_OWNER(ep->to_vertex, from_lock))
1249 goto dead_lock;
1250 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1251 }
1252 }
1253 return (0);
1254
1255 dead_lock:
1256
1257 /*
1258 * remove all edges
1259 */
1260
1261 ep = FIRST_ADJ(from_lock);
1262
1263 while (ep != HEAD(from_lock)) {
1264 IN_LIST_REMOVE(ep);
1265 from_lock->l_sedge = NEXT_ADJ(ep);
1266 ADJ_LIST_REMOVE(ep);
1267 flk_free_edge(ep);
1268 ep = from_lock->l_sedge;
1269 }
1270 return (EDEADLK);
1271 }
1272
1273 /*
1274 * Get an edge structure for representing the dependency between two locks.
1275 */
1276
1277 static edge_t *
1278 flk_get_edge()
1279 {
1280 edge_t *ep;
1281
1282 ASSERT(flk_edge_cache != NULL);
1283
1284 ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1285 edge_allocs++;
1286 return (ep);
1287 }
1288
1289 /*
1290 * Free the edge structure.
1291 */
1292
1293 static void
1294 flk_free_edge(edge_t *ep)
1295 {
1296 edge_frees++;
1297 kmem_cache_free(flk_edge_cache, (void *)ep);
1298 }
1299
1300 /*
1301 * Check the relationship of request with lock and perform the
1302 * recomputation of dependencies, break the lock if required, and return
1303 * 1 if the request cannot have any further relationship with the
1304 * remaining active locks.
1305 * The 'lock' and 'request' are compared, and in case of overlap we
1306 * delete the 'lock' and form new locks to represent the non-overlapping
1307 * portion of the original 'lock'. This function has side effects:
1308 * 'lock' will be freed and new locks will be added to the active list.
1309 */
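
/*
 * Editor's illustration (not from the original source): suppose an active
 * read lock held by a process covers bytes [100, 199] and the same owner
 * now requests a write lock on [120, 149].  lock_effect is FLK_UPGRADE,
 * and since the request lies strictly inside the lock, the old lock is
 * split into lock1 = [100, 119] and lock2 = [150, 199]; topology[] becomes
 * { lock1, lock2, request } with nvertex = 3, dependencies are recomputed
 * over those three vertices, and the pieces are re-inserted into the
 * active list while the original lock is deleted and freed.
 */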
1310
1311 static int
1312 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1313 {
1314 int lock_effect;
1315 lock_descriptor_t *lock1, *lock2;
1316 lock_descriptor_t *topology[3];
1317 int nvertex = 0;
1318 int i;
1319 edge_t *ep;
1320 graph_t *gp = (lock->l_graph);
1321
1322
1323 CHECK_SLEEPING_LOCKS(gp);
1324 CHECK_ACTIVE_LOCKS(gp);
1325
1326 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1327
1328 topology[0] = topology[1] = topology[2] = NULL;
1329
1330 if (request->l_type == F_UNLCK)
1331 lock_effect = FLK_UNLOCK;
1332 else if (request->l_type == F_RDLCK &&
1333 lock->l_type == F_WRLCK)
1334 lock_effect = FLK_DOWNGRADE;
1335 else if (request->l_type == F_WRLCK &&
1336 lock->l_type == F_RDLCK)
1337 lock_effect = FLK_UPGRADE;
1338 else
1339 lock_effect = FLK_STAY_SAME;
1340
1341 if (lock->l_end < request->l_start) {
1342 if (lock->l_end == request->l_start - 1 &&
1343 lock_effect == FLK_STAY_SAME) {
1344 topology[0] = request;
1345 request->l_start = lock->l_start;
1346 nvertex = 1;
1347 goto recompute;
1348 } else {
1349 return (0);
1350 }
1351 }
1352
1353 if (lock->l_start > request->l_end) {
1354 if (request->l_end == lock->l_start - 1 &&
1355 lock_effect == FLK_STAY_SAME) {
1356 topology[0] = request;
1357 request->l_end = lock->l_end;
1358 nvertex = 1;
1359 goto recompute;
1360 } else {
1361 return (1);
1362 }
1363 }
1364
1365 if (request->l_end < lock->l_end) {
1366 if (request->l_start > lock->l_start) {
1367 if (lock_effect == FLK_STAY_SAME) {
1368 request->l_start = lock->l_start;
1369 request->l_end = lock->l_end;
1370 topology[0] = request;
1371 nvertex = 1;
1372 } else {
1373 lock1 = flk_get_lock();
1374 lock2 = flk_get_lock();
1375 COPY(lock1, lock);
1376 COPY(lock2, lock);
1377 lock1->l_start = lock->l_start;
1378 lock1->l_end = request->l_start - 1;
1379 lock2->l_start = request->l_end + 1;
1380 lock2->l_end = lock->l_end;
1381 topology[0] = lock1;
1382 topology[1] = lock2;
1383 topology[2] = request;
1384 nvertex = 3;
1385 }
1386 } else if (request->l_start < lock->l_start) {
1387 if (lock_effect == FLK_STAY_SAME) {
1388 request->l_end = lock->l_end;
1389 topology[0] = request;
1390 nvertex = 1;
1391 } else {
1392 lock1 = flk_get_lock();
1393 COPY(lock1, lock);
1394 lock1->l_start = request->l_end + 1;
1395 topology[0] = lock1;
1396 topology[1] = request;
1397 nvertex = 2;
1398 }
1399 } else {
1400 if (lock_effect == FLK_STAY_SAME) {
1401 request->l_start = lock->l_start;
1402 request->l_end = lock->l_end;
1403 topology[0] = request;
1404 nvertex = 1;
1405 } else {
1406 lock1 = flk_get_lock();
1407 COPY(lock1, lock);
1408 lock1->l_start = request->l_end + 1;
1409 topology[0] = lock1;
1410 topology[1] = request;
1411 nvertex = 2;
1412 }
1413 }
1414 } else if (request->l_end > lock->l_end) {
1415 if (request->l_start > lock->l_start) {
1416 if (lock_effect == FLK_STAY_SAME) {
1417 request->l_start = lock->l_start;
1418 topology[0] = request;
1419 nvertex = 1;
1420 } else {
1421 lock1 = flk_get_lock();
1422 COPY(lock1, lock);
1423 lock1->l_end = request->l_start - 1;
1424 topology[0] = lock1;
1425 topology[1] = request;
1426 nvertex = 2;
1427 }
1428 } else if (request->l_start < lock->l_start) {
1429 topology[0] = request;
1430 nvertex = 1;
1431 } else {
1432 topology[0] = request;
1433 nvertex = 1;
1434 }
1435 } else {
1436 if (request->l_start > lock->l_start) {
1437 if (lock_effect == FLK_STAY_SAME) {
1438 request->l_start = lock->l_start;
1439 topology[0] = request;
1440 nvertex = 1;
1441 } else {
1442 lock1 = flk_get_lock();
1443 COPY(lock1, lock);
1444 lock1->l_end = request->l_start - 1;
1445 topology[0] = lock1;
1446 topology[1] = request;
1447 nvertex = 2;
1448 }
1449 } else if (request->l_start < lock->l_start) {
1450 topology[0] = request;
1451 nvertex = 1;
1452 } else {
1453 if (lock_effect != FLK_UNLOCK) {
1454 topology[0] = request;
1455 nvertex = 1;
1456 } else {
1457 flk_delete_active_lock(lock, 0);
1458 flk_wakeup(lock, 1);
1459 flk_free_lock(lock);
1460 CHECK_SLEEPING_LOCKS(gp);
1461 CHECK_ACTIVE_LOCKS(gp);
1462 return (1);
1463 }
1464 }
1465 }
1466
1467 recompute:
1468
1469 /*
1470 * For unlock we don't send the 'request' for recomputing
1471 * dependencies because no lock will add an edge to it.
1472 */
1473
1474 if (lock_effect == FLK_UNLOCK) {
1475 topology[nvertex-1] = NULL;
1476 nvertex--;
1477 }
1478 for (i = 0; i < nvertex; i++) {
1479 topology[i]->l_state |= RECOMPUTE_LOCK;
1480 topology[i]->l_color = NO_COLOR;
1481 }
1482
1483 ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1484
1485 /*
1486 * we remove from their from-vertices' adjacency lists all edges
1487 * coming into this vertex 'lock'.
1488 */
1489
1490 ep = FIRST_IN(lock);
1491 while (ep != HEAD(lock)) {
1492 ADJ_LIST_REMOVE(ep);
1493 ep = NEXT_IN(ep);
1494 }
1495
1496 flk_delete_active_lock(lock, 0);
1497
1498 /* We are ready for recomputing the dependencies now */
1499
1500 flk_recompute_dependencies(lock, topology, nvertex, 1);
1501
1502 for (i = 0; i < nvertex; i++) {
1503 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1504 topology[i]->l_color = NO_COLOR;
1505 }
1506
1507
1508 if (lock_effect == FLK_UNLOCK) {
1509 nvertex++;
1510 }
1511 for (i = 0; i < nvertex - 1; i++) {
1512 flk_insert_active_lock(topology[i]);
1513 }
1514
1515
1516 if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1517 flk_wakeup(lock, 0);
1518 } else {
1519 ep = FIRST_IN(lock);
1520 while (ep != HEAD(lock)) {
1521 lock->l_sedge = NEXT_IN(ep);
1522 IN_LIST_REMOVE(ep);
1523 flk_update_proc_graph(ep, 1);
1524 flk_free_edge(ep);
1525 ep = lock->l_sedge;
1526 }
1527 }
1528 flk_free_lock(lock);
1529
1530 CHECK_SLEEPING_LOCKS(gp);
1531 CHECK_ACTIVE_LOCKS(gp);
1532 return (0);
1533 }
1534
1535 /*
1536 * Insert a lock into the active queue.
1537 */
1538
1539 static void
1540 flk_insert_active_lock(lock_descriptor_t *new_lock)
1541 {
1542 graph_t *gp = new_lock->l_graph;
1543 vnode_t *vp = new_lock->l_vnode;
1544 lock_descriptor_t *first_lock, *lock;
1545
1546 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1547
1548 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1549 first_lock = lock;
1550
1551 if (first_lock != NULL) {
1552 for (; (lock->l_vnode == vp &&
1553 lock->l_start < new_lock->l_start); lock = lock->l_next)
1554 ;
1555 } else {
1556 lock = ACTIVE_HEAD(gp);
1557 }
1558
1559 lock->l_prev->l_next = new_lock;
1560 new_lock->l_next = lock;
1561 new_lock->l_prev = lock->l_prev;
1562 lock->l_prev = new_lock;
1563
1564 if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
1565 vp->v_filocks = (struct filock *)new_lock;
1566 }
1567 flk_set_state(new_lock, FLK_ACTIVE_STATE);
1568 new_lock->l_state |= ACTIVE_LOCK;
1569
1570 CHECK_ACTIVE_LOCKS(gp);
1571 CHECK_SLEEPING_LOCKS(gp);
1572 }
1573
1574 /*
1575 * Delete the active lock: performs one of two functions depending on
1576 * the value of the second parameter. One is to remove it from the
1577 * active lists only; the other is to both remove and free the lock.
1578 */
1579
1580 static void
1581 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1582 {
1583 vnode_t *vp = lock->l_vnode;
1584 graph_t *gp = lock->l_graph;
1585
1586 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1587 if (free_lock)
1588 ASSERT(NO_DEPENDENTS(lock));
1589 ASSERT(NOT_BLOCKED(lock));
1590 ASSERT(IS_ACTIVE(lock));
1591
1592 ASSERT((vp->v_filocks != NULL));
1593
1594 if (vp->v_filocks == (struct filock *)lock) {
1595 vp->v_filocks = (struct filock *)
1596 ((lock->l_next->l_vnode == vp) ? lock->l_next :
1597 NULL);
1598 }
1599 lock->l_next->l_prev = lock->l_prev;
1600 lock->l_prev->l_next = lock->l_next;
1601 lock->l_next = lock->l_prev = NULL;
1602 flk_set_state(lock, FLK_DEAD_STATE);
1603 lock->l_state &= ~ACTIVE_LOCK;
1604
1605 if (free_lock)
1606 flk_free_lock(lock);
1607 CHECK_ACTIVE_LOCKS(gp);
1608 CHECK_SLEEPING_LOCKS(gp);
1609 }
1610
1611 /*
1612 * Insert into the sleep queue.
1613 */
1614
1615 static void
1616 flk_insert_sleeping_lock(lock_descriptor_t *request)
1617 {
1618 graph_t *gp = request->l_graph;
1619 vnode_t *vp = request->l_vnode;
1620 lock_descriptor_t *lock;
1621
1622 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1623 ASSERT(IS_INITIAL(request));
1624
1625 for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
1626 lock->l_vnode < vp); lock = lock->l_next)
1627 ;
1628
1629 lock->l_prev->l_next = request;
1630 request->l_prev = lock->l_prev;
1631 lock->l_prev = request;
1632 request->l_next = lock;
1633 flk_set_state(request, FLK_SLEEPING_STATE);
1634 request->l_state |= SLEEPING_LOCK;
1635 }
1636
1637 /*
1638 * Cancelling a sleeping lock implies removing a vertex from the
1639 * dependency graph and therefore we should recompute the dependencies
1640 * of all vertices that have a path to this vertex, w.r.t. all
1641 * vertices reachable from this vertex.
1642 */
1643
1644 void
1645 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
1646 {
1647 graph_t *gp = request->l_graph;
1648 vnode_t *vp = request->l_vnode;
1649 lock_descriptor_t **topology = NULL;
1650 edge_t *ep;
1651 lock_descriptor_t *vertex, *lock;
1652 int nvertex = 0;
1653 int i;
1654 lock_descriptor_t *vertex_stack;
1655
1656 STACK_INIT(vertex_stack);
1657
1658 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1659 /*
1660 * count the number of vertex pointers that have to be allocated,
1661 * i.e., all vertices that are reachable from request.
1662 */
1663
1664 STACK_PUSH(vertex_stack, request, l_stack);
1665
1666 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1667 STACK_POP(vertex_stack, l_stack);
1668 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
1669 ep = NEXT_ADJ(ep)) {
1670 if (IS_RECOMPUTE(ep->to_vertex))
1671 continue;
1672 ep->to_vertex->l_state |= RECOMPUTE_LOCK;
1673 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1674 nvertex++;
1675 }
1676 }
1677
1678 /*
1679 * allocate memory for holding the vertex pointers
1680 */
1681
1682 if (nvertex) {
1683 topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
1684 KM_SLEEP);
1685 }
1686
1687 /*
1688 * one more pass to actually store the vertices in the
1689 * allocated array.
1690 * We first check sleeping locks and then active locks
1691 * so that topology array will be in a topological
1692 * order.
1693 */
1694
1695 nvertex = 0;
1696 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1697
1698 if (lock) {
1699 do {
1700 if (IS_RECOMPUTE(lock)) {
1701 lock->l_index = nvertex;
1702 topology[nvertex++] = lock;
1703 }
1704 lock->l_color = NO_COLOR;
1705 lock = lock->l_next;
1706 } while (lock->l_vnode == vp);
1707 }
1708
1709 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1710
1711 if (lock) {
1712 do {
1713 if (IS_RECOMPUTE(lock)) {
1714 lock->l_index = nvertex;
1715 topology[nvertex++] = lock;
1716 }
1717 lock->l_color = NO_COLOR;
1718 lock = lock->l_next;
1719 } while (lock->l_vnode == vp);
1720 }
1721
1722 /*
1723 * remove in and out edges of request
1724 * They are freed after updating proc_graph below.
1725 */
1726
1727 for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
1728 ADJ_LIST_REMOVE(ep);
1729 }
1730
1731
1732 if (remove_from_queue)
1733 REMOVE_SLEEP_QUEUE(request);
1734
1735 /* we are ready to recompute */
1736
1737 flk_recompute_dependencies(request, topology, nvertex, 1);
1738
1739 ep = FIRST_ADJ(request);
1740 while (ep != HEAD(request)) {
1741 IN_LIST_REMOVE(ep);
1742 request->l_sedge = NEXT_ADJ(ep);
1743 ADJ_LIST_REMOVE(ep);
1744 flk_update_proc_graph(ep, 1);
1745 flk_free_edge(ep);
1746 ep = request->l_sedge;
1747 }
1748
1749
1750 /*
1751 * unset the RECOMPUTE flag in those vertices
1752 */
1753
1754 for (i = 0; i < nvertex; i++) {
1755 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1756 }
1757
1758 /*
1759 * free the topology
1760 */
1761 if (nvertex)
1762 kmem_free((void *)topology,
1763 (nvertex * sizeof (lock_descriptor_t *)));
1764 /*
1765 * Possibility of some locks unblocked now
1766 */
1767
1768 flk_wakeup(request, 0);
1769
1770 /*
1771 * we expect to have a correctly recomputed graph now.
1772 */
1773 flk_set_state(request, FLK_DEAD_STATE);
1774 flk_free_lock(request);
1775 CHECK_SLEEPING_LOCKS(gp);
1776 CHECK_ACTIVE_LOCKS(gp);
1777
1778 }
1779
1780 /*
1781 * Uncoloring the graph is simply a matter of incrementing its mark value;
1782 * only when wraparound takes place do we explicitly reset the color of
1783 * all vertices in the graph.
1784 */
1785
1786 static void
1787 flk_graph_uncolor(graph_t *gp)
1788 {
1789 lock_descriptor_t *lock;
1790
1791 if (gp->mark == UINT_MAX) {
1792 gp->mark = 1;
1793 for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
1794 lock = lock->l_next)
1795 lock->l_color = 0;
1796
1797 for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp);
1798 lock = lock->l_next)
1799 lock->l_color = 0;
1800 } else {
1801 gp->mark++;
1802 }
1803 }
1804
1805 /*
1806 * Wake up locks that are blocked on the given lock.
1807 */
1808
1809 static void
1810 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
1811 {
1812 edge_t *ep;
1813 graph_t *gp = lock->l_graph;
1814 lock_descriptor_t *lck;
1815
1816 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1817 if (NO_DEPENDENTS(lock))
1818 return;
1819 ep = FIRST_IN(lock);
1820 do {
1821 /*
1822 * delete the edge from the adjacency list
1823 * of from vertex. if no more adjacent edges
1824 * for this vertex wake this process.
1825 */
1826 lck = ep->from_vertex;
1827 if (adj_list_remove)
1828 ADJ_LIST_REMOVE(ep);
1829 flk_update_proc_graph(ep, 1);
1830 if (NOT_BLOCKED(lck)) {
1831 GRANT_WAKEUP(lck);
1832 }
1833 lock->l_sedge = NEXT_IN(ep);
1834 IN_LIST_REMOVE(ep);
1835 flk_free_edge(ep);
1836 ep = lock->l_sedge;
1837 } while (ep != HEAD(lock));
1838 ASSERT(NO_DEPENDENTS(lock));
1839 }
1840
1841 /*
1842 * The dependents of request are checked for their dependency against the
1843 * locks in topology (called topology because the array is, and should be, in
1844 * topological order for this algorithm; if it is not in topological order the
1845 * inner loop below might add more edges than necessary. Topological ordering
1846 * of vertices satisfies the property that all edges go from left to
1847 * right, i.e., topology[i] can have an edge to topology[j] iff i<j).
1848 * If lock l1 in the dependent set of request is dependent on (blocked by)
1849 * lock l2 in topology but does not have a path to it, we add an edge
1850 * in the inner loop below.
1851 *
1852 * We don't want to add an edge between l1 and l2 if there already exists
1853 * a path from l1 to l2, so care has to be taken for those vertices
1854 * that have two paths to 'request'. These vertices are referred to here
1855 * as barrier locks.
1856 *
1857 * The barriers have to be found (those vertices that originally had two paths
1858 * to request) because otherwise we may end up adding edges unnecessarily
1859 * to vertices in topology, and thus a barrier vertex could have both an edge
1860 * to a vertex in topology and a path to it.
1861 */
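
/*
 * Editor's illustration (not from the original source): suppose sleeping
 * locks A and B both block on 'request' and A also blocks on B, i.e. the
 * edges are A->request, B->request and A->B.  A has two paths to 'request'
 * (direct, and through B), so flk_find_barriers() marks A as a barrier with
 * l_index = 1 (paths minus one).  During the walk below, an early visit to
 * A merely decrements that count; A's own dependencies are recomputed only
 * once every path to it has been seen, so A is never given a direct edge to
 * a topology lock it already reaches through B.
 */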
1862
1863 static void
1864 flk_recompute_dependencies(lock_descriptor_t *request,
1865 lock_descriptor_t **topology,
1866 int nvertex, int update_graph)
1867 {
1868 lock_descriptor_t *vertex, *lock;
1869 graph_t *gp = request->l_graph;
1870 int i, count;
1871 int barrier_found = 0;
1872 edge_t *ep;
1873 lock_descriptor_t *vertex_stack;
1874
1875 STACK_INIT(vertex_stack);
1876
1877 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1878 if (nvertex == 0)
1879 return;
1880 flk_graph_uncolor(request->l_graph);
1881 barrier_found = flk_find_barriers(request);
1882 request->l_state |= RECOMPUTE_DONE;
1883
1884 STACK_PUSH(vertex_stack, request, l_stack);
1885 request->l_sedge = FIRST_IN(request);
1886
1887
1888 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1889 if (vertex->l_state & RECOMPUTE_DONE) {
1890 count = 0;
1891 goto next_in_edge;
1892 }
1893 if (IS_BARRIER(vertex)) {
1894 /* decrement the barrier count */
1895 if (vertex->l_index) {
1896 vertex->l_index--;
1897 /* this guy will be pushed again anyway ? */
1898 STACK_POP(vertex_stack, l_stack);
1899 if (vertex->l_index == 0) {
1900 /*
1901 * barrier is over we can recompute
1902 * dependencies for this lock in the
1903 * next stack pop
1904 */
1905 vertex->l_state &= ~BARRIER_LOCK;
1906 }
1907 continue;
1908 }
1909 }
1910 vertex->l_state |= RECOMPUTE_DONE;
1911 flk_graph_uncolor(gp);
1912 count = flk_color_reachables(vertex);
1913 for (i = 0; i < nvertex; i++) {
1914 lock = topology[i];
1915 if (COLORED(lock))
1916 continue;
1917 if (BLOCKS(lock, vertex)) {
1918 (void) flk_add_edge(vertex, lock,
1919 NO_CHECK_CYCLE, update_graph);
1920 COLOR(lock);
1921 count++;
1922 count += flk_color_reachables(lock);
1923 }
1924
1925 }
1926
1927 next_in_edge:
1928 if (count == nvertex ||
1929 vertex->l_sedge == HEAD(vertex)) {
1930 /* prune the tree below this */
1931 STACK_POP(vertex_stack, l_stack);
1932 vertex->l_state &= ~RECOMPUTE_DONE;
1933 /* update the barrier locks below this! */
1934 if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
1935 flk_graph_uncolor(gp);
1936 flk_update_barriers(vertex);
1937 }
1938 continue;
1939 }
1940
1941 ep = vertex->l_sedge;
1942 lock = ep->from_vertex;
1943 STACK_PUSH(vertex_stack, lock, l_stack);
1944 lock->l_sedge = FIRST_IN(lock);
1945 vertex->l_sedge = NEXT_IN(ep);
1946 }
1947
1948 }
1949
1950 /*
1951  * Color all vertices reachable from 'vertex' that belong to the topology
1952  * (here, those that have RECOMPUTE_LOCK set in their state) and are not
1953  * yet colored; return the number of such topology vertices colored.
1954  *
1955  * Note: we use a different stack_link, l_stack1, because this routine is
1956  * called from flk_recompute_dependencies(), which already uses l_stack.
1957 */
1958
1959 static int
1960 flk_color_reachables(lock_descriptor_t *vertex)
1961 {
1962 lock_descriptor_t *ver, *lock;
1963 int count;
1964 edge_t *ep;
1965 lock_descriptor_t *vertex_stack;
1966
1967 STACK_INIT(vertex_stack);
1968
1969 STACK_PUSH(vertex_stack, vertex, l_stack1);
1970 count = 0;
1971 while ((ver = STACK_TOP(vertex_stack)) != NULL) {
1972
1973 STACK_POP(vertex_stack, l_stack1);
1974 for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
1975 ep = NEXT_ADJ(ep)) {
1976 lock = ep->to_vertex;
1977 if (COLORED(lock))
1978 continue;
1979 COLOR(lock);
1980 if (IS_RECOMPUTE(lock))
1981 count++;
1982 STACK_PUSH(vertex_stack, lock, l_stack1);
1983 }
1984
1985 }
1986 return (count);
1987 }
1988
1989 /*
1990  * Called from flk_recompute_dependencies(), this routine decrements
1991  * the barrier count of barrier vertices that are reachable from 'lock'.
1992 */
1993
1994 static void
1995 flk_update_barriers(lock_descriptor_t *lock)
1996 {
1997 lock_descriptor_t *vertex, *lck;
1998 edge_t *ep;
1999 lock_descriptor_t *vertex_stack;
2000
2001 STACK_INIT(vertex_stack);
2002
2003 STACK_PUSH(vertex_stack, lock, l_stack1);
2004
2005 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2006 STACK_POP(vertex_stack, l_stack1);
2007 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2008 ep = NEXT_IN(ep)) {
2009 lck = ep->from_vertex;
2010 if (COLORED(lck)) {
2011 if (IS_BARRIER(lck)) {
2012 ASSERT(lck->l_index > 0);
2013 lck->l_index--;
2014 if (lck->l_index == 0)
2015 lck->l_state &= ~BARRIER_LOCK;
2016 }
2017 continue;
2018 }
2019 COLOR(lck);
2020 if (IS_BARRIER(lck)) {
2021 ASSERT(lck->l_index > 0);
2022 lck->l_index--;
2023 if (lck->l_index == 0)
2024 lck->l_state &= ~BARRIER_LOCK;
2025 }
2026 STACK_PUSH(vertex_stack, lck, l_stack1);
2027 }
2028 }
2029 }
2030
2031 /*
2032  * Find all vertices that are reachable from 'lock' by more than one path,
2033  * mark them as barrier vertices, and set their barrier count; the count
2034  * is one less than the total number of paths from 'lock' to that vertex.
2035  * Returns non-zero if at least one barrier was found.
2036 */
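/*
 * Worked example (editorial note): a vertex reachable from 'lock' along
 * three distinct paths ends up with l_index == 2; it is set to 0 when it
 * is first colored and incremented once for each of the two later visits.
 */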
2037
2038 static int
2039 flk_find_barriers(lock_descriptor_t *lock)
2040 {
2041 lock_descriptor_t *vertex, *lck;
2042 int found = 0;
2043 edge_t *ep;
2044 lock_descriptor_t *vertex_stack;
2045
2046 STACK_INIT(vertex_stack);
2047
2048 STACK_PUSH(vertex_stack, lock, l_stack1);
2049
2050 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2051 STACK_POP(vertex_stack, l_stack1);
2052 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2053 ep = NEXT_IN(ep)) {
2054 lck = ep->from_vertex;
2055 if (COLORED(lck)) {
2056 /* this is a barrier */
2057 lck->l_state |= BARRIER_LOCK;
2058 /* index will have barrier count */
2059 lck->l_index++;
2060 if (!found)
2061 found = 1;
2062 continue;
2063 }
2064 COLOR(lck);
2065 lck->l_index = 0;
2066 STACK_PUSH(vertex_stack, lck, l_stack1);
2067 }
2068 }
2069 return (found);
2070 }
2071
2072 /*
2073 * Finds the first lock that is mainly responsible for blocking this
2074 * request. If there is no such lock, request->l_flock.l_type is set to
2075 * F_UNLCK. Otherwise, request->l_flock is filled in with the particulars
2076 * of the blocking lock.
2077 *
2078 * Note: It is possible a request is blocked by a sleeping lock because
2079 * of the fairness policy used in flk_process_request() to construct the
2080 * dependencies. (see comments before flk_process_request()).
2081 */
2082
2083 static void
2084 flk_get_first_blocking_lock(lock_descriptor_t *request)
2085 {
2086 graph_t *gp = request->l_graph;
2087 vnode_t *vp = request->l_vnode;
2088 lock_descriptor_t *lock, *blocker;
2089
2090 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2091 blocker = NULL;
2092 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2093
2094 if (lock) {
2095 do {
2096 if (BLOCKS(lock, request)) {
2097 blocker = lock;
2098 break;
2099 }
2100 lock = lock->l_next;
2101 } while (lock->l_vnode == vp);
2102 }
2103
2104 if (blocker) {
2105 report_blocker(blocker, request);
2106 } else
2107 request->l_flock.l_type = F_UNLCK;
2108 }
2109
2110 /*
2111 * Get the graph_t structure associated with a vnode.
2112  * If 'initialize' is FLK_INIT_GRAPH and the graph_t structure for this vnode
2113  * has not yet been allocated, a new one is created and returned.
2114 */
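/*
 * Illustrative usage sketch (editorial, not compiled here): a query-only
 * caller passes FLK_USE_GRAPH and treats a NULL return as "no locks can
 * exist for this vnode yet", while a caller about to insert a lock passes
 * FLK_INIT_GRAPH so the graph is created on demand:
 *
 *	graph_t *gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
 *	if (gp == NULL)
 *		return (0);		-- no graph, hence no locks
 *	mutex_enter(&gp->gp_mutex);
 *	-- walk ACTIVE_HEAD(gp) / SLEEPING_HEAD(gp) here --
 *	mutex_exit(&gp->gp_mutex);
 */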
2115 graph_t *
2116 flk_get_lock_graph(vnode_t *vp, int initialize)
2117 {
2118 graph_t *gp;
2119 graph_t *gp_alloc = NULL;
2120 int index = HASH_INDEX(vp);
2121
2122 if (initialize == FLK_USE_GRAPH) {
2123 mutex_enter(&flock_lock);
2124 gp = lock_graph[index];
2125 mutex_exit(&flock_lock);
2126 return (gp);
2127 }
2128
2129 ASSERT(initialize == FLK_INIT_GRAPH);
2130
2131 if (lock_graph[index] == NULL) {
2132
2133 gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2134
2135 /* Initialize the graph */
2136
2137 gp_alloc->active_locks.l_next =
2138 gp_alloc->active_locks.l_prev =
2139 (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2140 gp_alloc->sleeping_locks.l_next =
2141 gp_alloc->sleeping_locks.l_prev =
2142 (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2143 gp_alloc->index = index;
2144 mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2145 }
2146
2147 mutex_enter(&flock_lock);
2148
2149 gp = lock_graph[index];
2150
2151 /* Recheck the value within flock_lock */
2152 if (gp == NULL) {
2153 struct flock_globals *fg;
2154
2155 /* We must have previously allocated the graph_t structure */
2156 ASSERT(gp_alloc != NULL);
2157 lock_graph[index] = gp = gp_alloc;
2158 /*
2159 * The lockmgr status is only needed if KLM is loaded.
2160 */
2161 if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2162 fg = flk_get_globals();
2163 fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2164 }
2165 }
2166
2167 mutex_exit(&flock_lock);
2168
2169 if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2170 /* There was a race to allocate the graph_t and we lost */
2171 mutex_destroy(&gp_alloc->gp_mutex);
2172 kmem_free(gp_alloc, sizeof (graph_t));
2173 }
2174
2175 return (gp);
2176 }
2177
2178 /*
2179 * PSARC case 1997/292
2180 */
2181 int
2182 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2183 {
2184 lock_descriptor_t *lock;
2185 int result = 0;
2186 graph_t *gp;
2187 int lock_nlmid;
2188
2189 /*
2190 * Check to see if node is booted as a cluster. If not, return.
2191 */
2192 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2193 return (0);
2194 }
2195
2196 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2197 if (gp == NULL) {
2198 return (0);
2199 }
2200
2201 mutex_enter(&gp->gp_mutex);
2202
2203 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2204
2205 if (lock) {
2206 while (lock->l_vnode == vp) {
2207 /* get NLM id from sysid */
2208 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2209
2210 /*
2211 * If NLM server request _and_ nlmid of lock matches
2212 * nlmid of argument, then we've found a remote lock.
2213 */
2214 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2215 result = 1;
2216 goto done;
2217 }
2218 lock = lock->l_next;
2219 }
2220 }
2221
2222 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2223
2224 if (lock) {
2225 while (lock->l_vnode == vp) {
2226 /* get NLM id from sysid */
2227 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2228
2229 /*
2230 * If NLM server request _and_ nlmid of lock matches
2231 * nlmid of argument, then we've found a remote lock.
2232 */
2233 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2234 result = 1;
2235 goto done;
2236 }
2237 lock = lock->l_next;
2238 }
2239 }
2240
2241 done:
2242 mutex_exit(&gp->gp_mutex);
2243 return (result);
2244 }
2245
2246 /* ONC_PLUS EXTRACT START */
2247 /*
2248 * Determine whether there are any locks for the given vnode with a remote
2249 * sysid. Returns zero if not, non-zero if there are.
2250 *
2251 * Note that the return value from this function is potentially invalid
2252 * once it has been returned. The caller is responsible for providing its
2253 * own synchronization mechanism to ensure that the return value is useful
2254 * (e.g., see nfs_lockcompletion()).
2255 */
2256 int
2257 flk_has_remote_locks(vnode_t *vp)
2258 {
2259 lock_descriptor_t *lock;
2260 int result = 0;
2261 graph_t *gp;
2262
2263 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2264 if (gp == NULL) {
2265 return (0);
2266 }
2267
2268 mutex_enter(&gp->gp_mutex);
2269
2270 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2271
2272 if (lock) {
2273 while (lock->l_vnode == vp) {
2274 if (IS_REMOTE(lock)) {
2275 result = 1;
2276 goto done;
2277 }
2278 lock = lock->l_next;
2279 }
2280 }
2281
2282 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2283
2284 if (lock) {
2285 while (lock->l_vnode == vp) {
2286 if (IS_REMOTE(lock)) {
2287 result = 1;
2288 goto done;
2289 }
2290 lock = lock->l_next;
2291 }
2292 }
2293
2294 done:
2295 mutex_exit(&gp->gp_mutex);
2296 return (result);
2297 }
2298
2299 /*
2300 * Determine if there are any locks owned by the given sysid.
2301 * Returns zero if not, non-zero if there are. Note that this return code
2302 * could be derived from flk_get_{sleeping,active}_locks, but this routine
2303 * avoids all the memory allocations of those routines.
2304 *
2305 * This routine has the same synchronization issues as
2306 * flk_has_remote_locks.
2307 */
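/*
 * Illustrative usage sketch (editorial): lck_type is a bit mask, so a
 * caller interested in both lists would pass both flags:
 *
 *	if (flk_sysid_has_locks(sysid,
 *	    FLK_QUERY_ACTIVE | FLK_QUERY_SLEEPING))
 *		-- the client still holds, or is waiting for, locks --
 */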
2308
2309 int
2310 flk_sysid_has_locks(int sysid, int lck_type)
2311 {
2312 int has_locks = 0;
2313 lock_descriptor_t *lock;
2314 graph_t *gp;
2315 int i;
2316
2317 for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2318 mutex_enter(&flock_lock);
2319 gp = lock_graph[i];
2320 mutex_exit(&flock_lock);
2321 if (gp == NULL) {
2322 continue;
2323 }
2324
2325 mutex_enter(&gp->gp_mutex);
2326
2327 if (lck_type & FLK_QUERY_ACTIVE) {
2328 for (lock = ACTIVE_HEAD(gp)->l_next;
2329 lock != ACTIVE_HEAD(gp) && !has_locks;
2330 lock = lock->l_next) {
2331 if (lock->l_flock.l_sysid == sysid)
2332 has_locks = 1;
2333 }
2334 }
2335
2336 if (lck_type & FLK_QUERY_SLEEPING) {
2337 for (lock = SLEEPING_HEAD(gp)->l_next;
2338 lock != SLEEPING_HEAD(gp) && !has_locks;
2339 lock = lock->l_next) {
2340 if (lock->l_flock.l_sysid == sysid)
2341 has_locks = 1;
2342 }
2343 }
2344 mutex_exit(&gp->gp_mutex);
2345 }
2346
2347 return (has_locks);
2348 }
2349
2350
2351 /*
2352 * PSARC case 1997/292
2353 *
2354 * Requires: "sysid" is a pair [nlmid, sysid]. The lower half is 16-bit
2355 * quantity, the real sysid generated by the NLM server; the upper half
2356 * identifies the node of the cluster where the NLM server ran.
2357 * This routine is only called by an NLM server running in a cluster.
2358 * Effects: Remove all locks held on behalf of the client identified
2359 * by "sysid."
2360 */
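/*
 * Editorial note: per the encoding described above, the cluster-wide
 * sysid carries the NLM node id in its upper half and the NLM-generated
 * sysid in its lower 16 bits; GETNLMID(l_sysid), used throughout this
 * file, recovers the node that issued the request.
 */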
2361 void
2362 cl_flk_remove_locks_by_sysid(int sysid)
2363 {
2364 graph_t *gp;
2365 int i;
2366 lock_descriptor_t *lock, *nlock;
2367
2368 /*
2369 * Check to see if node is booted as a cluster. If not, return.
2370 */
2371 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2372 return;
2373 }
2374
2375 ASSERT(sysid != 0);
2376 for (i = 0; i < HASH_SIZE; i++) {
2377 mutex_enter(&flock_lock);
2378 gp = lock_graph[i];
2379 mutex_exit(&flock_lock);
2380
2381 if (gp == NULL)
2382 continue;
2383
2384 mutex_enter(&gp->gp_mutex); /* get mutex on lock graph */
2385
2386 /* signal sleeping requests so that they bail out */
2387 lock = SLEEPING_HEAD(gp)->l_next;
2388 while (lock != SLEEPING_HEAD(gp)) {
2389 nlock = lock->l_next;
2390 if (lock->l_flock.l_sysid == sysid) {
2391 INTERRUPT_WAKEUP(lock);
2392 }
2393 lock = nlock;
2394 }
2395
2396 /* delete active locks */
2397 lock = ACTIVE_HEAD(gp)->l_next;
2398 while (lock != ACTIVE_HEAD(gp)) {
2399 nlock = lock->l_next;
2400 if (lock->l_flock.l_sysid == sysid) {
2401 flk_delete_active_lock(lock, 0);
2402 flk_wakeup(lock, 1);
2403 flk_free_lock(lock);
2404 }
2405 lock = nlock;
2406 }
2407 mutex_exit(&gp->gp_mutex); /* release mutex on lock graph */
2408 }
2409 }
2410
2411 /*
2412  * Delete all locks in the system that belong to the sysid of the request.
2413 */
2414
2415 static void
2416 flk_delete_locks_by_sysid(lock_descriptor_t *request)
2417 {
2418 int sysid = request->l_flock.l_sysid;
2419 lock_descriptor_t *lock, *nlock;
2420 graph_t *gp;
2421 int i;
2422
2423 ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
2424 ASSERT(sysid != 0);
2425
2426 mutex_exit(&request->l_graph->gp_mutex);
2427
2428 for (i = 0; i < HASH_SIZE; i++) {
2429 mutex_enter(&flock_lock);
2430 gp = lock_graph[i];
2431 mutex_exit(&flock_lock);
2432
2433 if (gp == NULL)
2434 continue;
2435
2436 mutex_enter(&gp->gp_mutex);
2437
2438 /* signal sleeping requests so that they bail out */
2439 lock = SLEEPING_HEAD(gp)->l_next;
2440 while (lock != SLEEPING_HEAD(gp)) {
2441 nlock = lock->l_next;
2442 if (lock->l_flock.l_sysid == sysid) {
2443 INTERRUPT_WAKEUP(lock);
2444 }
2445 lock = nlock;
2446 }
2447
2448 /* delete active locks */
2449 lock = ACTIVE_HEAD(gp)->l_next;
2450 while (lock != ACTIVE_HEAD(gp)) {
2451 nlock = lock->l_next;
2452 if (lock->l_flock.l_sysid == sysid) {
2453 flk_delete_active_lock(lock, 0);
2454 flk_wakeup(lock, 1);
2455 flk_free_lock(lock);
2456 }
2457 lock = nlock;
2458 }
2459 mutex_exit(&gp->gp_mutex);
2460 }
2461
2462 mutex_enter(&request->l_graph->gp_mutex);
2463 }
2464
2465 /*
2466 * Clustering: Deletes PXFS locks
2467 * Effects: Delete all locks on files in the given file system and with the
2468 * given PXFS id.
2469 */
2470 void
2471 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
2472 {
2473 lock_descriptor_t *lock, *nlock;
2474 graph_t *gp;
2475 int i;
2476
2477 for (i = 0; i < HASH_SIZE; i++) {
2478 mutex_enter(&flock_lock);
2479 gp = lock_graph[i];
2480 mutex_exit(&flock_lock);
2481
2482 if (gp == NULL)
2483 continue;
2484
2485 mutex_enter(&gp->gp_mutex);
2486
2487 /* signal sleeping requests so that they bail out */
2488 lock = SLEEPING_HEAD(gp)->l_next;
2489 while (lock != SLEEPING_HEAD(gp)) {
2490 nlock = lock->l_next;
2491 if (lock->l_vnode->v_vfsp == vfsp) {
2492 ASSERT(IS_PXFS(lock));
2493 if (GETPXFSID(lock->l_flock.l_sysid) ==
2494 pxfsid) {
2495 flk_set_state(lock,
2496 FLK_CANCELLED_STATE);
2497 flk_cancel_sleeping_lock(lock, 1);
2498 }
2499 }
2500 lock = nlock;
2501 }
2502
2503 /* delete active locks */
2504 lock = ACTIVE_HEAD(gp)->l_next;
2505 while (lock != ACTIVE_HEAD(gp)) {
2506 nlock = lock->l_next;
2507 if (lock->l_vnode->v_vfsp == vfsp) {
2508 ASSERT(IS_PXFS(lock));
2509 if (GETPXFSID(lock->l_flock.l_sysid) ==
2510 pxfsid) {
2511 flk_delete_active_lock(lock, 0);
2512 flk_wakeup(lock, 1);
2513 flk_free_lock(lock);
2514 }
2515 }
2516 lock = nlock;
2517 }
2518 mutex_exit(&gp->gp_mutex);
2519 }
2520 }
2521
2522 /*
2523 * Search for a sleeping lock manager lock which matches exactly this lock
2524 * request; if one is found, fake a signal to cancel it.
2525 *
2526 * Return 1 if a matching lock was found, 0 otherwise.
2527 */
2528
2529 static int
2530 flk_canceled(lock_descriptor_t *request)
2531 {
2532 lock_descriptor_t *lock, *nlock;
2533 graph_t *gp = request->l_graph;
2534 vnode_t *vp = request->l_vnode;
2535
2536 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2537 ASSERT(IS_LOCKMGR(request));
2538 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2539
2540 if (lock) {
2541 while (lock->l_vnode == vp) {
2542 nlock = lock->l_next;
2543 if (SAME_OWNER(lock, request) &&
2544 lock->l_start == request->l_start &&
2545 lock->l_end == request->l_end) {
2546 INTERRUPT_WAKEUP(lock);
2547 return (1);
2548 }
2549 lock = nlock;
2550 }
2551 }
2552 return (0);
2553 }
2554
2555 /*
2556 * Remove all the locks for the vnode belonging to the given pid and sysid.
2557 */
2558
2559 void
2560 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
2561 {
2562 graph_t *gp;
2563 lock_descriptor_t *lock, *nlock;
2564 lock_descriptor_t *link_stack;
2565
2566 STACK_INIT(link_stack);
2567
2568 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2569
2570 if (gp == NULL)
2571 return;
2572 mutex_enter(&gp->gp_mutex);
2573
2574 CHECK_SLEEPING_LOCKS(gp);
2575 CHECK_ACTIVE_LOCKS(gp);
2576
2577 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2578
2579 if (lock) {
2580 do {
2581 nlock = lock->l_next;
2582 if ((lock->l_flock.l_pid == pid ||
2583 pid == IGN_PID) &&
2584 lock->l_flock.l_sysid == sysid) {
2585 CANCEL_WAKEUP(lock);
2586 }
2587 lock = nlock;
2588 } while (lock->l_vnode == vp);
2589 }
2590
2591 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2592
2593 if (lock) {
2594 do {
2595 nlock = lock->l_next;
2596 if ((lock->l_flock.l_pid == pid ||
2597 pid == IGN_PID) &&
2598 lock->l_flock.l_sysid == sysid) {
2599 flk_delete_active_lock(lock, 0);
2600 STACK_PUSH(link_stack, lock, l_stack);
2601 }
2602 lock = nlock;
2603 } while (lock->l_vnode == vp);
2604 }
2605
2606 while ((lock = STACK_TOP(link_stack)) != NULL) {
2607 STACK_POP(link_stack, l_stack);
2608 flk_wakeup(lock, 1);
2609 flk_free_lock(lock);
2610 }
2611
2612 CHECK_SLEEPING_LOCKS(gp);
2613 CHECK_ACTIVE_LOCKS(gp);
2614 CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2615 mutex_exit(&gp->gp_mutex);
2616 }
2617 /* ONC_PLUS EXTRACT END */
2618
2619
2620 /*
2621 * Called from 'fs' read and write routines for files that have mandatory
2622 * locking enabled.
2623 */
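/*
 * Illustrative caller sketch (editorial, simplified from typical file
 * system read/write paths): mandatory locking is usually checked only
 * when the vnode's mode requires it, roughly
 *
 *	if (MANDLOCK(vp, ip->i_mode)) {
 *		error = chklock(vp, iomode, uiop->uio_loffset,
 *		    uiop->uio_resid, uiop->uio_fmode, ct);
 *		if (error != 0)
 *			return (error);
 *	}
 *
 * MANDLOCK(), 'ip' and the uio fields above belong to the calling file
 * system, not to this module; the exact sequence varies per file system.
 */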
2624
2625 int
2626 chklock(
2627 struct vnode *vp,
2628 int iomode,
2629 u_offset_t offset,
2630 ssize_t len,
2631 int fmode,
2632 caller_context_t *ct)
2633 {
2634 register int i;
2635 struct flock64 bf;
2636 int error = 0;
2637
2638 bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
2639 bf.l_whence = 0;
2640 bf.l_start = offset;
2641 bf.l_len = len;
2642 if (ct == NULL) {
2643 bf.l_pid = curproc->p_pid;
2644 bf.l_sysid = 0;
2645 } else {
2646 bf.l_pid = ct->cc_pid;
2647 bf.l_sysid = ct->cc_sysid;
2648 }
2649 i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
2650 if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
2651 bf.l_type != F_UNLCK)
2652 error = i ? i : EAGAIN;
2653 return (error);
2654 }
2655
2656 /* ONC_PLUS EXTRACT START */
2657 /*
2658 * convoff - converts the given data (start, whence) to the
2659 * given whence.
2660 */
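/*
 * Illustrative example (editorial): to normalize a caller-supplied record
 * to SEEK_SET form before processing, one would do roughly
 *
 *	error = convoff(vp, &bf, 0, fileoff);
 *
 * where 'fileoff' is the current file offset; a record given relative to
 * the current offset (l_whence == 1) or to end of file (l_whence == 2) is
 * rewritten with an absolute l_start and l_whence == 0.
 */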
2661 int
2662 convoff(vp, lckdat, whence, offset)
2663 struct vnode *vp;
2664 struct flock64 *lckdat;
2665 int whence;
2666 offset_t offset;
2667 {
2668 int error;
2669 struct vattr vattr;
2670
2671 if ((lckdat->l_whence == 2) || (whence == 2)) {
2672 vattr.va_mask = AT_SIZE;
2673 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
2674 return (error);
2675 }
2676
2677 switch (lckdat->l_whence) {
2678 case 1:
2679 lckdat->l_start += offset;
2680 break;
2681 case 2:
2682 lckdat->l_start += vattr.va_size;
2683 /* FALLTHRU */
2684 case 0:
2685 break;
2686 default:
2687 return (EINVAL);
2688 }
2689
2690 if (lckdat->l_start < 0)
2691 return (EINVAL);
2692
2693 switch (whence) {
2694 case 1:
2695 lckdat->l_start -= offset;
2696 break;
2697 case 2:
2698 lckdat->l_start -= vattr.va_size;
2699 /* FALLTHRU */
2700 case 0:
2701 break;
2702 default:
2703 return (EINVAL);
2704 }
2705
2706 lckdat->l_whence = (short)whence;
2707 return (0);
2708 }
2709 /* ONC_PLUS EXTRACT END */
2710
2711
2712 /* proc_graph function definitions */
2713
2714 /*
2715  * Check for a deadlock caused by the new 'lock'.  If a deadlock is found,
2716  * the edges of this lock are freed and 1 is returned; otherwise 0.
2717 */
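/*
 * Editorial summary of the algorithm below: the lock-level edges of
 * 'lock' are first mirrored into the process graph (one refcounted proc
 * edge per pair of owners).  A depth-first walk then starts from the
 * owner of 'lock'; reaching a process that has been arrived at but not
 * yet departed from (PROC_ARRIVED && !PROC_DEPARTED) means the walk has
 * closed a cycle, i.e., a deadlock.
 */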
2718
2719 static int
2720 flk_check_deadlock(lock_descriptor_t *lock)
2721 {
2722 proc_vertex_t *start_vertex, *pvertex;
2723 proc_vertex_t *dvertex;
2724 proc_edge_t *pep, *ppep;
2725 edge_t *ep, *nep;
2726 proc_vertex_t *process_stack;
2727
2728 STACK_INIT(process_stack);
2729
2730 mutex_enter(&flock_lock);
2731 start_vertex = flk_get_proc_vertex(lock);
2732 ASSERT(start_vertex != NULL);
2733
2734 /* construct the edges from this process to other processes */
2735
2736 ep = FIRST_ADJ(lock);
2737 while (ep != HEAD(lock)) {
2738 proc_vertex_t *adj_proc;
2739
2740 adj_proc = flk_get_proc_vertex(ep->to_vertex);
2741 for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
2742 if (pep->to_proc == adj_proc) {
2743 ASSERT(pep->refcount);
2744 pep->refcount++;
2745 break;
2746 }
2747 }
2748 if (pep == NULL) {
2749 pep = flk_get_proc_edge();
2750 pep->to_proc = adj_proc;
2751 pep->refcount = 1;
2752 adj_proc->incount++;
2753 pep->next = start_vertex->edge;
2754 start_vertex->edge = pep;
2755 }
2756 ep = NEXT_ADJ(ep);
2757 }
2758
2759 ep = FIRST_IN(lock);
2760
2761 while (ep != HEAD(lock)) {
2762 proc_vertex_t *in_proc;
2763
2764 in_proc = flk_get_proc_vertex(ep->from_vertex);
2765
2766 for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
2767 if (pep->to_proc == start_vertex) {
2768 ASSERT(pep->refcount);
2769 pep->refcount++;
2770 break;
2771 }
2772 }
2773 if (pep == NULL) {
2774 pep = flk_get_proc_edge();
2775 pep->to_proc = start_vertex;
2776 pep->refcount = 1;
2777 start_vertex->incount++;
2778 pep->next = in_proc->edge;
2779 in_proc->edge = pep;
2780 }
2781 ep = NEXT_IN(ep);
2782 }
2783
2784 if (start_vertex->incount == 0) {
2785 mutex_exit(&flock_lock);
2786 return (0);
2787 }
2788
2789 flk_proc_graph_uncolor();
2790
2791 start_vertex->p_sedge = start_vertex->edge;
2792
2793 STACK_PUSH(process_stack, start_vertex, p_stack);
2794
2795 while ((pvertex = STACK_TOP(process_stack)) != NULL) {
2796 for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
2797 dvertex = pep->to_proc;
2798 if (!PROC_ARRIVED(dvertex)) {
2799 STACK_PUSH(process_stack, dvertex, p_stack);
2800 dvertex->p_sedge = dvertex->edge;
2801 PROC_ARRIVE(pvertex);
2802 pvertex->p_sedge = pep->next;
2803 break;
2804 }
2805 if (!PROC_DEPARTED(dvertex))
2806 goto deadlock;
2807 }
2808 if (pep == NULL) {
2809 PROC_DEPART(pvertex);
2810 STACK_POP(process_stack, p_stack);
2811 }
2812 }
2813 mutex_exit(&flock_lock);
2814 return (0);
2815
2816 deadlock:
2817
2818 /* we remove all lock edges and proc edges */
2819
2820 ep = FIRST_ADJ(lock);
2821 while (ep != HEAD(lock)) {
2822 proc_vertex_t *adj_proc;
2823 adj_proc = flk_get_proc_vertex(ep->to_vertex);
2824 nep = NEXT_ADJ(ep);
2825 IN_LIST_REMOVE(ep);
2826 ADJ_LIST_REMOVE(ep);
2827 flk_free_edge(ep);
2828 ppep = start_vertex->edge;
2829 for (pep = start_vertex->edge; pep != NULL; ppep = pep,
2830 pep = ppep->next) {
2831 if (pep->to_proc == adj_proc) {
2832 pep->refcount--;
2833 if (pep->refcount == 0) {
2834 if (pep == ppep) {
2835 start_vertex->edge = pep->next;
2836 } else {
2837 ppep->next = pep->next;
2838 }
2839 adj_proc->incount--;
2840 flk_proc_release(adj_proc);
2841 flk_free_proc_edge(pep);
2842 }
2843 break;
2844 }
2845 }
2846 ep = nep;
2847 }
2848 ep = FIRST_IN(lock);
2849 while (ep != HEAD(lock)) {
2850 proc_vertex_t *in_proc;
2851 in_proc = flk_get_proc_vertex(ep->from_vertex);
2852 nep = NEXT_IN(ep);
2853 IN_LIST_REMOVE(ep);
2854 ADJ_LIST_REMOVE(ep);
2855 flk_free_edge(ep);
2856 ppep = in_proc->edge;
2857 for (pep = in_proc->edge; pep != NULL; ppep = pep,
2858 pep = ppep->next) {
2859 if (pep->to_proc == start_vertex) {
2860 pep->refcount--;
2861 if (pep->refcount == 0) {
2862 if (pep == ppep) {
2863 in_proc->edge = pep->next;
2864 } else {
2865 ppep->next = pep->next;
2866 }
2867 start_vertex->incount--;
2868 flk_proc_release(in_proc);
2869 flk_free_proc_edge(pep);
2870 }
2871 break;
2872 }
2873 }
2874 ep = nep;
2875 }
2876 flk_proc_release(start_vertex);
2877 mutex_exit(&flock_lock);
2878 return (1);
2879 }
2880
2881 /*
2882  * Get a proc vertex.  If the lock's pvertex value indexes a matching proc
2883  * vertex in the list we return that; otherwise we allocate one.  If
2884  * necessary, we also grow the list of vertices.
2885 */
2886
2887 static proc_vertex_t *
2888 flk_get_proc_vertex(lock_descriptor_t *lock)
2889 {
2890 int i;
2891 proc_vertex_t *pv;
2892 proc_vertex_t **palloc;
2893
2894 ASSERT(MUTEX_HELD(&flock_lock));
2895 if (lock->pvertex != -1) {
2896 ASSERT(lock->pvertex >= 0);
2897 pv = pgraph.proc[lock->pvertex];
2898 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2899 return (pv);
2900 }
2901 }
2902 for (i = 0; i < pgraph.gcount; i++) {
2903 pv = pgraph.proc[i];
2904 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2905 lock->pvertex = pv->index = i;
2906 return (pv);
2907 }
2908 }
2909 pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
2910 pv->pid = lock->l_flock.l_pid;
2911 pv->sysid = lock->l_flock.l_sysid;
2912 flk_proc_vertex_allocs++;
2913 if (pgraph.free != 0) {
2914 for (i = 0; i < pgraph.gcount; i++) {
2915 if (pgraph.proc[i] == NULL) {
2916 pgraph.proc[i] = pv;
2917 lock->pvertex = pv->index = i;
2918 pgraph.free--;
2919 return (pv);
2920 }
2921 }
2922 }
2923 palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
2924 sizeof (proc_vertex_t *), KM_SLEEP);
2925
2926 if (pgraph.proc) {
2927 bcopy(pgraph.proc, palloc,
2928 pgraph.gcount * sizeof (proc_vertex_t *));
2929
2930 kmem_free(pgraph.proc,
2931 pgraph.gcount * sizeof (proc_vertex_t *));
2932 }
2933 pgraph.proc = palloc;
2934 pgraph.free += (PROC_CHUNK - 1);
2935 pv->index = lock->pvertex = pgraph.gcount;
2936 pgraph.gcount += PROC_CHUNK;
2937 pgraph.proc[pv->index] = pv;
2938 return (pv);
2939 }
2940
2941 /*
2942 * Allocate a proc edge.
2943 */
2944
2945 static proc_edge_t *
2946 flk_get_proc_edge()
2947 {
2948 proc_edge_t *pep;
2949
2950 pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
2951 flk_proc_edge_allocs++;
2952 return (pep);
2953 }
2954
2955 /*
2956 * Free the proc edge. Called whenever its reference count goes to zero.
2957 */
2958
2959 static void
2960 flk_free_proc_edge(proc_edge_t *pep)
2961 {
2962 ASSERT(pep->refcount == 0);
2963 kmem_free((void *)pep, sizeof (proc_edge_t));
2964 flk_proc_edge_frees++;
2965 }
2966
2967 /*
2968  * Uncolor the process graph by advancing the mark; every vertex is reset
2969  * explicitly only when the mark value hits its maximum.
2969 */
2970
2971 static void
2972 flk_proc_graph_uncolor()
2973 {
2974 int i;
2975
2976 if (pgraph.mark == UINT_MAX) {
2977 for (i = 0; i < pgraph.gcount; i++)
2978 if (pgraph.proc[i] != NULL) {
2979 pgraph.proc[i]->atime = 0;
2980 pgraph.proc[i]->dtime = 0;
2981 }
2982 pgraph.mark = 1;
2983 } else {
2984 pgraph.mark++;
2985 }
2986 }
2987
2988 /*
2989  * Release the proc vertex iff it has neither in-edges nor out-edges.
2990 */
2991
2992 static void
2993 flk_proc_release(proc_vertex_t *proc)
2994 {
2995 ASSERT(MUTEX_HELD(&flock_lock));
2996 if (proc->edge == NULL && proc->incount == 0) {
2997 pgraph.proc[proc->index] = NULL;
2998 pgraph.free++;
2999 kmem_free(proc, sizeof (proc_vertex_t));
3000 flk_proc_vertex_frees++;
3001 }
3002 }
3003
3004 /*
3005  * Update the process graph to reflect a change in a lock graph.
3006  * Note: we should call this function only after we have a correctly
3007  * recomputed lock graph; otherwise we might miss a deadlock detection.
3008  * E.g., in flk_relation() we call this function after flk_recompute_
3009  * dependencies(); otherwise, if a process tries to lock a vnode hashed
3010  * into another graph, it might sleep forever.
3011 */
3012
3013 static void
3014 flk_update_proc_graph(edge_t *ep, int delete)
3015 {
3016 proc_vertex_t *toproc, *fromproc;
3017 proc_edge_t *pep, *prevpep;
3018
3019 mutex_enter(&flock_lock);
3020 toproc = flk_get_proc_vertex(ep->to_vertex);
3021 fromproc = flk_get_proc_vertex(ep->from_vertex);
3022
3023 if (!delete)
3024 goto add;
3025 pep = prevpep = fromproc->edge;
3026
3027 ASSERT(pep != NULL);
3028 while (pep != NULL) {
3029 if (pep->to_proc == toproc) {
3030 ASSERT(pep->refcount > 0);
3031 pep->refcount--;
3032 if (pep->refcount == 0) {
3033 if (pep == prevpep) {
3034 fromproc->edge = pep->next;
3035 } else {
3036 prevpep->next = pep->next;
3037 }
3038 toproc->incount--;
3039 flk_proc_release(toproc);
3040 flk_free_proc_edge(pep);
3041 }
3042 break;
3043 }
3044 prevpep = pep;
3045 pep = pep->next;
3046 }
3047 flk_proc_release(fromproc);
3048 mutex_exit(&flock_lock);
3049 return;
3050 add:
3051
3052 pep = fromproc->edge;
3053
3054 while (pep != NULL) {
3055 if (pep->to_proc == toproc) {
3056 ASSERT(pep->refcount > 0);
3057 pep->refcount++;
3058 break;
3059 }
3060 pep = pep->next;
3061 }
3062 if (pep == NULL) {
3063 pep = flk_get_proc_edge();
3064 pep->to_proc = toproc;
3065 pep->refcount = 1;
3066 toproc->incount++;
3067 pep->next = fromproc->edge;
3068 fromproc->edge = pep;
3069 }
3070 mutex_exit(&flock_lock);
3071 }
3072
3073 /* ONC_PLUS EXTRACT START */
3074 /*
3075 * Set the control status for lock manager requests.
3076 *
3077 */
3078
3079 /*
3080 * PSARC case 1997/292
3081 *
3082 * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3083 * Effects: Set the state of the NLM server identified by "nlmid"
3084 * in the NLM registry to state "nlm_state."
3085 * Raises exception no_such_nlm if "nlmid" doesn't identify a known
3086 * NLM server to this LLM.
3087 * Note that when this routine is called with NLM_SHUTTING_DOWN there
3088  * may be lock requests that have gotten started but not finished.  In
3089 * particular, there may be blocking requests that are in the callback code
3090 * before sleeping (so they're not holding the lock for the graph). If
3091 * such a thread reacquires the graph's lock (to go to sleep) after
3092 * NLM state in the NLM registry is set to a non-up value,
3093 * it will notice the status and bail out. If the request gets
3094 * granted before the thread can check the NLM registry, let it
3095 * continue normally. It will get flushed when we are called with NLM_DOWN.
3096 *
3097 * Modifies: nlm_reg_obj (global)
3098 * Arguments:
3099 * nlmid (IN): id uniquely identifying an NLM server
3100 * nlm_state (IN): NLM server state to change "nlmid" to
3101 */
3102 void
3103 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3104 {
3105 /*
3106 * Check to see if node is booted as a cluster. If not, return.
3107 */
3108 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3109 return;
3110 }
3111
3112 /*
3113 * Check for development/debugging. It is possible to boot a node
3114 * in non-cluster mode, and then run a special script, currently
3115 * available only to developers, to bring up the node as part of a
3116 * cluster. The problem is that running such a script does not
3117  * result in the routine flk_init() being called, and hence the global array
3118 * nlm_reg_status is NULL. The NLM thinks it's in cluster mode,
3119 * but the LLM needs to do an additional check to see if the global
3120 * array has been created or not. If nlm_reg_status is NULL, then
3121 * return, else continue.
3122 */
3123 if (nlm_reg_status == NULL) {
3124 return;
3125 }
3126
3127 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3128 mutex_enter(&nlm_reg_lock);
3129
3130 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3131 /*
3132 * If the NLM server "nlmid" is unknown in the NLM registry,
3133 * add it to the registry in the nlm shutting down state.
3134 */
3135 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3136 FLK_NLM_SHUTTING_DOWN);
3137 } else {
3138 /*
3139 * Change the state of the NLM server identified by "nlmid"
3140 * in the NLM registry to the argument "nlm_state."
3141 */
3142 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3143 nlm_state);
3144 }
3145
3146 /*
3147 * The reason we must register the NLM server that is shutting down
3148 * with an LLM that doesn't already know about it (never sent a lock
3149 * request) is to handle correctly a race between shutdown and a new
3150 * lock request. Suppose that a shutdown request from the NLM server
3151 * invokes this routine at the LLM, and a thread is spawned to
3152 * service the request. Now suppose a new lock request is in
3153 * progress and has already passed the first line of defense in
3154  * reclock(), which denies new lock requests from NLM servers
3155 * that are not in the NLM_UP state. After the current routine
3156 * is invoked for both phases of shutdown, the routine will return,
3157 * having done nothing, and the lock request will proceed and
3158 * probably be granted. The problem is that the shutdown was ignored
3159 * by the lock request because there was no record of that NLM server
3160 * shutting down. We will be in the peculiar position of thinking
3161  * that we've shut down the NLM server and all locks at all LLMs have
3162 * been discarded, but in fact there's still one lock held.
3163 * The solution is to record the existence of NLM server and change
3164 * its state immediately to NLM_SHUTTING_DOWN. The lock request in
3165 * progress may proceed because the next phase NLM_DOWN will catch
3166 * this lock and discard it.
3167 */
3168 mutex_exit(&nlm_reg_lock);
3169
3170 switch (nlm_state) {
3171 case FLK_NLM_UP:
3172 /*
3173 * Change the NLM state of all locks still held on behalf of
3174 * the NLM server identified by "nlmid" to NLM_UP.
3175 */
3176 cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3177 break;
3178
3179 case FLK_NLM_SHUTTING_DOWN:
3180 /*
3181 * Wake up all sleeping locks for the NLM server identified
3182 * by "nlmid." Note that eventually all woken threads will
3183 * have their lock requests cancelled and descriptors
3184 * removed from the sleeping lock list. Note that the NLM
3185 * server state associated with each lock descriptor is
3186 * changed to FLK_NLM_SHUTTING_DOWN.
3187 */
3188 cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3189 break;
3190
3191 case FLK_NLM_DOWN:
3192 /*
3193 * Discard all active, granted locks for this NLM server
3194 * identified by "nlmid."
3195 */
3196 cl_flk_unlock_nlm_granted(nlmid);
3197 break;
3198
3199 default:
3200 panic("cl_set_nlm_status: bad status (%d)", nlm_state);
3201 }
3202 }
3203
3204 /*
3205 * Set the control status for lock manager requests.
3206 *
3207 * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3208  * may be lock requests that have gotten started but not finished.  In
3209 * particular, there may be blocking requests that are in the callback code
3210 * before sleeping (so they're not holding the lock for the graph). If
3211 * such a thread reacquires the graph's lock (to go to sleep) after
3212 * flk_lockmgr_status is set to a non-up value, it will notice the status
3213 * and bail out. If the request gets granted before the thread can check
3214 * flk_lockmgr_status, let it continue normally. It will get flushed when
3215 * we are called with FLK_LOCKMGR_DOWN.
3216 */
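/*
 * Illustrative sequence (editorial, hedged): a lock-manager shutdown path
 * would typically call this routine twice, first with FLK_WAKEUP_SLEEPERS
 * to kick blocked requests out of their wait, and later with
 * FLK_LOCKMGR_DOWN to discard the granted lock-manager locks;
 * FLK_LOCKMGR_UP is used when the lock manager registers again.
 */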
3217
3218 void
3219 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3220 {
3221 int i;
3222 graph_t *gp;
3223 struct flock_globals *fg;
3224
3225 fg = flk_get_globals();
3226 ASSERT(fg != NULL);
3227
3228 mutex_enter(&flock_lock);
3229 fg->flk_lockmgr_status = status;
3230 mutex_exit(&flock_lock);
3231
3232 /*
3233 * If the lock manager is coming back up, all that's needed is to
3234 * propagate this information to the graphs. If the lock manager
3235 * is going down, additional action is required, and each graph's
3236 * copy of the state is updated atomically with this other action.
3237 */
3238 switch (status) {
3239 case FLK_LOCKMGR_UP:
3240 for (i = 0; i < HASH_SIZE; i++) {
3241 mutex_enter(&flock_lock);
3242 gp = lock_graph[i];
3243 mutex_exit(&flock_lock);
3244 if (gp == NULL)
3245 continue;
3246 mutex_enter(&gp->gp_mutex);
3247 fg->lockmgr_status[i] = status;
3248 mutex_exit(&gp->gp_mutex);
3249 }
3250 break;
3251 case FLK_WAKEUP_SLEEPERS:
3252 wakeup_sleeping_lockmgr_locks(fg);
3253 break;
3254 case FLK_LOCKMGR_DOWN:
3255 unlock_lockmgr_granted(fg);
3256 break;
3257 default:
3258 panic("flk_set_lockmgr_status: bad status (%d)", status);
3259 break;
3260 }
3261 }
3262
3263 /*
3264 * This routine returns all the locks that are active or sleeping and are
3265 * associated with a particular set of identifiers. If lock_state != 0, then
3266 * only locks that match the lock_state are returned. If lock_state == 0, then
3267 * all locks are returned. If pid == NOPID, the pid is ignored. If
3268 * use_sysid is FALSE, then the sysid is ignored. If vp is NULL, then the
3269 * vnode pointer is ignored.
3270 *
3271 * A list containing the vnode pointer and an flock structure
3272 * describing the lock is returned. Each element in the list is
3273 * dynamically allocated and must be freed by the caller. The
3274 * last item in the list is denoted by a NULL value in the ll_next
3275 * field.
3276 *
3277 * The vnode pointers returned are held. The caller is responsible
3278 * for releasing these. Note that the returned list is only a snapshot of
3279 * the current lock information, and that it is a snapshot of a moving
3280 * target (only one graph is locked at a time).
3281 */
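/*
 * Illustrative consumer sketch (editorial): the returned snapshot is
 * walked and then released with flk_free_locklist(), which also drops the
 * vnode holds taken here:
 *
 *	locklist_t *llp, *list;
 *
 *	list = flk_get_active_locks(sysid, NOPID);
 *	for (llp = list; llp != NULL; llp = llp->ll_next)
 *		-- inspect llp->ll_vp and llp->ll_flock --
 *	flk_free_locklist(list);
 */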
3282
3283 locklist_t *
3284 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3285 pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3286 {
3287 lock_descriptor_t *lock;
3288 lock_descriptor_t *graph_head;
3289 locklist_t listhead;
3290 locklist_t *llheadp;
3291 locklist_t *llp;
3292 locklist_t *lltp;
3293 graph_t *gp;
3294 int i;
3295 int first_index; /* graph index */
3296 int num_indexes; /* graph index */
3297
3298 ASSERT((list_type == FLK_ACTIVE_STATE) ||
3299 (list_type == FLK_SLEEPING_STATE));
3300
3301 /*
3302 * Get a pointer to something to use as a list head while building
3303 * the rest of the list.
3304 */
3305 llheadp = &listhead;
3306 lltp = llheadp;
3307 llheadp->ll_next = (locklist_t *)NULL;
3308
3309 /* Figure out which graphs we want to look at. */
3310 if (vp == NULL) {
3311 first_index = 0;
3312 num_indexes = HASH_SIZE;
3313 } else {
3314 first_index = HASH_INDEX(vp);
3315 num_indexes = 1;
3316 }
3317
3318 for (i = first_index; i < first_index + num_indexes; i++) {
3319 mutex_enter(&flock_lock);
3320 gp = lock_graph[i];
3321 mutex_exit(&flock_lock);
3322 if (gp == NULL) {
3323 continue;
3324 }
3325
3326 mutex_enter(&gp->gp_mutex);
3327 graph_head = (list_type == FLK_ACTIVE_STATE) ?
3328 ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3329 for (lock = graph_head->l_next;
3330 lock != graph_head;
3331 lock = lock->l_next) {
3332 if (use_sysid && lock->l_flock.l_sysid != sysid)
3333 continue;
3334 if (pid != NOPID && lock->l_flock.l_pid != pid)
3335 continue;
3336 if (vp != NULL && lock->l_vnode != vp)
3337 continue;
3338 if (lock_state && !(lock_state & lock->l_state))
3339 continue;
3340 if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3341 continue;
3342 /*
3343 * A matching lock was found. Allocate
3344 * space for a new locklist entry and fill
3345 * it in.
3346 */
3347 llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3348 lltp->ll_next = llp;
3349 VN_HOLD(lock->l_vnode);
3350 llp->ll_vp = lock->l_vnode;
3351 create_flock(lock, &(llp->ll_flock));
3352 llp->ll_next = (locklist_t *)NULL;
3353 lltp = llp;
3354 }
3355 mutex_exit(&gp->gp_mutex);
3356 }
3357
3358 llp = llheadp->ll_next;
3359 return (llp);
3360 }
3361
3362 /*
3363 * These two functions are simply interfaces to get_lock_list. They return
3364 * a list of sleeping or active locks for the given sysid and pid. See
3365 * get_lock_list for details.
3366 *
3367 * In either case we don't particularly care to specify the zone of interest;
3368 * the sysid-space is global across zones, so the sysid will map to exactly one
3369 * zone, and we'll return information for that zone.
3370 */
3371
3372 locklist_t *
3373 flk_get_sleeping_locks(int sysid, pid_t pid)
3374 {
3375 return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
3376 ALL_ZONES));
3377 }
3378
3379 locklist_t *
3380 flk_get_active_locks(int sysid, pid_t pid)
3381 {
3382 return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
3383 ALL_ZONES));
3384 }
3385
3386 /*
3387 * Another interface to get_lock_list. This one returns all the active
3388 * locks for a given vnode. Again, see get_lock_list for details.
3389 *
3390 * We don't need to specify which zone's locks we're interested in. The matter
3391 * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
3392 * be used by multiple zones, so the list of locks will all be from the right
3393 * zone.
3394 */
3395
3396 locklist_t *
3397 flk_active_locks_for_vp(const vnode_t *vp)
3398 {
3399 return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
3400 ALL_ZONES));
3401 }
3402
3403 /*
3404 * Another interface to get_lock_list. This one returns all the active
3405 * nbmand locks for a given vnode. Again, see get_lock_list for details.
3406 *
3407 * See the comment for flk_active_locks_for_vp() for why we don't care to
3408 * specify the particular zone of interest.
3409 */
3410 locklist_t *
3411 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
3412 {
3413 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3414 NOPID, vp, ALL_ZONES));
3415 }
3416
3417 /*
3418 * Another interface to get_lock_list. This one returns all the active
3419 * nbmand locks for a given pid. Again, see get_lock_list for details.
3420 *
3421 * The zone doesn't need to be specified here; the locks held by a
3422 * particular process will either be local (ie, non-NFS) or from the zone
3423 * the process is executing in. This is because other parts of the system
3424 * ensure that an NFS vnode can't be used in a zone other than that in
3425 * which it was opened.
3426 */
3427 locklist_t *
3428 flk_active_nbmand_locks(pid_t pid)
3429 {
3430 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3431 pid, NULL, ALL_ZONES));
3432 }
3433
3434 /*
3435 * Free up all entries in the locklist.
3436 */
3437 void
3438 flk_free_locklist(locklist_t *llp)
3439 {
3440 locklist_t *next_llp;
3441
3442 while (llp) {
3443 next_llp = llp->ll_next;
3444 VN_RELE(llp->ll_vp);
3445 kmem_free(llp, sizeof (*llp));
3446 llp = next_llp;
3447 }
3448 }
3449
3450 static void
3451 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
3452 {
3453 /*
3454 * For each graph "lg" in the hash table lock_graph do
3455 * a. Get the list of sleeping locks
3456 * b. For each lock descriptor in the list do
3457 * i. If the requested lock is an NLM server request AND
3458 * the nlmid is the same as the routine argument then
3459 * change the lock descriptor's state field to
3460 * "nlm_state."
3461 * c. Get the list of active locks
3462 * d. For each lock descriptor in the list do
3463 * i. If the requested lock is an NLM server request AND
3464 * the nlmid is the same as the routine argument then
3465 * change the lock descriptor's state field to
3466 * "nlm_state."
3467 */
3468
3469 int i;
3470 graph_t *gp; /* lock graph */
3471 lock_descriptor_t *lock; /* lock */
3472 lock_descriptor_t *nlock = NULL; /* next lock */
3473 int lock_nlmid;
3474
3475 for (i = 0; i < HASH_SIZE; i++) {
3476 mutex_enter(&flock_lock);
3477 gp = lock_graph[i];
3478 mutex_exit(&flock_lock);
3479 if (gp == NULL) {
3480 continue;
3481 }
3482
3483 /* Get list of sleeping locks in current lock graph. */
3484 mutex_enter(&gp->gp_mutex);
3485 for (lock = SLEEPING_HEAD(gp)->l_next;
3486 lock != SLEEPING_HEAD(gp);
3487 lock = nlock) {
3488 nlock = lock->l_next;
3489 /* get NLM id */
3490 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3491
3492 /*
3493 * If NLM server request AND nlmid of lock matches
3494 * nlmid of argument, then set the NLM state of the
3495 * lock to "nlm_state."
3496 */
3497 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3498 SET_NLM_STATE(lock, nlm_state);
3499 }
3500 }
3501
3502 /* Get list of active locks in current lock graph. */
3503 for (lock = ACTIVE_HEAD(gp)->l_next;
3504 lock != ACTIVE_HEAD(gp);
3505 lock = nlock) {
3506 nlock = lock->l_next;
3507 /* get NLM id */
3508 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3509
3510 /*
3511 * If NLM server request AND nlmid of lock matches
3512 * nlmid of argument, then set the NLM state of the
3513 * lock to "nlm_state."
3514 */
3515 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3516 ASSERT(IS_ACTIVE(lock));
3517 SET_NLM_STATE(lock, nlm_state);
3518 }
3519 }
3520 mutex_exit(&gp->gp_mutex);
3521 }
3522 }
3523
3524 /*
3525 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
3526 * Effects: Find all sleeping lock manager requests _only_ for the NLM server
3527 * identified by "nlmid." Poke those lock requests.
3528 */
3529 static void
3530 cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
3531 {
3532 lock_descriptor_t *lock;
3533 lock_descriptor_t *nlock = NULL; /* next lock */
3534 int i;
3535 graph_t *gp;
3536 int lock_nlmid;
3537
3538 for (i = 0; i < HASH_SIZE; i++) {
3539 mutex_enter(&flock_lock);
3540 gp = lock_graph[i];
3541 mutex_exit(&flock_lock);
3542 if (gp == NULL) {
3543 continue;
3544 }
3545
3546 mutex_enter(&gp->gp_mutex);
3547 for (lock = SLEEPING_HEAD(gp)->l_next;
3548 lock != SLEEPING_HEAD(gp);
3549 lock = nlock) {
3550 nlock = lock->l_next;
3551 /*
3552 * If NLM server request _and_ nlmid of lock matches
3553 * nlmid of argument, then set the NLM state of the
3554 * lock to NLM_SHUTTING_DOWN, and wake up sleeping
3555 * request.
3556 */
3557 if (IS_LOCKMGR(lock)) {
3558 /* get NLM id */
3559 lock_nlmid =
3560 GETNLMID(lock->l_flock.l_sysid);
3561 if (nlmid == lock_nlmid) {
3562 SET_NLM_STATE(lock,
3563 FLK_NLM_SHUTTING_DOWN);
3564 INTERRUPT_WAKEUP(lock);
3565 }
3566 }
3567 }
3568 mutex_exit(&gp->gp_mutex);
3569 }
3570 }
3571
3572 /*
3573 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
3574 * Effects: Find all active (granted) lock manager locks _only_ for the
3575 * NLM server identified by "nlmid" and release them.
3576 */
3577 static void
3578 cl_flk_unlock_nlm_granted(int nlmid)
3579 {
3580 lock_descriptor_t *lock;
3581 lock_descriptor_t *nlock = NULL; /* next lock */
3582 int i;
3583 graph_t *gp;
3584 int lock_nlmid;
3585
3586 for (i = 0; i < HASH_SIZE; i++) {
3587 mutex_enter(&flock_lock);
3588 gp = lock_graph[i];
3589 mutex_exit(&flock_lock);
3590 if (gp == NULL) {
3591 continue;
3592 }
3593
3594 mutex_enter(&gp->gp_mutex);
3595 for (lock = ACTIVE_HEAD(gp)->l_next;
3596 lock != ACTIVE_HEAD(gp);
3597 lock = nlock) {
3598 nlock = lock->l_next;
3599 ASSERT(IS_ACTIVE(lock));
3600
3601 /*
3602 * If it's an NLM server request _and_ nlmid of
3603 * the lock matches nlmid of argument, then
3604 			 * remove the active lock from the list, wake up
3605 			 * blocked threads, and free the storage for the lock.
3606 * Note that there's no need to mark the NLM state
3607 * of this lock to NLM_DOWN because the lock will
3608 * be deleted anyway and its storage freed.
3609 */
3610 if (IS_LOCKMGR(lock)) {
3611 /* get NLM id */
3612 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3613 if (nlmid == lock_nlmid) {
3614 flk_delete_active_lock(lock, 0);
3615 flk_wakeup(lock, 1);
3616 flk_free_lock(lock);
3617 }
3618 }
3619 }
3620 mutex_exit(&gp->gp_mutex);
3621 }
3622 }
3623
3624 /*
3625 * Find all sleeping lock manager requests and poke them.
3626 */
3627 static void
3628 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
3629 {
3630 lock_descriptor_t *lock;
3631 lock_descriptor_t *nlock = NULL; /* next lock */
3632 int i;
3633 graph_t *gp;
3634 zoneid_t zoneid = getzoneid();
3635
3636 for (i = 0; i < HASH_SIZE; i++) {
3637 mutex_enter(&flock_lock);
3638 gp = lock_graph[i];
3639 mutex_exit(&flock_lock);
3640 if (gp == NULL) {
3641 continue;
3642 }
3643
3644 mutex_enter(&gp->gp_mutex);
3645 fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
3646 for (lock = SLEEPING_HEAD(gp)->l_next;
3647 lock != SLEEPING_HEAD(gp);
3648 lock = nlock) {
3649 nlock = lock->l_next;
3650 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3651 INTERRUPT_WAKEUP(lock);
3652 }
3653 }
3654 mutex_exit(&gp->gp_mutex);
3655 }
3656 }
3657
3658
3659 /*
3660 * Find all active (granted) lock manager locks and release them.
3661 */
3662 static void
3663 unlock_lockmgr_granted(struct flock_globals *fg)
3664 {
3665 lock_descriptor_t *lock;
3666 lock_descriptor_t *nlock = NULL; /* next lock */
3667 int i;
3668 graph_t *gp;
3669 zoneid_t zoneid = getzoneid();
3670
3671 for (i = 0; i < HASH_SIZE; i++) {
3672 mutex_enter(&flock_lock);
3673 gp = lock_graph[i];
3674 mutex_exit(&flock_lock);
3675 if (gp == NULL) {
3676 continue;
3677 }
3678
3679 mutex_enter(&gp->gp_mutex);
3680 fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
3681 for (lock = ACTIVE_HEAD(gp)->l_next;
3682 lock != ACTIVE_HEAD(gp);
3683 lock = nlock) {
3684 nlock = lock->l_next;
3685 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3686 ASSERT(IS_ACTIVE(lock));
3687 flk_delete_active_lock(lock, 0);
3688 flk_wakeup(lock, 1);
3689 flk_free_lock(lock);
3690 }
3691 }
3692 mutex_exit(&gp->gp_mutex);
3693 }
3694 }
3695 /* ONC_PLUS EXTRACT END */
3696
3697
3698 /*
3699 * Wait until a lock is granted, cancelled, or interrupted.
3700 */
3701
3702 static void
3703 wait_for_lock(lock_descriptor_t *request)
3704 {
3705 graph_t *gp = request->l_graph;
3706
3707 ASSERT(MUTEX_HELD(&gp->gp_mutex));
3708
3709 while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
3710 !(IS_INTERRUPTED(request))) {
3711 if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
3712 flk_set_state(request, FLK_INTERRUPTED_STATE);
3713 request->l_state |= INTERRUPTED_LOCK;
3714 }
3715 }
3716 }
3717
3718 /* ONC_PLUS EXTRACT START */
3719 /*
3720 * Create an flock structure from the existing lock information
3721 *
3722 * This routine is used to create flock structures for the lock manager
3723 * to use in a reclaim request. Since the lock was originated on this
3724  * host, it must conform to UNIX semantics, so no checking is
3725 * done to make sure it falls within the lower half of the 32-bit range.
3726 */
3727
3728 static void
3729 create_flock(lock_descriptor_t *lp, flock64_t *flp)
3730 {
3731 ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
3732 ASSERT(lp->l_end >= lp->l_start);
3733
3734 flp->l_type = lp->l_type;
3735 flp->l_whence = 0;
3736 flp->l_start = lp->l_start;
3737 flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
3738 (lp->l_end - lp->l_start + 1);
3739 flp->l_sysid = lp->l_flock.l_sysid;
3740 flp->l_pid = lp->l_flock.l_pid;
3741 }
3742
3743 /*
3744  * Convert flock64_t data describing a lock range into unsigned long starting
3745  * and ending points, which are returned through 'start' and 'end'.  Returns 0 or an
3746 * errno value.
3747 * Large Files: max is passed by the caller and we return EOVERFLOW
3748 * as defined by LFS API.
3749 */
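/*
 * Worked example (editorial): with l_whence == 0, l_start == 100 and
 * l_len == -10, *start is first set to 100; the negative-length branch
 * then yields *end == 100 and *start == 91, i.e., the ten bytes ending at
 * offset 100 inclusive.  A zero l_len always means "to end of file"
 * (*end == MAX_U_OFFSET_T).
 */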
3750
3751 int
3752 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
3753 u_offset_t *start, u_offset_t *end, offset_t offset)
3754 {
3755 struct vattr vattr;
3756 int error;
3757
3758 /*
3759 * Determine the starting point of the request
3760 */
3761 switch (flp->l_whence) {
3762 case 0: /* SEEK_SET */
3763 *start = (u_offset_t)flp->l_start;
3764 break;
3765 case 1: /* SEEK_CUR */
3766 *start = (u_offset_t)(flp->l_start + offset);
3767 break;
3768 case 2: /* SEEK_END */
3769 vattr.va_mask = AT_SIZE;
3770 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
3771 return (error);
3772 *start = (u_offset_t)(flp->l_start + vattr.va_size);
3773 break;
3774 default:
3775 return (EINVAL);
3776 }
3777
3778 /*
3779 * Determine the range covered by the request.
3780 */
3781 if (flp->l_len == 0)
3782 *end = MAX_U_OFFSET_T;
3783 else if ((offset_t)flp->l_len > 0) {
3784 *end = (u_offset_t)(*start + (flp->l_len - 1));
3785 } else {
3786 /*
3787 		 * Negative length; why do we even allow this?
3788 * Because this allows easy specification of
3789 * the last n bytes of the file.
3790 */
3791 *end = *start;
3792 *start += (u_offset_t)flp->l_len;
3793 (*start)++;
3794 }
3795 return (0);
3796 }
3797
3798 /*
3799  * Check the validity of lock data.  This can be used by the NFS
3800 * frlock routines to check data before contacting the server. The
3801 * server must support semantics that aren't as restrictive as
3802 * the UNIX API, so the NFS client is required to check.
3803 * The maximum is now passed in by the caller.
3804 */
3805
3806 int
3807 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
3808 {
3809 /*
3810 * The end (length) for local locking should never be greater
3811 * than MAXEND. However, the representation for
3812 * the entire file is MAX_U_OFFSET_T.
3813 */
3814 if ((start > max) ||
3815 ((end > max) && (end != MAX_U_OFFSET_T))) {
3816 return (EINVAL);
3817 }
3818 if (start > end) {
3819 return (EINVAL);
3820 }
3821 return (0);
3822 }
3823
3824 /*
3825 * Fill in request->l_flock with information about the lock blocking the
3826 * request. The complexity here is that lock manager requests are allowed
3827 * to see into the upper part of the 32-bit address range, whereas local
3828 * requests are only allowed to see signed values.
3829 *
3830 * What should be done when "blocker" is a lock manager lock that uses the
3831 * upper portion of the 32-bit range, but "request" is local? Since the
3832 * request has already been determined to have been blocked by the blocker,
3833 * at least some portion of "blocker" must be in the range of the request,
3834 * or the request extends to the end of file. For the first case, the
3835 * portion in the lower range is returned with the indication that it goes
3836 * "to EOF." For the second case, the last byte of the lower range is
3837 * returned with the indication that it goes "to EOF."
3838 */
3839
3840 static void
3841 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
3842 {
3843 flock64_t *flrp; /* l_flock portion of request */
3844
3845 ASSERT(blocker != NULL);
3846
3847 flrp = &request->l_flock;
3848 flrp->l_whence = 0;
3849 flrp->l_type = blocker->l_type;
3850 flrp->l_pid = blocker->l_flock.l_pid;
3851 flrp->l_sysid = blocker->l_flock.l_sysid;
3852
3853 if (IS_LOCKMGR(request)) {
3854 flrp->l_start = blocker->l_start;
3855 if (blocker->l_end == MAX_U_OFFSET_T)
3856 flrp->l_len = 0;
3857 else
3858 flrp->l_len = blocker->l_end - blocker->l_start + 1;
3859 } else {
3860 if (blocker->l_start > MAXEND) {
3861 flrp->l_start = MAXEND;
3862 flrp->l_len = 0;
3863 } else {
3864 flrp->l_start = blocker->l_start;
3865 if (blocker->l_end == MAX_U_OFFSET_T)
3866 flrp->l_len = 0;
3867 else
3868 flrp->l_len = blocker->l_end -
3869 blocker->l_start + 1;
3870 }
3871 }
3872 }
3873 /* ONC_PLUS EXTRACT END */
3874
3875 /*
3876 * PSARC case 1997/292
3877 */
3878 /*
3879 * This is the public routine exported by flock.h.
3880 */
3881 void
3882 cl_flk_change_nlm_state_to_unknown(int nlmid)
3883 {
3884 /*
3885 * Check to see if node is booted as a cluster. If not, return.
3886 */
3887 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3888 return;
3889 }
3890
3891 /*
3892 * See comment in cl_flk_set_nlm_status().
3893 */
3894 if (nlm_reg_status == NULL) {
3895 return;
3896 }
3897
3898 /*
3899 * protect NLM registry state with a mutex.
3900 */
3901 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3902 mutex_enter(&nlm_reg_lock);
3903 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
3904 mutex_exit(&nlm_reg_lock);
3905 }
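
/*
 * Illustrative sketch (hypothetical caller, not part of this file):
 * cluster-membership code that loses contact with, say, node 3 could
 * mark that node's NLM state unknown before deciding whether to
 * recover or discard its locks:
 *
 *	int nlmid = 3;		hypothetical node id
 *	cl_flk_change_nlm_state_to_unknown(nlmid);
 *
 * On a node that is not booted as part of a cluster the call is a
 * no-op, as the CLUSTER_BOOTED check above shows.
 */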

/*
 * Return non-zero if the given I/O request conflicts with an active NBMAND
 * lock.
 * If svmand is non-zero, all active locks are examined, not just NBMAND
 * locks.
 */

int
nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
    ssize_t length, int svmand, caller_context_t *ct)
{
	int conflict = 0;
	graph_t *gp;
	lock_descriptor_t *lock;
	pid_t pid;
	int sysid;

	if (ct == NULL) {
		pid = curproc->p_pid;
		sysid = 0;
	} else {
		pid = ct->cc_pid;
		sysid = ct->cc_sysid;
	}

	mutex_enter(&flock_lock);
	gp = lock_graph[HASH_INDEX(vp)];
	mutex_exit(&flock_lock);
	if (gp == NULL)
		return (0);

	mutex_enter(&gp->gp_mutex);
	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
		if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
		    (lock->l_flock.l_sysid != sysid ||
		    lock->l_flock.l_pid != pid) &&
		    lock_blocks_io(op, offset, length,
		    lock->l_type, lock->l_start, lock->l_end)) {
			conflict = 1;
			break;
		}
	}
	mutex_exit(&gp->gp_mutex);

	return (conflict);
}
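
/*
 * Hedged usage sketch (illustrative; see nbmlock.h for the real
 * interfaces, and treat the exact signatures here as assumptions): a
 * file system's write path typically reaches this code indirectly,
 * roughly along the lines of
 *
 *	if (nbl_need_check(vp)) {
 *		nbl_start_crit(vp, RW_READER);
 *		if (nbl_conflict(vp, NBL_WRITE, uiop->uio_loffset,
 *		    uiop->uio_resid, svmand, ct))
 *			error = EACCES;
 *		nbl_end_crit(vp);
 *	}
 *
 * where nbl_conflict() consults nbl_lock_conflict() above for
 * record-lock conflicts.
 */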

/*
 * Return non-zero if the given I/O request conflicts with the given lock.
 */

static int
lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
    int lock_type, u_offset_t lock_start, u_offset_t lock_end)
{
	ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
	ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);

	if (op == NBL_READ && lock_type == F_RDLCK)
		return (0);

	if (offset <= lock_start && lock_start < offset + length)
		return (1);
	if (lock_start <= offset && offset <= lock_end)
		return (1);

	return (0);
}
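
/*
 * Worked example of the overlap test above (hypothetical numbers,
 * illustrative only): for a write of length 10 at offset 100 against a
 * write lock covering [105, 120],
 *
 *	offset <= lock_start && lock_start < offset + length
 *	  100  <=    105     &&    105     <        110		-> conflict
 *
 * while the same write against a lock covering [50, 99] fails both
 * tests (99 < 100), so no conflict is reported.  A read against a read
 * lock never conflicts, as the first check shows.
 */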

#ifdef DEBUG
static void
check_active_locks(graph_t *gp)
{
	lock_descriptor_t *lock, *lock1;
	edge_t *ep;

	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
	    lock = lock->l_next) {
		ASSERT(IS_ACTIVE(lock));
		ASSERT(NOT_BLOCKED(lock));
		ASSERT(!IS_BARRIER(lock));

		ep = FIRST_IN(lock);

		while (ep != HEAD(lock)) {
			ASSERT(IS_SLEEPING(ep->from_vertex));
			ASSERT(!NOT_BLOCKED(ep->from_vertex));
			ep = NEXT_IN(ep);
		}

		for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
		    lock1 = lock1->l_next) {
			if (lock1->l_vnode == lock->l_vnode) {
				if (BLOCKS(lock1, lock)) {
					cmn_err(CE_PANIC,
					    "active lock %p blocks %p",
					    (void *)lock1, (void *)lock);
				} else if (BLOCKS(lock, lock1)) {
					cmn_err(CE_PANIC,
					    "active lock %p blocks %p",
					    (void *)lock, (void *)lock1);
				}
			}
		}
	}
}

/*
 * Effect: This function checks to see if the transition from 'old_state' to
 * 'new_state' is a valid one.  It returns 0 if the transition is valid
 * and 1 if it is not.
 * For a map of valid transitions, see sys/flock_impl.h
 */
static int
check_lock_transition(int old_state, int new_state)
{
	switch (old_state) {
	case FLK_INITIAL_STATE:
		if ((new_state == FLK_START_STATE) ||
		    (new_state == FLK_SLEEPING_STATE) ||
		    (new_state == FLK_ACTIVE_STATE) ||
		    (new_state == FLK_DEAD_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_START_STATE:
		if ((new_state == FLK_ACTIVE_STATE) ||
		    (new_state == FLK_DEAD_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_ACTIVE_STATE:
		if (new_state == FLK_DEAD_STATE) {
			return (0);
		} else {
			return (1);
		}
	case FLK_SLEEPING_STATE:
		if ((new_state == FLK_GRANTED_STATE) ||
		    (new_state == FLK_INTERRUPTED_STATE) ||
		    (new_state == FLK_CANCELLED_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_GRANTED_STATE:
		if ((new_state == FLK_START_STATE) ||
		    (new_state == FLK_INTERRUPTED_STATE) ||
		    (new_state == FLK_CANCELLED_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_CANCELLED_STATE:
		if ((new_state == FLK_INTERRUPTED_STATE) ||
		    (new_state == FLK_DEAD_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_INTERRUPTED_STATE:
		if (new_state == FLK_DEAD_STATE) {
			return (0);
		} else {
			return (1);
		}
	case FLK_DEAD_STATE:
		/* May be set more than once */
		if (new_state == FLK_DEAD_STATE) {
			return (0);
		} else {
			return (1);
		}
	default:
		return (1);
	}
}
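
/*
 * Illustrative calls (hypothetical, not compiled): following the
 * transition map in sys/flock_impl.h,
 *
 *	check_lock_transition(FLK_SLEEPING_STATE, FLK_GRANTED_STATE)
 *		returns 0	(a sleeping request may be granted)
 *	check_lock_transition(FLK_ACTIVE_STATE, FLK_SLEEPING_STATE)
 *		returns 1	(an active lock never goes back to sleep)
 *
 * The surrounding CHECK_LOCK_TRANSITION() macro panics only on calls of
 * the second kind.
 */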

static void
check_sleeping_locks(graph_t *gp)
{
	lock_descriptor_t *lock1, *lock2;
	edge_t *ep;
	for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
	    lock1 = lock1->l_next) {
		ASSERT(!IS_BARRIER(lock1));
		for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
		    lock2 = lock2->l_next) {
			if (lock1->l_vnode == lock2->l_vnode) {
				if (BLOCKS(lock2, lock1)) {
					ASSERT(!IS_GRANTED(lock1));
					ASSERT(!NOT_BLOCKED(lock1));
					path(lock1, lock2);
				}
			}
		}

		for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
		    lock2 = lock2->l_next) {
			ASSERT(!IS_BARRIER(lock1));
			if (lock1->l_vnode == lock2->l_vnode) {
				if (BLOCKS(lock2, lock1)) {
					ASSERT(!IS_GRANTED(lock1));
					ASSERT(!NOT_BLOCKED(lock1));
					path(lock1, lock2);
				}
			}
		}
		ep = FIRST_ADJ(lock1);
		while (ep != HEAD(lock1)) {
			ASSERT(BLOCKS(ep->to_vertex, lock1));
			ep = NEXT_ADJ(ep);
		}
	}
}

/*
 * Return 1 if lock2 is reachable from lock1 through a path of two or
 * more edges in the dependency graph, and 0 otherwise.  When no_path
 * is set, also assert that lock1 has no direct edge to lock2.
 */
static int
level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
{
	edge_t *ep;
	lock_descriptor_t *vertex;
	lock_descriptor_t *vertex_stack;

	STACK_INIT(vertex_stack);

	flk_graph_uncolor(lock1->l_graph);
	ep = FIRST_ADJ(lock1);
	ASSERT(ep != HEAD(lock1));
	while (ep != HEAD(lock1)) {
		if (no_path)
			ASSERT(ep->to_vertex != lock2);
		STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
		COLOR(ep->to_vertex);
		ep = NEXT_ADJ(ep);
	}

	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
		STACK_POP(vertex_stack, l_dstack);
		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
		    ep = NEXT_ADJ(ep)) {
			if (COLORED(ep->to_vertex))
				continue;
			COLOR(ep->to_vertex);
			if (ep->to_vertex == lock2)
				return (1);

			STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
		}
	}
	return (0);
}

static void
check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
{
	lock_descriptor_t *lock;

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock) {
		while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
			if (lock->l_flock.l_pid == pid &&
			    lock->l_flock.l_sysid == sysid)
				cmn_err(CE_PANIC,
				    "owner pid %d's lock %p in active queue",
				    pid, (void *)lock);
			lock = lock->l_next;
		}
	}
	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	if (lock) {
		while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
			if (lock->l_flock.l_pid == pid &&
			    lock->l_flock.l_sysid == sysid)
				cmn_err(CE_PANIC,
				    "owner pid %d's lock %p in sleep queue",
				    pid, (void *)lock);
			lock = lock->l_next;
		}
	}
}

/*
 * Return 1 if there is a direct edge from lock1 to lock2, 0 otherwise.
 */
static int
level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
	edge_t *ep = FIRST_ADJ(lock1);

	while (ep != HEAD(lock1)) {
		if (ep->to_vertex == lock2)
			return (1);
		else
			ep = NEXT_ADJ(ep);
	}
	return (0);
}

/*
 * Return 1 if lock2 is not reachable from lock1 (the caller has already
 * established that there is no direct edge between the two).
 */
static int
no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
	return (!level_two_path(lock1, lock2, 1));
}

/*
 * Consistency check for a sleeping lock (lock1) that is blocked by
 * lock2: warn if both a direct edge and a longer path exist, and panic
 * if lock2 is not reachable from lock1 at all.
 */
static void
path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
	if (level_one_path(lock1, lock2)) {
		if (level_two_path(lock1, lock2, 0) != 0) {
			cmn_err(CE_WARN,
			    "one edge one path from lock1 %p lock2 %p",
			    (void *)lock1, (void *)lock2);
		}
	} else if (no_path(lock1, lock2)) {
		cmn_err(CE_PANIC,
		    "No path from lock1 %p to lock2 %p",
		    (void *)lock1, (void *)lock2);
	}
}
#endif /* DEBUG */