xref: /netbsd-src/sys/kern/vfs_lockf.c (revision e94a5d02693120d4ad9d909e488894e9fcf0eb76)
1 /*	$NetBSD: vfs_lockf.c,v 1.83 2024/12/07 02:27:38 riastradh Exp $	*/
2 
3 /*
4  * Copyright (c) 1982, 1986, 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Scooter Morris at Genentech Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)ufs_lockf.c	8.4 (Berkeley) 10/26/94
35  */
36 
37 #include <sys/cdefs.h>
38 __KERNEL_RCSID(0, "$NetBSD: vfs_lockf.c,v 1.83 2024/12/07 02:27:38 riastradh Exp $");
39 
40 #include <sys/param.h>
41 #include <sys/types.h>
42 
43 #include <sys/atomic.h>
44 #include <sys/fcntl.h>
45 #include <sys/file.h>
46 #include <sys/kauth.h>
47 #include <sys/kernel.h>
48 #include <sys/kmem.h>
49 #include <sys/lockf.h>
50 #include <sys/proc.h>
51 #include <sys/sdt.h>
52 #include <sys/systm.h>
53 #include <sys/uidinfo.h>
54 #include <sys/vnode.h>
55 
56 /*
57  * The lockf structure is a kernel structure which contains the information
58  * associated with a byte range lock.  The lockf structures are linked into
59  * the vnode structure.  Locks are sorted by the starting byte of the lock for
60  * efficiency.
61  *
62  * lf_next is used for two purposes, depending on whether the lock is
63  * being held, or is in conflict with an existing lock.  If this lock
64  * is held, it indicates the next lock on the same vnode.
65  * For pending locks, if lock->lf_next is non-NULL, then lock->lf_block
66  * must be queued on the lf_blkhd TAILQ of lock->lf_next.
67  */
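/*
 * Illustration (editor's note, not part of the original source): a vnode
 * with held locks on bytes 0..9 and 10..19 and one pending exclusive
 * request for 5..15 would have *lf_head -> [0,9] -> [10,19], with the
 * pending lock queued on the lf_blkhd list of [0,9] (its first blocker)
 * and its lf_next pointing back at [0,9].
 */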
68 
69 TAILQ_HEAD(locklist, lockf);
70 
71 struct lockf {
72 	kcondvar_t lf_cv;	 /* Signalling */
73 	short	lf_flags;	 /* Lock semantics: F_POSIX, F_FLOCK, F_WAIT */
74 	short	lf_type;	 /* Lock type: F_RDLCK, F_WRLCK */
75 	off_t	lf_start;	 /* The byte # of the start of the lock */
76 	off_t	lf_end;		 /* The byte # of the end of the lock (-1=EOF)*/
77 	void	*lf_id;		 /* process or file description holding lock */
78 	struct	lockf **lf_head; /* Back pointer to the head of lockf list */
79 	struct	lockf *lf_next;	 /* Next lock on this vnode, or blocking lock */
80 	struct  locklist lf_blkhd; /* List of requests blocked on this lock */
81 	TAILQ_ENTRY(lockf) lf_block;/* A request waiting for a lock */
82 	struct	uidinfo *lf_uip; /* Cached pointer to uidinfo */
83 };
84 
85 /* Maximum length of sleep chains to traverse to try to detect deadlock. */
86 #define MAXDEPTH 50
87 
88 static kmutex_t lockf_lock __cacheline_aligned;
89 static char lockstr[] = "lockf";
90 
91 /*
92  * This variable controls the maximum number of processes that will
93  * be checked in doing deadlock detection.
94  */
95 int maxlockdepth = MAXDEPTH;
96 
97 #ifdef LOCKF_DEBUG
98 int	lockf_debug = 0;
99 #endif
100 
101 #define SELF	0x1
102 #define OTHERS	0x2
103 
104 /*
105  * XXX TODO
106  * Misc cleanups: "void *id" should be visible in the API as a
107  * "struct proc *".
108  * (This requires rototilling all VFS's which support advisory locking).
109  */
110 
111 /*
112  * If there's a lot of lock contention on a single vnode, locking
113  * schemes which allow for more parallelism would be needed.  Given how
114  * infrequently byte-range locks are actually used in typical BSD
115  * code, a more complex approach probably isn't worth it.
116  */
117 
118 /*
119  * We enforce a limit on locks by uid, so that a single user cannot
120  * run the kernel out of memory.  For now, the limit is pretty coarse.
121  * There is no limit on root.
122  *
123  * Splitting a lock will always succeed, regardless of current allocations.
124  * If you're slightly above the limit, we still have to permit an allocation
125  * so that the unlock can succeed.  If the unlocking causes too many splits,
126  * however, you're totally cut off.
127  */
128 #define MAXLOCKSPERUID (2 * maxfiles)
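/*
 * Example (editor's note, not part of the original source): with maxfiles
 * at 1000, an unprivileged uid may hold roughly 2000 lockf records.
 * lf_alloc() below refuses further allocations past that point when
 * allowfail == 1, past 4000 when allowfail == 2, and never when
 * allowfail == 0 or for root.
 */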
129 
130 #ifdef LOCKF_DEBUG
131 /*
132  * Print out a lock.
133  */
134 static void
135 lf_print(const char *tag, struct lockf *lock)
136 {
137 
138 	printf("%s: lock %p for ", tag, lock);
139 	if (lock->lf_flags & F_POSIX)
140 		printf("proc %d", ((struct proc *)lock->lf_id)->p_pid);
141 	else
142 		printf("file %p", (struct file *)lock->lf_id);
143 	printf(" %s, start %jd, end %jd",
144 		lock->lf_type == F_RDLCK ? "shared" :
145 		lock->lf_type == F_WRLCK ? "exclusive" :
146 		lock->lf_type == F_UNLCK ? "unlock" :
147 		"unknown", (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
148 	if (TAILQ_FIRST(&lock->lf_blkhd))
149 		printf(" block %p\n", TAILQ_FIRST(&lock->lf_blkhd));
150 	else
151 		printf("\n");
152 }
153 
154 static void
155 lf_printlist(const char *tag, struct lockf *lock)
156 {
157 	struct lockf *lf, *blk;
158 
159 	printf("%s: Lock list:\n", tag);
160 	for (lf = *lock->lf_head; lf; lf = lf->lf_next) {
161 		printf("\tlock %p for ", lf);
162 		if (lf->lf_flags & F_POSIX)
163 			printf("proc %d", ((struct proc *)lf->lf_id)->p_pid);
164 		else
165 			printf("file %p", (struct file *)lf->lf_id);
166 		printf(", %s, start %jd, end %jd",
167 		    lf->lf_type == F_RDLCK ? "shared" :
168 		    lf->lf_type == F_WRLCK ? "exclusive" :
169 		    lf->lf_type == F_UNLCK ? "unlock" :
170 		    "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end);
171 		TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
172 			if (blk->lf_flags & F_POSIX)
173 				printf("; proc %d",
174 				    ((struct proc *)blk->lf_id)->p_pid);
175 			else
176 				printf("; file %p", (struct file *)blk->lf_id);
177 			printf(", %s, start %jd, end %jd",
178 			    blk->lf_type == F_RDLCK ? "shared" :
179 			    blk->lf_type == F_WRLCK ? "exclusive" :
180 			    blk->lf_type == F_UNLCK ? "unlock" :
181 			    "unknown",
182 			    (intmax_t)blk->lf_start, (intmax_t)blk->lf_end);
183 			if (TAILQ_FIRST(&blk->lf_blkhd))
184 				 panic("lf_printlist: bad list");
185 		}
186 		printf("\n");
187 	}
188 }
189 #endif /* LOCKF_DEBUG */
190 
191 /*
192  * Three options for allowfail:
193  * 0 - always allocate.  1 - cut off at the limit.  2 - cut off at double the limit.
194  */
195 static struct lockf *
196 lf_alloc(int allowfail)
197 {
198 	struct uidinfo *uip;
199 	struct lockf *lock;
200 	u_long lcnt;
201 	const uid_t uid = kauth_cred_geteuid(kauth_cred_get());
202 
203 	uip = uid_find(uid);
204 	lcnt = atomic_inc_ulong_nv(&uip->ui_lockcnt);
205 	if (uid && allowfail && lcnt >
206 	    (allowfail == 1 ? MAXLOCKSPERUID : (MAXLOCKSPERUID * 2))) {
207 		atomic_dec_ulong(&uip->ui_lockcnt);
208 		return NULL;
209 	}
210 
211 	lock = kmem_alloc(sizeof(*lock), KM_SLEEP);
212 	lock->lf_uip = uip;
213 	cv_init(&lock->lf_cv, lockstr);
214 	return lock;
215 }
216 
217 static void
218 lf_free(struct lockf *lock)
219 {
220 
221 	atomic_dec_ulong(&lock->lf_uip->ui_lockcnt);
222 	cv_destroy(&lock->lf_cv);
223 	kmem_free(lock, sizeof(*lock));
224 }
225 
226 /*
227  * Walk the list of locks for an inode to
228  * find an overlapping lock (if any).
229  *
230  * NOTE: this returns only the FIRST overlapping lock.  There
231  *	 may be more than one.
232  */
233 static int
234 lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
235     struct lockf ***prev, struct lockf **overlap)
236 {
237 	off_t start, end;
238 
239 	*overlap = lf;
240 	if (lf == NULL)
241 		return 0;
242 #ifdef LOCKF_DEBUG
243 	if (lockf_debug & 2)
244 		lf_print("lf_findoverlap: looking for overlap in", lock);
245 #endif /* LOCKF_DEBUG */
246 	start = lock->lf_start;
247 	end = lock->lf_end;
248 	while (lf != NULL) {
249 		if (((type == SELF) && lf->lf_id != lock->lf_id) ||
250 		    ((type == OTHERS) && lf->lf_id == lock->lf_id)) {
251 			*prev = &lf->lf_next;
252 			*overlap = lf = lf->lf_next;
253 			continue;
254 		}
255 #ifdef LOCKF_DEBUG
256 		if (lockf_debug & 2)
257 			lf_print("\tchecking", lf);
258 #endif /* LOCKF_DEBUG */
259 		/*
260 		 * OK, check for overlap
261 		 *
262 		 * Six cases:
263 		 *	0) no overlap
264 		 *	1) overlap == lock
265 		 *	2) overlap contains lock
266 		 *	3) lock contains overlap
267 		 *	4) overlap starts before lock
268 		 *	5) overlap ends after lock
269 		 */
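		/*
		 * Worked example (editor's note, not part of the original
		 * source): with an existing lock lf covering bytes 10..20,
		 * a requested range [start,end] maps onto the cases above
		 * as follows (end == -1 means "to EOF"):
		 *
		 *	[30,40] or [0,5]  -> case 0 (no overlap)
		 *	[10,20]           -> case 1 (overlap == lock)
		 *	[12,15]           -> case 2 (overlap contains lock)
		 *	[ 5,25]           -> case 3 (lock contains overlap)
		 *	[15,25]           -> case 4 (overlap starts before lock)
		 *	[ 5,15]           -> case 5 (overlap ends after lock)
		 */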
270 		if ((lf->lf_end != -1 && start > lf->lf_end) ||
271 		    (end != -1 && lf->lf_start > end)) {
272 			/* Case 0 */
273 #ifdef LOCKF_DEBUG
274 			if (lockf_debug & 2)
275 				printf("no overlap\n");
276 #endif /* LOCKF_DEBUG */
277 			if ((type & SELF) && end != -1 && lf->lf_start > end)
278 				return 0;
279 			*prev = &lf->lf_next;
280 			*overlap = lf = lf->lf_next;
281 			continue;
282 		}
283 		if ((lf->lf_start == start) && (lf->lf_end == end)) {
284 			/* Case 1 */
285 #ifdef LOCKF_DEBUG
286 			if (lockf_debug & 2)
287 				printf("overlap == lock\n");
288 #endif /* LOCKF_DEBUG */
289 			return 1;
290 		}
291 		if ((lf->lf_start <= start) &&
292 		    (end != -1) &&
293 		    ((lf->lf_end >= end) || (lf->lf_end == -1))) {
294 			/* Case 2 */
295 #ifdef LOCKF_DEBUG
296 			if (lockf_debug & 2)
297 				printf("overlap contains lock\n");
298 #endif /* LOCKF_DEBUG */
299 			return 2;
300 		}
301 		if (start <= lf->lf_start &&
302 		           (end == -1 ||
303 			   (lf->lf_end != -1 && end >= lf->lf_end))) {
304 			/* Case 3 */
305 #ifdef LOCKF_DEBUG
306 			if (lockf_debug & 2)
307 				printf("lock contains overlap\n");
308 #endif /* LOCKF_DEBUG */
309 			return 3;
310 		}
311 		if ((lf->lf_start < start) &&
312 			((lf->lf_end >= start) || (lf->lf_end == -1))) {
313 			/* Case 4 */
314 #ifdef LOCKF_DEBUG
315 			if (lockf_debug & 2)
316 				printf("overlap starts before lock\n");
317 #endif /* LOCKF_DEBUG */
318 			return 4;
319 		}
320 		if ((lf->lf_start > start) &&
321 			(end != -1) &&
322 			((lf->lf_end > end) || (lf->lf_end == -1))) {
323 			/* Case 5 */
324 #ifdef LOCKF_DEBUG
325 			if (lockf_debug & 2)
326 				printf("overlap ends after lock\n");
327 #endif /* LOCKF_DEBUG */
328 			return 5;
329 		}
330 		panic("lf_findoverlap: default");
331 	}
332 	return 0;
333 }
334 
335 /*
336  * Split a lock and a contained region into
337  * two or three locks as necessary.
338  */
339 static void
340 lf_split(struct lockf *lock1, struct lockf *lock2, struct lockf **sparelock)
341 {
342 	struct lockf *splitlock;
343 
344 #ifdef LOCKF_DEBUG
345 	if (lockf_debug & 2) {
346 		lf_print("lf_split", lock1);
347 		lf_print("splitting from", lock2);
348 	}
349 #endif /* LOCKF_DEBUG */
350 	/*
351 	 * Check to see if splitting into only two pieces.
352 	 */
353 	if (lock1->lf_start == lock2->lf_start) {
354 		lock1->lf_start = lock2->lf_end + 1;
355 		lock2->lf_next = lock1;
356 		return;
357 	}
358 	if (lock1->lf_end == lock2->lf_end) {
359 		lock1->lf_end = lock2->lf_start - 1;
360 		lock2->lf_next = lock1->lf_next;
361 		lock1->lf_next = lock2;
362 		return;
363 	}
364 	/*
365 	 * Make a new lock consisting of the last part of
366 	 * the encompassing lock
367 	 */
368 	splitlock = *sparelock;
369 	*sparelock = NULL;
370 	cv_destroy(&splitlock->lf_cv);
371 	memcpy(splitlock, lock1, sizeof(*splitlock));
372 	cv_init(&splitlock->lf_cv, lockstr);
373 
374 	splitlock->lf_start = lock2->lf_end + 1;
375 	TAILQ_INIT(&splitlock->lf_blkhd);
376 	lock1->lf_end = lock2->lf_start - 1;
377 	/*
378 	 * OK, now link it in
379 	 */
380 	splitlock->lf_next = lock1->lf_next;
381 	lock2->lf_next = splitlock;
382 	lock1->lf_next = lock2;
383 }
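/*
 * Worked example (editor's note, not part of the original source): if
 * lock1 covers bytes 0..99 and lock2 covers 40..59, neither endpoint
 * coincides, so the code above trims lock1 to 0..39, turns *sparelock
 * into a copy covering 60..99, and links lock2 between the two pieces.
 */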
384 
385 /*
386  * Wake up a blocklist
387  */
388 static void
389 lf_wakelock(struct lockf *listhead)
390 {
391 	struct lockf *wakelock;
392 
393 	while ((wakelock = TAILQ_FIRST(&listhead->lf_blkhd))) {
394 		KASSERT(wakelock->lf_next == listhead);
395 		TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
396 		wakelock->lf_next = NULL;
397 #ifdef LOCKF_DEBUG
398 		if (lockf_debug & 2)
399 			lf_print("lf_wakelock: awakening", wakelock);
400 #endif
401 		cv_broadcast(&wakelock->lf_cv);
402 	}
403 }
404 
405 /*
406  * Remove a byte-range lock on an inode.
407  *
408  * Generally, find the lock (or an overlap to that lock)
409  * and remove it (or shrink it), then wake up anyone we can.
410  */
411 static int
412 lf_clearlock(struct lockf *unlock, struct lockf **sparelock)
413 {
414 	struct lockf **head = unlock->lf_head;
415 	struct lockf *lf = *head;
416 	struct lockf *overlap, **prev;
417 	int ovcase;
418 
419 	if (lf == NULL)
420 		return 0;
421 #ifdef LOCKF_DEBUG
422 	if (unlock->lf_type != F_UNLCK)
423 		panic("lf_clearlock: bad type");
424 	if (lockf_debug & 1)
425 		lf_print("lf_clearlock", unlock);
426 #endif /* LOCKF_DEBUG */
427 	prev = head;
428 	while ((ovcase = lf_findoverlap(lf, unlock, SELF,
429 	    &prev, &overlap)) != 0) {
430 		/*
431 		 * Wakeup the list of locks to be retried.
432 		 */
433 		lf_wakelock(overlap);
434 
435 		switch (ovcase) {
436 
437 		case 1: /* overlap == lock */
438 			*prev = overlap->lf_next;
439 			lf_free(overlap);
440 			break;
441 
442 		case 2: /* overlap contains lock: split it */
443 			if (overlap->lf_start == unlock->lf_start) {
444 				overlap->lf_start = unlock->lf_end + 1;
445 				break;
446 			}
447 			lf_split(overlap, unlock, sparelock);
448 			overlap->lf_next = unlock->lf_next;
449 			break;
450 
451 		case 3: /* lock contains overlap */
452 			*prev = overlap->lf_next;
453 			lf = overlap->lf_next;
454 			lf_free(overlap);
455 			continue;
456 
457 		case 4: /* overlap starts before lock */
458 			overlap->lf_end = unlock->lf_start - 1;
459 			prev = &overlap->lf_next;
460 			lf = overlap->lf_next;
461 			continue;
462 
463 		case 5: /* overlap ends after lock */
464 			overlap->lf_start = unlock->lf_end + 1;
465 			break;
466 		}
467 		break;
468 	}
469 #ifdef LOCKF_DEBUG
470 	if (lockf_debug & 1)
471 		lf_printlist("lf_clearlock", unlock);
472 #endif /* LOCKF_DEBUG */
473 	return 0;
474 }
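/*
 * Worked example (editor's note, not part of the original source): with
 * held locks on bytes 0..9 and 10..19 and an unlock request for 5..15,
 * the first overlap is case 4 (the first lock shrinks to 0..4) and the
 * second is case 5 (the second lock shrinks to 16..19).
 */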
475 
476 /*
477  * Walk the list of locks for an inode and
478  * return the first blocking lock.
479  */
480 static struct lockf *
481 lf_getblock(struct lockf *lock)
482 {
483 	struct lockf **prev, *overlap, *lf = *(lock->lf_head);
484 
485 	prev = lock->lf_head;
486 	while (lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != 0) {
487 		/*
488 		 * We've found an overlap; see if it blocks us
489 		 */
490 		if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
491 			return overlap;
492 		/*
493 		 * Nope, point to the next one on the list and
494 		 * see if it blocks us
495 		 */
496 		lf = overlap->lf_next;
497 	}
498 	return NULL;
499 }
500 
501 /*
502  * Set a byte-range lock.
503  */
504 static int
505 lf_setlock(struct lockf *lock, struct lockf **sparelock,
506     kmutex_t *interlock)
507 {
508 	struct lockf *block;
509 	struct lockf **head = lock->lf_head;
510 	struct lockf **prev, *overlap, *ltmp;
511 	int ovcase, needtolink, error;
512 
513 #ifdef LOCKF_DEBUG
514 	if (lockf_debug & 1)
515 		lf_print("lf_setlock", lock);
516 #endif /* LOCKF_DEBUG */
517 
518 	/*
519 	 * Scan lock list for this file looking for locks that would block us.
520 	 */
521 	while ((block = lf_getblock(lock)) != NULL) {
522 		/*
523 		 * Free the structure and return if nonblocking.
524 		 */
525 		if ((lock->lf_flags & F_WAIT) == 0) {
526 			lf_free(lock);
527 			return SET_ERROR(EAGAIN);
528 		}
529 		/*
530 		 * We are blocked. Since flock style locks cover
531 		 * the whole file, there is no chance for deadlock.
532 		 * For byte-range locks we must check for deadlock.
533 		 *
534 		 * Deadlock detection is done by looking through the
535 		 * wait channels to see if there are any cycles that
536 		 * involve us. MAXDEPTH is set just to make sure we
537 		 * do not go off into neverneverland.
538 		 */
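		/*
		 * Example (editor's note, not part of the original
		 * source): curproc holds a lock on bytes 10..19 and now
		 * requests 0..9, which process A holds; A is itself
		 * asleep in lf_setlock() waiting for 10..19.  Following
		 * A's wait channel leads back to curproc, so the check
		 * below returns EDEADLK instead of sleeping forever.
		 */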
539 		if ((lock->lf_flags & F_POSIX) &&
540 		    (block->lf_flags & F_POSIX)) {
541 			struct lwp *wlwp;
542 			volatile const struct lockf *waitblock;
543 			int i = 0;
544 			struct proc *p;
545 
546 			p = (struct proc *)block->lf_id;
547 			KASSERT(p != NULL);
548 			while (i++ < maxlockdepth) {
549 				mutex_enter(p->p_lock);
550 				if (p->p_nlwps > 1) {
551 					mutex_exit(p->p_lock);
552 					break;
553 				}
554 				wlwp = LIST_FIRST(&p->p_lwps);
555 				lwp_lock(wlwp);
556 				if (wlwp->l_wchan == NULL ||
557 				    wlwp->l_wmesg != lockstr) {
558 					lwp_unlock(wlwp);
559 					mutex_exit(p->p_lock);
560 					break;
561 				}
562 				waitblock = wlwp->l_wchan;
563 				lwp_unlock(wlwp);
564 				mutex_exit(p->p_lock);
565 				/* Get the owner of the blocking lock */
566 				waitblock = waitblock->lf_next;
567 				if ((waitblock->lf_flags & F_POSIX) == 0)
568 					break;
569 				p = (struct proc *)waitblock->lf_id;
570 				if (p == curproc) {
571 					lf_free(lock);
572 					return SET_ERROR(EDEADLK);
573 				}
574 			}
575 			/*
576 			 * If we're still following a dependency chain
577 			 * after maxlockdepth iterations, assume we're in
578 			 * a cycle to be safe.
579 			 */
580 			if (i >= maxlockdepth) {
581 				lf_free(lock);
582 				return SET_ERROR(EDEADLK);
583 			}
584 		}
585 		/*
586 		 * For flock type locks, we must first remove
587 		 * any shared locks that we hold before we sleep
588 		 * waiting for an exclusive lock.
589 		 */
590 		if ((lock->lf_flags & F_FLOCK) &&
591 		    lock->lf_type == F_WRLCK) {
592 			lock->lf_type = F_UNLCK;
593 			(void) lf_clearlock(lock, NULL);
594 			lock->lf_type = F_WRLCK;
595 		}
596 		/*
597 		 * Add our lock to the blocked list and sleep until we're free.
598 		 * Remember who blocked us (for deadlock detection).
599 		 */
600 		lock->lf_next = block;
601 		TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
602 #ifdef LOCKF_DEBUG
603 		if (lockf_debug & 1) {
604 			lf_print("lf_setlock: blocking on", block);
605 			lf_printlist("lf_setlock", block);
606 		}
607 #endif /* LOCKF_DEBUG */
608 		error = cv_wait_sig(&lock->lf_cv, interlock);
609 
610 		/*
611 		 * We may have been awoken by a signal (in
612 		 * which case we must remove ourselves from the
613 		 * blocked list) and/or by another process
614 		 * releasing a lock (in which case we have already
615 		 * been removed from the blocked list and our
616 		 * lf_next field set to NULL).
617 		 */
618 		if (lock->lf_next != NULL) {
619 			TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
620 			lock->lf_next = NULL;
621 		}
622 		if (error) {
623 			lf_free(lock);
624 			return error;
625 		}
626 	}
627 	/*
628 	 * No blocks!!  Add the lock.  Note that we will
629 	 * downgrade or upgrade any overlapping locks this
630 	 * process already owns.
631 	 *
632 	 * Skip over locks owned by other processes.
633 	 * Handle any locks that overlap and are owned by ourselves.
634 	 */
635 	prev = head;
636 	block = *head;
637 	needtolink = 1;
638 	for (;;) {
639 		ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
640 		if (ovcase)
641 			block = overlap->lf_next;
642 		/*
643 		 * Six cases:
644 		 *	0) no overlap
645 		 *	1) overlap == lock
646 		 *	2) overlap contains lock
647 		 *	3) lock contains overlap
648 		 *	4) overlap starts before lock
649 		 *	5) overlap ends after lock
650 		 */
651 		switch (ovcase) {
652 		case 0: /* no overlap */
653 			if (needtolink) {
654 				*prev = lock;
655 				lock->lf_next = overlap;
656 			}
657 			break;
658 
659 		case 1: /* overlap == lock */
660 			/*
661 			 * If downgrading lock, others may be
662 			 * able to acquire it.
663 			 */
664 			if (lock->lf_type == F_RDLCK &&
665 			    overlap->lf_type == F_WRLCK)
666 				lf_wakelock(overlap);
667 			overlap->lf_type = lock->lf_type;
668 			lf_free(lock);
669 			lock = overlap; /* for debug output below */
670 			break;
671 
672 		case 2: /* overlap contains lock */
673 			/*
674 			 * Check for common starting point and different types.
675 			 */
676 			if (overlap->lf_type == lock->lf_type) {
677 				lf_free(lock);
678 				lock = overlap; /* for debug output below */
679 				break;
680 			}
681 			if (overlap->lf_start == lock->lf_start) {
682 				*prev = lock;
683 				lock->lf_next = overlap;
684 				overlap->lf_start = lock->lf_end + 1;
685 			} else
686 				lf_split(overlap, lock, sparelock);
687 			lf_wakelock(overlap);
688 			break;
689 
690 		case 3: /* lock contains overlap */
691 			/*
692 			 * If downgrading lock, others may be able to
693 			 * acquire it, otherwise take the list.
694 			 */
695 			if (lock->lf_type == F_RDLCK &&
696 			    overlap->lf_type == F_WRLCK) {
697 				lf_wakelock(overlap);
698 			} else {
699 				while ((ltmp =
700 					TAILQ_FIRST(&overlap->lf_blkhd))
701 				    != NULL) {
702 					KASSERT(ltmp->lf_next == overlap);
703 					TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
704 					    lf_block);
705 					ltmp->lf_next = lock;
706 					TAILQ_INSERT_TAIL(&lock->lf_blkhd,
707 					    ltmp, lf_block);
708 				}
709 			}
710 			/*
711 			 * Add the new lock if necessary and delete the
712 			 * overlap.
713 			 */
714 			if (needtolink) {
715 				*prev = lock;
716 				lock->lf_next = overlap->lf_next;
717 				prev = &lock->lf_next;
718 				needtolink = 0;
719 			} else
720 				*prev = overlap->lf_next;
721 			lf_free(overlap);
722 			continue;
723 
724 		case 4: /* overlap starts before lock */
725 			/*
726 			 * Add lock after overlap on the list.
727 			 */
728 			lock->lf_next = overlap->lf_next;
729 			overlap->lf_next = lock;
730 			overlap->lf_end = lock->lf_start - 1;
731 			prev = &lock->lf_next;
732 			lf_wakelock(overlap);
733 			needtolink = 0;
734 			continue;
735 
736 		case 5: /* overlap ends after lock */
737 			/*
738 			 * Add the new lock before overlap.
739 			 */
740 			if (needtolink) {
741 				*prev = lock;
742 				lock->lf_next = overlap;
743 			}
744 			overlap->lf_start = lock->lf_end + 1;
745 			lf_wakelock(overlap);
746 			break;
747 		}
748 		break;
749 	}
750 #ifdef LOCKF_DEBUG
751 	if (lockf_debug & 1) {
752 		lf_print("lf_setlock: got the lock", lock);
753 		lf_printlist("lf_setlock", lock);
754 	}
755 #endif /* LOCKF_DEBUG */
756 	return 0;
757 }
758 
759 /*
760  * Check whether there is a blocking lock,
761  * and if so return its process identifier.
762  */
763 static int
764 lf_getlock(struct lockf *lock, struct flock *fl)
765 {
766 	struct lockf *block;
767 
768 #ifdef LOCKF_DEBUG
769 	if (lockf_debug & 1)
770 		lf_print("lf_getlock", lock);
771 #endif /* LOCKF_DEBUG */
772 
773 	if ((block = lf_getblock(lock)) != NULL) {
774 		fl->l_type = block->lf_type;
775 		fl->l_whence = SEEK_SET;
776 		fl->l_start = block->lf_start;
777 		if (block->lf_end == -1)
778 			fl->l_len = 0;
779 		else
780 			fl->l_len = block->lf_end - block->lf_start + 1;
781 		if (block->lf_flags & F_POSIX)
782 			fl->l_pid = ((struct proc *)block->lf_id)->p_pid;
783 		else
784 			fl->l_pid = -1;
785 	} else {
786 		fl->l_type = F_UNLCK;
787 	}
788 	return 0;
789 }
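/*
 * For reference, a minimal userland sketch of what reaches this function
 * via fcntl(F_GETLK) (editor's note, not part of the original source;
 * error handling omitted):
 *
 *	struct flock fl;
 *
 *	fl.l_type = F_WRLCK;	// the lock we would like to take
 *	fl.l_whence = SEEK_SET;
 *	fl.l_start = 0;
 *	fl.l_len = 0;		// 0 means "to end of file"
 *	if (fcntl(fd, F_GETLK, &fl) == 0 && fl.l_type != F_UNLCK)
 *		printf("blocked by pid %d\n", (int)fl.l_pid);
 *
 * If nothing conflicts, l_type comes back as F_UNLCK; otherwise the
 * fields describe the first blocking lock, as filled in above.
 */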
790 
791 /*
792  * Do an advisory lock operation.
793  */
794 int
795 lf_advlock(struct vop_advlock_args *ap, struct lockf **head, off_t size)
796 {
797 	struct flock *fl = ap->a_fl;
798 	struct lockf *lock = NULL;
799 	struct lockf *sparelock;
800 	kmutex_t *interlock = &lockf_lock;
801 	off_t start, end;
802 	int error = 0;
803 
804 	KASSERTMSG(size >= 0, "size=%jd", (intmax_t)size);
805 
806 	/*
807 	 * Convert the flock structure into a start and end.
808 	 */
809 	switch (fl->l_whence) {
810 	case SEEK_SET:
811 	case SEEK_CUR:
812 		/*
813 		 * Caller is responsible for adding any necessary offset
814 		 * when SEEK_CUR is used.
815 		 */
816 		start = fl->l_start;
817 		break;
818 
819 	case SEEK_END:
820 		if (fl->l_start > __type_max(off_t) - size)
821 			return SET_ERROR(EINVAL);
822 		start = size + fl->l_start;
823 		break;
824 
825 	default:
826 		return SET_ERROR(EINVAL);
827 	}
828 
829 	if (fl->l_len == 0)
830 		end = -1;
831 	else {
832 		if (fl->l_len >= 0) {
833 			if (start >= 0 &&
834 			    fl->l_len - 1 > __type_max(off_t) - start)
835 				return SET_ERROR(EINVAL);
836 			end = start + (fl->l_len - 1);
837 		} else {
838 			/* lockf() allows negative lengths */
839 			if (start < 0)
840 				return SET_ERROR(EINVAL);
841 			end = start - 1;
842 			start += fl->l_len;
843 		}
844 	}
845 	if (start < 0)
846 		return SET_ERROR(EINVAL);
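	/*
	 * Conversion examples (editor's note, not part of the original
	 * source):
	 *
	 *	l_whence=SEEK_SET, l_start=100, l_len=50  -> [100, 149]
	 *	l_whence=SEEK_SET, l_start=100, l_len=0   -> [100, -1] (EOF)
	 *	l_whence=SEEK_SET, l_start=100, l_len=-50 -> [50, 99]
	 *	l_whence=SEEK_END, l_start=0,   l_len=0   -> [size, -1]
	 */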
847 
848 	/*
849 	 * Allocate locks before acquiring the interlock.  We need two
850 	 * locks in the worst case.
851 	 */
852 	switch (ap->a_op) {
853 	case F_SETLK:
854 	case F_UNLCK:
855 		/*
856 		 * XXX For F_UNLCK case, we can re-use the lock.
857 		 */
858 		if ((ap->a_flags & F_FLOCK) == 0) {
859 			/*
860 			 * Byte-range lock might need one more lock.
861 			 */
862 			sparelock = lf_alloc(0);
863 			if (sparelock == NULL) {
864 				error = SET_ERROR(ENOMEM);
865 				goto quit;
866 			}
867 			break;
868 		}
869 		/* FALLTHROUGH */
870 
871 	case F_GETLK:
872 		sparelock = NULL;
873 		break;
874 
875 	default:
876 		return SET_ERROR(EINVAL);
877 	}
878 
879 	switch (ap->a_op) {
880 	case F_SETLK:
881 		lock = lf_alloc(1);
882 		break;
883 	case F_UNLCK:
884 		if (start == 0 || end == -1) {
885 			/* never split */
886 			lock = lf_alloc(0);
887 		} else {
888 			/* might split */
889 			lock = lf_alloc(2);
890 		}
891 		break;
892 	case F_GETLK:
893 		lock = lf_alloc(0);
894 		break;
895 	}
896 	if (lock == NULL) {
897 		error = SET_ERROR(ENOMEM);
898 		goto quit;
899 	}
900 
901 	mutex_enter(interlock);
902 
903 	/*
904 	 * Avoid the common case of unlocking when the inode has no locks.
905 	 */
906 	if (*head == (struct lockf *)0) {
907 		if (ap->a_op != F_SETLK) {
908 			fl->l_type = F_UNLCK;
909 			error = 0;
910 			goto quit_unlock;
911 		}
912 	}
913 
914 	/*
915 	 * Create the lockf structure.
916 	 */
917 	lock->lf_start = start;
918 	lock->lf_end = end;
919 	lock->lf_head = head;
920 	lock->lf_type = fl->l_type;
921 	lock->lf_next = (struct lockf *)0;
922 	TAILQ_INIT(&lock->lf_blkhd);
923 	lock->lf_flags = ap->a_flags;
924 	if (lock->lf_flags & F_POSIX) {
925 		KASSERT(curproc == (struct proc *)ap->a_id);
926 	}
927 	lock->lf_id = ap->a_id;
928 
929 	/*
930 	 * Do the requested operation.
931 	 */
932 	switch (ap->a_op) {
933 
934 	case F_SETLK:
935 		error = lf_setlock(lock, &sparelock, interlock);
936 		lock = NULL; /* lf_setlock freed it */
937 		break;
938 
939 	case F_UNLCK:
940 		error = lf_clearlock(lock, &sparelock);
941 		break;
942 
943 	case F_GETLK:
944 		error = lf_getlock(lock, fl);
945 		break;
946 
947 	default:
948 		break;
949 		/* NOTREACHED */
950 	}
951 
952 quit_unlock:
953 	mutex_exit(interlock);
954 quit:
955 	if (lock)
956 		lf_free(lock);
957 	if (sparelock)
958 		lf_free(sparelock);
959 
960 	return error;
961 }
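/*
 * Sketch of a typical caller (editor's note, not part of the original
 * source): a filesystem's advlock vnode operation usually just forwards
 * to lf_advlock(), passing its per-node lock list head and current file
 * size.  The names below are hypothetical placeholders, not a real
 * filesystem's layout:
 *
 *	int
 *	examplefs_advlock(void *v)
 *	{
 *		struct vop_advlock_args *ap = v;
 *		struct examplefs_node *np = VTOEXAMPLE(ap->a_vp);
 *
 *		return lf_advlock(ap, &np->n_lockf, np->n_size);
 *	}
 */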
962 
963 /*
964  * Initialize subsystem.
965  *
966  * XXX We use a global lock.  This could be the vnode interlock, but
967  * the deadlock detection code may need to inspect locks belonging to
968  * other files.
969  */
970 void
971 lf_init(void)
972 {
973 
974 	mutex_init(&lockf_lock, MUTEX_DEFAULT, IPL_NONE);
975 }
976