xref: /netbsd-src/sys/fs/udf/udf_strat_rmw.c (revision cac8e449158efc7261bebc8657cbb0125a2cfdde)
1 /* $NetBSD: udf_strat_rmw.c,v 1.6 2008/07/28 19:41:13 reinoud Exp $ */
2 
3 /*
4  * Copyright (c) 2006, 2008 Reinoud Zandijk
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  */
28 
29 #include <sys/cdefs.h>
30 #ifndef lint
31 __KERNEL_RCSID(0, "$NetBSD: udf_strat_rmw.c,v 1.6 2008/07/28 19:41:13 reinoud Exp $");
32 #endif /* not lint */
33 
34 
35 #if defined(_KERNEL_OPT)
36 #include "opt_quota.h"
37 #include "opt_compat_netbsd.h"
38 #endif
39 
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/sysctl.h>
43 #include <sys/namei.h>
44 #include <sys/proc.h>
45 #include <sys/kernel.h>
46 #include <sys/vnode.h>
47 #include <miscfs/genfs/genfs_node.h>
48 #include <sys/mount.h>
49 #include <sys/buf.h>
50 #include <sys/file.h>
51 #include <sys/device.h>
52 #include <sys/disklabel.h>
53 #include <sys/ioctl.h>
54 #include <sys/malloc.h>
55 #include <sys/dirent.h>
56 #include <sys/stat.h>
57 #include <sys/conf.h>
58 #include <sys/kauth.h>
59 #include <sys/kthread.h>
60 #include <dev/clock_subr.h>
61 
62 #include <fs/udf/ecma167-udf.h>
63 #include <fs/udf/udf_mount.h>
64 
65 #if defined(_KERNEL_OPT)
66 #include "opt_udf.h"
67 #endif
68 
69 #include "udf.h"
70 #include "udf_subr.h"
71 #include "udf_bswap.h"
72 
73 
/* accessors for the private data hanging off vnodes, mounts and bufs */
#define VTOI(vnode)	((struct udf_node *) (vnode)->v_data)
#define PRIV(ump)	((struct strat_private *) (ump)->strategy_private)
#define BTOE(buf)	((struct udf_eccline *) ((buf)->b_private))

/* --------------------------------------------------------------------- */

/*
 * Upper bound on the number of sectors in one packet; it sizes the per
 * sector arrays in struct udf_eccline.  DONT change this
 */
#define UDF_MAX_PACKET_SIZE	64

/*
 * scheduler states; these double as indices into the per-queue arrays of
 * struct strat_private (sized UDF_SHED_MAX).  Index 0 is never used as a
 * queue: a `queued_on' of 0 means "not on any queue".
 */
#define UDF_SHED_MAX		6
#define UDF_SHED_READING	1
#define UDF_SHED_WRITING	2
#define UDF_SHED_SEQWRITING	3
#define UDF_SHED_IDLE		4	/* resting */
#define UDF_SHED_FREE		5	/* recycleable */

/* eccline flags */
#define ECC_LOCKED		(1 << 0)	/* prevent access   */
#define ECC_WANTED		(1 << 1)	/* trying access    */
#define ECC_SEQWRITING		(1 << 2)	/* sequential queue */
#define ECC_FLOATING		(1 << 3)	/* not queued yet   */
95 
96 
97 TAILQ_HEAD(ecclineq, udf_eccline);
98 struct udf_eccline {
99 	struct udf_mount	 *ump;
100 	uint64_t		  present;		/* preserve these */
101 	uint64_t		  readin;		/* bitmap */
102 	uint64_t		  dirty;		/* bitmap */
103 	uint64_t		  error;		/* bitmap */
104 	uint32_t		  refcnt;
105 
106 	uint32_t		  flags;
107 	uint32_t		  start_sector;		/* physical */
108 
109 	struct buf		 *buf;
110 	void			 *blob;
111 
112 	struct buf		 *bufs[UDF_MAX_PACKET_SIZE];
113 	uint32_t		  bufs_bpos[UDF_MAX_PACKET_SIZE];
114 	int			  bufs_len[UDF_MAX_PACKET_SIZE];
115 
116 	int			  queued_on;		/* on which BUFQ list */
117 	LIST_ENTRY(udf_eccline)   hashchain;		/* on sector lookup  */
118 };
119 
120 
121 struct strat_private {
122 	lwp_t			 *queue_lwp;
123 	kcondvar_t		  discstrat_cv;		/* to wait on       */
124 	kmutex_t		  discstrat_mutex;	/* disc strategy    */
125 	kmutex_t		  seqwrite_mutex;	/* protect mappings */
126 
127 	int			  run_thread;		/* thread control */
128 	int			  thread_finished;	/* thread control */
129 	int			  cur_queue;
130 
131 	int			  num_floating;
132 	int			  num_queued[UDF_SHED_MAX];
133 	struct bufq_state	 *queues[UDF_SHED_MAX];
134 	struct timespec		  last_queued[UDF_SHED_MAX];
135 	struct disk_strategy	  old_strategy_setting;
136 
137 	struct pool		  eccline_pool;
138 	struct pool		  ecclineblob_pool;
139 	LIST_HEAD(, udf_eccline)  eccline_hash[UDF_ECCBUF_HASHSIZE];
140 };
141 
142 /* --------------------------------------------------------------------- */
143 
/* wrappers around the eccline lock functions defined below */
#define UDF_LOCK_ECCLINE(eccline)	udf_lock_eccline(eccline)
#define UDF_UNLOCK_ECCLINE(eccline)	udf_unlock_eccline(eccline)
146 
147 /* can be called with or without discstrat lock */
148 static void
149 udf_lock_eccline(struct udf_eccline *eccline)
150 {
151 	struct strat_private *priv = PRIV(eccline->ump);
152 	int waslocked, ret;
153 
154 	waslocked = mutex_owned(&priv->discstrat_mutex);
155 	if (!waslocked)
156 		mutex_enter(&priv->discstrat_mutex);
157 
158 	/* wait until its unlocked first */
159 	while (eccline->flags & ECC_LOCKED) {
160 		eccline->flags |= ECC_WANTED;
161 		ret = cv_timedwait(&priv->discstrat_cv, &priv->discstrat_mutex,
162 			hz/8);
163 		if (ret == EWOULDBLOCK)
164 			DPRINTF(LOCKING, ("eccline lock helt, waiting for "
165 				"release"));
166 	}
167 	eccline->flags |= ECC_LOCKED;
168 	eccline->flags &= ~ECC_WANTED;
169 
170 	if (!waslocked)
171 		mutex_exit(&priv->discstrat_mutex);
172 }
173 
174 
175 /* can be called with or without discstrat lock */
176 static void
177 udf_unlock_eccline(struct udf_eccline *eccline)
178 {
179 	struct strat_private *priv = PRIV(eccline->ump);
180 	int waslocked;
181 
182 	KASSERT(mutex_owned(&priv->discstrat_mutex));
183 
184 	waslocked = mutex_owned(&priv->discstrat_mutex);
185 	if (!waslocked)
186 		mutex_enter(&priv->discstrat_mutex);
187 
188 	eccline->flags &= ~ECC_LOCKED;
189 	cv_broadcast(&priv->discstrat_cv);
190 
191 	if (!waslocked)
192 		mutex_exit(&priv->discstrat_mutex);
193 }
194 
195 
196 /* NOTE discstrat_mutex should be held! */
197 static void
198 udf_dispose_eccline(struct udf_eccline *eccline)
199 {
200 	struct strat_private *priv = PRIV(eccline->ump);
201 	struct buf *ret;
202 
203 	KASSERT(mutex_owned(&priv->discstrat_mutex));
204 
205 	KASSERT(eccline->refcnt == 0);
206 	KASSERT(eccline->dirty  == 0);
207 
208 	DPRINTF(ECCLINE, ("dispose eccline with start sector %d, "
209 		"present %0"PRIx64"\n", eccline->start_sector,
210 		eccline->present));
211 
212 	if (eccline->queued_on) {
213 		ret = BUFQ_CANCEL(priv->queues[eccline->queued_on], eccline->buf);
214 		KASSERT(ret == eccline->buf);
215 		priv->num_queued[eccline->queued_on]--;
216 	}
217 	LIST_REMOVE(eccline, hashchain);
218 
219 	if (eccline->flags & ECC_FLOATING) {
220 		eccline->flags &= ~ECC_FLOATING;
221 		priv->num_floating--;
222 	}
223 
224 	putiobuf(eccline->buf);
225 	pool_put(&priv->ecclineblob_pool, eccline->blob);
226 	pool_put(&priv->eccline_pool, eccline);
227 }
228 
229 
230 /* NOTE discstrat_mutex should be held! */
231 static void
232 udf_push_eccline(struct udf_eccline *eccline, int newqueue)
233 {
234 	struct strat_private *priv = PRIV(eccline->ump);
235 	struct buf *ret;
236 	int curqueue;
237 
238 	KASSERT(mutex_owned(&priv->discstrat_mutex));
239 
240 	DPRINTF(PARANOIA, ("DEBUG: buf %p pushed on queue %d\n", eccline->buf, newqueue));
241 
242 	/* requeue */
243 	curqueue = eccline->queued_on;
244 	if (curqueue) {
245 		ret = BUFQ_CANCEL(priv->queues[curqueue], eccline->buf);
246 
247 		DPRINTF(PARANOIA, ("push_eccline BUFQ_CANCEL returned %p when "
248 			"requested to remove %p from queue %d\n", ret,
249 			eccline->buf, curqueue));
250 #ifdef DIAGNOSTIC
251 		if (ret == NULL) {
252 			int i;
253 
254 			printf("udf_push_eccline: bufq_cancel can't find "
255 				"buffer; dumping queues\n");
256 			for (i = 1; i < UDF_SHED_MAX; i++) {
257 				printf("queue %d\n\t", i);
258 				ret = BUFQ_GET(priv->queues[i]);
259 				while (ret) {
260 					printf("%p ", ret);
261 					if (ret == eccline->buf)
262 						printf("[<-] ");
263 					ret = BUFQ_GET(priv->queues[i]);
264 				}
265 				printf("\n");
266 			}
267 			panic("fatal queue bug; exit");
268 		}
269 #endif
270 
271 		KASSERT(ret == eccline->buf);
272 		priv->num_queued[curqueue]--;
273 	}
274 
275 	BUFQ_PUT(priv->queues[newqueue], eccline->buf);
276 	eccline->queued_on = newqueue;
277 	priv->num_queued[newqueue]++;
278 	vfs_timestamp(&priv->last_queued[newqueue]);
279 
280 	if (eccline->flags & ECC_FLOATING) {
281 		eccline->flags &= ~ECC_FLOATING;
282 		priv->num_floating--;
283 	}
284 
285 	if ((newqueue != UDF_SHED_FREE) && (newqueue != UDF_SHED_IDLE))
286 		cv_signal(&priv->discstrat_cv);
287 }
288 
289 
290 static struct udf_eccline *
291 udf_pop_eccline(struct strat_private *priv, int queued_on)
292 {
293 	struct udf_eccline *eccline;
294 	struct buf *buf;
295 
296 	KASSERT(mutex_owned(&priv->discstrat_mutex));
297 
298 	buf = BUFQ_GET(priv->queues[queued_on]);
299 	if (!buf) {
300 		KASSERT(priv->num_queued[queued_on] == 0);
301 		return NULL;
302 	}
303 
304 	eccline = BTOE(buf);
305 	KASSERT(eccline->queued_on == queued_on);
306 	eccline->queued_on = 0;
307 	priv->num_queued[queued_on]--;
308 
309 	if (eccline->flags & ECC_FLOATING)
310 		panic("popping already marked floating eccline");
311 	eccline->flags |= ECC_FLOATING;
312 	priv->num_floating++;
313 
314 	DPRINTF(PARANOIA, ("DEBUG: buf %p popped from queue %d\n",
315 		eccline->buf, queued_on));
316 
317 	return eccline;
318 }
319 
320 
321 static struct udf_eccline *
322 udf_geteccline(struct udf_mount *ump, uint32_t sector, int flags)
323 {
324 	struct strat_private *priv = PRIV(ump);
325 	struct udf_eccline *eccline;
326 	uint32_t start_sector, lb_size, blobsize;
327 	uint8_t *eccline_blob;
328 	int line, line_offset;
329 	int num_busy, ret;
330 
331 	line_offset  = sector % ump->packet_size;
332 	start_sector = sector - line_offset;
333 	line = (start_sector/ump->packet_size) & UDF_ECCBUF_HASHMASK;
334 
335 	mutex_enter(&priv->discstrat_mutex);
336 
337 retry:
338 	DPRINTF(ECCLINE, ("get line sector %d, line %d\n", sector, line));
339 	LIST_FOREACH(eccline, &priv->eccline_hash[line], hashchain) {
340 		if (eccline->start_sector == start_sector) {
341 			DPRINTF(ECCLINE, ("\tfound eccline, start_sector %d\n",
342 				eccline->start_sector));
343 
344 			UDF_LOCK_ECCLINE(eccline);
345 			/* move from freelist (!) */
346 			if (eccline->queued_on == UDF_SHED_FREE) {
347 				DPRINTF(ECCLINE, ("was on freelist\n"));
348 				KASSERT(eccline->refcnt == 0);
349 				udf_push_eccline(eccline, UDF_SHED_IDLE);
350 			}
351 			eccline->refcnt++;
352 			mutex_exit(&priv->discstrat_mutex);
353 			return eccline;
354 		}
355 	}
356 
357 	DPRINTF(ECCLINE, ("\tnot found in eccline cache\n"));
358 	/* not found in eccline cache */
359 
360 	lb_size  = udf_rw32(ump->logical_vol->lb_size);
361 	blobsize = ump->packet_size * lb_size;
362 
363 	/* dont allow too many pending requests */
364 	DPRINTF(ECCLINE, ("\tallocating new eccline\n"));
365 	num_busy = (priv->num_queued[UDF_SHED_SEQWRITING] + priv->num_floating);
366 	if ((flags & ECC_SEQWRITING) && (num_busy > UDF_ECCLINE_MAXBUSY)) {
367 		ret = cv_timedwait(&priv->discstrat_cv,
368 			&priv->discstrat_mutex, hz/8);
369 		goto retry;
370 	}
371 
372 	eccline_blob = pool_get(&priv->ecclineblob_pool, PR_NOWAIT);
373 	eccline = pool_get(&priv->eccline_pool, PR_NOWAIT);
374 	if ((eccline_blob == NULL) || (eccline == NULL)) {
375 		if (eccline_blob)
376 			pool_put(&priv->ecclineblob_pool, eccline_blob);
377 		if (eccline)
378 			pool_put(&priv->eccline_pool, eccline);
379 
380 		/* out of memory for now; canibalise freelist */
381 		eccline = udf_pop_eccline(priv, UDF_SHED_FREE);
382 		if (eccline == NULL) {
383 			/* serious trouble; wait and retry */
384 			cv_timedwait(&priv->discstrat_cv,
385 				&priv->discstrat_mutex, hz/8);
386 			goto retry;
387 		}
388 		/* push back line if we're waiting for it */
389 		if (eccline->flags & ECC_WANTED) {
390 			udf_push_eccline(eccline, UDF_SHED_IDLE);
391 			goto retry;
392 		}
393 
394 		/* unlink this entry */
395 		LIST_REMOVE(eccline, hashchain);
396 
397 		KASSERT(eccline->flags & ECC_FLOATING);
398 
399 		eccline_blob = eccline->blob;
400 		memset(eccline, 0, sizeof(struct udf_eccline));
401 		eccline->flags = ECC_FLOATING;
402 	} else {
403 		memset(eccline, 0, sizeof(struct udf_eccline));
404 		eccline->flags = ECC_FLOATING;
405 		priv->num_floating++;
406 	}
407 
408 	eccline->queued_on = 0;
409 	eccline->blob = eccline_blob;
410 	eccline->buf  = getiobuf(NULL, true);
411 	eccline->buf->b_private = eccline;	/* IMPORTANT */
412 
413 	/* initialise eccline blob */
414 	memset(eccline->blob, 0, blobsize);
415 
416 	eccline->ump = ump;
417 	eccline->present = eccline->readin = eccline->dirty = 0;
418 	eccline->error = 0;
419 	eccline->refcnt = 0;
420 	eccline->start_sector = start_sector;
421 
422 	LIST_INSERT_HEAD(&priv->eccline_hash[line], eccline, hashchain);
423 
424 	/*
425 	 * TODO possible optimalisation for checking overlap with partitions
426 	 * to get a clue on future eccline usage
427 	 */
428 	eccline->refcnt++;
429 	UDF_LOCK_ECCLINE(eccline);
430 
431 	mutex_exit(&priv->discstrat_mutex);
432 
433 	return eccline;
434 }
435 
436 
437 static void
438 udf_puteccline(struct udf_eccline *eccline)
439 {
440 	struct strat_private *priv = PRIV(eccline->ump);
441 	struct udf_eccline *deccline;
442 	struct udf_mount *ump = eccline->ump;
443 	uint64_t allbits = ((uint64_t) 1 << ump->packet_size)-1;
444 	int newqueue, tries;
445 
446 	mutex_enter(&priv->discstrat_mutex);
447 
448 	/* clear directly all readin requests from present ones */
449 	if (eccline->readin & eccline->present) {
450 		/* clear all read bits that are already read in */
451 		eccline->readin &= (~eccline->present) & allbits;
452 		wakeup(eccline);
453 	}
454 
455 	DPRINTF(ECCLINE, ("put eccline start sector %d, refcnt %d\n",
456 		eccline->start_sector, eccline->refcnt));
457 
458 	/* requeue */
459 	newqueue = UDF_SHED_FREE;
460 	if (eccline->refcnt > 1)
461 		newqueue = UDF_SHED_IDLE;
462 	if (eccline->flags & ECC_WANTED)
463 		newqueue = UDF_SHED_IDLE;
464 	if (eccline->dirty) {
465 		newqueue = UDF_SHED_WRITING;
466 		if (eccline->flags & ECC_SEQWRITING)
467 			newqueue = UDF_SHED_SEQWRITING;
468 	}
469 
470 	/* if we have active nodes */
471 	if (eccline->refcnt > 1) {
472 		/* we dont set it on seqwriting */
473 		eccline->flags &= ~ECC_SEQWRITING;
474 	}
475 
476 	/* if we need reading in or not all is yet present, queue reading */
477 	if ((eccline->readin) || (eccline->present != allbits))
478 		newqueue = UDF_SHED_READING;
479 
480 	/* reduce the number of kept free buffers */
481 	tries = priv->num_queued[UDF_SHED_FREE] - UDF_ECCLINE_MAXFREE;
482 	while (tries > 0 /* priv->num_queued[UDF_SHED_FREE] > UDF_ECCLINE_MAXFREE */) {
483 		deccline = udf_pop_eccline(priv, UDF_SHED_FREE);
484 		KASSERT(deccline);
485 		KASSERT(deccline->refcnt == 0);
486 		if (deccline->flags & ECC_WANTED) {
487 			udf_push_eccline(deccline, UDF_SHED_IDLE);
488 			DPRINTF(ECCLINE, ("Tried removing, pushed back to free list\n"));
489 		} else {
490 			DPRINTF(ECCLINE, ("Removing entry from free list\n"));
491 			udf_dispose_eccline(deccline);
492 		}
493 		tries--;
494 	}
495 
496 	udf_push_eccline(eccline, newqueue);
497 
498 	KASSERT(eccline->refcnt >= 1);
499 	eccline->refcnt--;
500 	UDF_UNLOCK_ECCLINE(eccline);
501 
502 	mutex_exit(&priv->discstrat_mutex);
503 }
504 
505 /* --------------------------------------------------------------------- */
506 
507 static int
508 udf_create_nodedscr_rmw(struct udf_strat_args *args)
509 {
510 	union dscrptr   **dscrptr  = &args->dscr;
511 	struct udf_mount *ump      = args->ump;
512 	struct long_ad   *icb      = args->icb;
513 	struct udf_eccline *eccline;
514 	uint64_t bit;
515 	uint32_t sectornr, lb_size, dummy;
516 	uint8_t *mem;
517 	int error, eccsect;
518 
519 	error = udf_translate_vtop(ump, icb, &sectornr, &dummy);
520 	if (error)
521 		return error;
522 
523 	lb_size  = udf_rw32(ump->logical_vol->lb_size);
524 
525 	/* get our eccline */
526 	eccline = udf_geteccline(ump, sectornr, 0);
527 	eccsect = sectornr - eccline->start_sector;
528 
529 	bit = (uint64_t) 1 << eccsect;
530 	eccline->readin  &= ~bit;	/* just in case */
531 	eccline->present |=  bit;
532 	eccline->dirty   &= ~bit;	/* Err... euhm... clean? */
533 
534 	eccline->refcnt++;
535 
536 	/* clear space */
537 	mem = ((uint8_t *) eccline->blob) + eccsect * lb_size;
538 	memset(mem, 0, lb_size);
539 
540 	udf_puteccline(eccline);
541 
542 	*dscrptr = (union dscrptr *) mem;
543 	return 0;
544 }
545 
546 
547 static void
548 udf_free_nodedscr_rmw(struct udf_strat_args *args)
549 {
550 	struct udf_mount *ump  = args->ump;
551 	struct long_ad   *icb  = args->icb;
552 	struct udf_eccline *eccline;
553 	uint64_t bit;
554 	uint32_t sectornr, dummy;
555 	int error, eccsect;
556 
557 	error = udf_translate_vtop(ump, icb, &sectornr, &dummy);
558 	if (error)
559 		return;
560 
561 	/* get our eccline */
562 	eccline = udf_geteccline(ump, sectornr, 0);
563 	eccsect = sectornr - eccline->start_sector;
564 
565 	bit = (uint64_t) 1 << eccsect;
566 	eccline->readin &= ~bit;	/* just in case */
567 
568 	KASSERT(eccline->refcnt >= 1);
569 	eccline->refcnt--;
570 
571 	udf_puteccline(eccline);
572 }
573 
574 
575 static int
576 udf_read_nodedscr_rmw(struct udf_strat_args *args)
577 {
578 	union dscrptr   **dscrptr = &args->dscr;
579 	struct udf_mount *ump = args->ump;
580 	struct long_ad   *icb = args->icb;
581 	struct udf_eccline *eccline;
582 	uint64_t bit;
583 	uint32_t sectornr, dummy;
584 	uint8_t *pos;
585 	int sector_size = ump->discinfo.sector_size;
586 	int lb_size = udf_rw32(ump->logical_vol->lb_size);
587 	int i, error, dscrlen, eccsect;
588 
589 	lb_size = lb_size;
590 	KASSERT(sector_size == lb_size);
591 	error = udf_translate_vtop(ump, icb, &sectornr, &dummy);
592 	if (error)
593 		return error;
594 
595 	/* get our eccline */
596 	eccline = udf_geteccline(ump, sectornr, 0);
597 	eccsect = sectornr - eccline->start_sector;
598 
599 	bit = (uint64_t) 1 << eccsect;
600 	if ((eccline->present & bit) == 0) {
601 		/* mark bit for readin */
602 		eccline->readin |= bit;
603 		eccline->refcnt++;	/* prevent recycling */
604 		KASSERT(eccline->bufs[eccsect] == NULL);
605 		udf_puteccline(eccline);
606 
607 		/* wait for completion; XXX remodel to lock bit code */
608 		error = 0;
609 		while ((eccline->present & bit) == 0) {
610 			tsleep(eccline, PRIBIO+1, "udflvdrd", hz/8);
611 			if (eccline->error & bit) {
612 				KASSERT(eccline->refcnt >= 1);
613 				eccline->refcnt--;	/* undo temp refcnt */
614 				*dscrptr = NULL;
615 				return EIO;		/* XXX error code */
616 			}
617 		}
618 
619 		/* reget our line */
620 		eccline = udf_geteccline(ump, sectornr, 0);
621 		KASSERT(eccline->refcnt >= 1);
622 		eccline->refcnt--;	/* undo refcnt */
623 	}
624 
625 	*dscrptr = (union dscrptr *)
626 		(((uint8_t *) eccline->blob) + eccsect * sector_size);
627 
628 	/* code from read_phys_descr */
629 	/* check if its a valid tag */
630 	error = udf_check_tag(*dscrptr);
631 	if (error) {
632 		/* check if its an empty block */
633 		pos = (uint8_t *) *dscrptr;
634 		for (i = 0; i < sector_size; i++, pos++) {
635 			if (*pos) break;
636 		}
637 		if (i == sector_size) {
638 			/* return no error but with no dscrptr */
639 			error = 0;
640 		}
641 		*dscrptr = NULL;
642 		udf_puteccline(eccline);
643 		return error;
644 	}
645 
646 	/* calculate descriptor size */
647 	dscrlen = udf_tagsize(*dscrptr, sector_size);
648 	error = udf_check_tag_payload(*dscrptr, dscrlen);
649 	if (error) {
650 		*dscrptr = NULL;
651 		udf_puteccline(eccline);
652 		return error;
653 	}
654 
655 	eccline->refcnt++;
656 	udf_puteccline(eccline);
657 
658 	return 0;
659 }
660 
661 
662 static int
663 udf_write_nodedscr_rmw(struct udf_strat_args *args)
664 {
665 	union dscrptr    *dscrptr = args->dscr;
666 	struct udf_mount *ump = args->ump;
667 	struct long_ad   *icb = args->icb;
668 	struct udf_node *udf_node = args->udf_node;
669 	struct udf_eccline *eccline;
670 	uint64_t bit;
671 	uint32_t sectornr, logsectornr, dummy;
672 	// int waitfor  = args->waitfor;
673 	int sector_size = ump->discinfo.sector_size;
674 	int lb_size = udf_rw32(ump->logical_vol->lb_size);
675 	int error, eccsect;
676 
677 	lb_size = lb_size;
678 	KASSERT(sector_size == lb_size);
679 	sectornr    = 0;
680 	error = udf_translate_vtop(ump, icb, &sectornr, &dummy);
681 	if (error)
682 		return error;
683 
684 	/* add reference to the vnode to prevent recycling */
685 	vhold(udf_node->vnode);
686 
687 	/* get our eccline */
688 	eccline = udf_geteccline(ump, sectornr, 0);
689 	eccsect = sectornr - eccline->start_sector;
690 
691 	bit = (uint64_t) 1 << eccsect;
692 
693 	/* old callback still pending? */
694 	if (eccline->bufs[eccsect]) {
695 		DPRINTF(WRITE, ("udf_write_nodedscr_rmw: writing descriptor"
696 					" over buffer?\n"));
697 		nestiobuf_done(eccline->bufs[eccsect],
698 				eccline->bufs_len[eccsect],
699 				0);
700 		eccline->bufs[eccsect] = NULL;
701 	}
702 
703 	/* set sector number in the descriptor and validate */
704 	dscrptr = (union dscrptr *)
705 		(((uint8_t *) eccline->blob) + eccsect * sector_size);
706 	KASSERT(dscrptr == args->dscr);
707 
708 	logsectornr = udf_rw32(icb->loc.lb_num);
709 	dscrptr->tag.tag_loc = udf_rw32(logsectornr);
710 	udf_validate_tag_and_crc_sums(dscrptr);
711 
712 	udf_fixup_node_internals(ump, (uint8_t *) dscrptr, UDF_C_NODE);
713 
714 	/* set our flags */
715 	KASSERT(eccline->present & bit);
716 	eccline->dirty |= bit;
717 
718 	KASSERT(udf_tagsize(dscrptr, sector_size) <= sector_size);
719 
720 	udf_puteccline(eccline);
721 
722 	holdrele(udf_node->vnode);
723 	udf_node->outstanding_nodedscr--;
724 	if (udf_node->outstanding_nodedscr == 0) {
725 		UDF_UNLOCK_NODE(udf_node, udf_node->i_flags & IN_CALLBACK_ULK);
726 		wakeup(&udf_node->outstanding_nodedscr);
727 	}
728 
729 	/* XXX waitfor not used */
730 	return 0;
731 }
732 
733 
734 static void
735 udf_queuebuf_rmw(struct udf_strat_args *args)
736 {
737 	struct udf_mount *ump = args->ump;
738 	struct buf *buf = args->nestbuf;
739 	struct desc_tag *tag;
740 	struct strat_private *priv = PRIV(ump);
741 	struct udf_eccline *eccline;
742 	struct long_ad *node_ad_cpy;
743 	uint64_t bit, *lmapping, *pmapping, *lmappos, *pmappos, blknr;
744 	uint32_t buf_len, len, sectors, sectornr, our_sectornr;
745 	uint32_t bpos;
746 	uint16_t vpart_num;
747 	uint8_t *fidblk, *src, *dst;
748 	int sector_size = ump->discinfo.sector_size;
749 	int blks = sector_size / DEV_BSIZE;
750 	int eccsect, what, queue, error;
751 
752 	KASSERT(ump);
753 	KASSERT(buf);
754 	KASSERT(buf->b_iodone == nestiobuf_iodone);
755 
756 	blknr        = buf->b_blkno;
757 	our_sectornr = blknr / blks;
758 
759 	what = buf->b_udf_c_type;
760 	queue = UDF_SHED_READING;
761 	if ((buf->b_flags & B_READ) == 0) {
762 		/* writing */
763 		queue = UDF_SHED_SEQWRITING;
764 		if (what == UDF_C_DSCR)
765 			queue = UDF_SHED_WRITING;
766 		if (what == UDF_C_NODE)
767 			queue = UDF_SHED_WRITING;
768 	}
769 
770 	if (queue == UDF_SHED_READING) {
771 		DPRINTF(SHEDULE, ("\nudf_queuebuf_rmw READ %p : sector %d type %d,"
772 			"b_resid %d, b_bcount %d, b_bufsize %d\n",
773 			buf, (uint32_t) buf->b_blkno / blks, buf->b_udf_c_type,
774 			buf->b_resid, buf->b_bcount, buf->b_bufsize));
775 
776 		/* mark bits for reading */
777 		buf_len = buf->b_bcount;
778 		sectornr = our_sectornr;
779 		eccline = udf_geteccline(ump, sectornr, 0);
780 		eccsect = sectornr - eccline->start_sector;
781 		bpos = 0;
782 		while (buf_len) {
783 			len = MIN(buf_len, sector_size);
784 			if (eccsect == ump->packet_size) {
785 				udf_puteccline(eccline);
786 				eccline = udf_geteccline(ump, sectornr, 0);
787 				eccsect = sectornr - eccline->start_sector;
788 			}
789 			bit = (uint64_t) 1 << eccsect;
790 			error = eccline->error & bit ? EIO : 0;
791 			if (eccline->present & bit) {
792 				src = (uint8_t *) eccline->blob +
793 					eccsect * sector_size;
794 				dst = (uint8_t *) buf->b_data + bpos;
795 				if (!error)
796 					memcpy(dst, src, len);
797 				nestiobuf_done(buf, len, error);
798 			} else {
799 				eccline->readin |= bit;
800 				KASSERT(eccline->bufs[eccsect] == NULL);
801 				eccline->bufs[eccsect] = buf;
802 				eccline->bufs_bpos[eccsect] = bpos;
803 				eccline->bufs_len[eccsect] = len;
804 			}
805 			bpos += sector_size;
806 			eccsect++;
807 			sectornr++;
808 			buf_len -= len;
809 		}
810 		udf_puteccline(eccline);
811 		return;
812 	}
813 
814 	if (queue == UDF_SHED_WRITING) {
815 		DPRINTF(SHEDULE, ("\nudf_queuebuf_rmw WRITE %p : sector %d "
816 			"type %d, b_resid %d, b_bcount %d, b_bufsize %d\n",
817 			buf, (uint32_t) buf->b_blkno / blks, buf->b_udf_c_type,
818 			buf->b_resid, buf->b_bcount, buf->b_bufsize));
819 		/* if we have FIDs fixup using buffer's sector number(s) */
820 		if (buf->b_udf_c_type == UDF_C_FIDS) {
821 			panic("UDF_C_FIDS in SHED_WRITING!\n");
822 #if 0
823 			buf_len = buf->b_bcount;
824 			sectornr = our_sectornr;
825 			bpos = 0;
826 			while (buf_len) {
827 				len = MIN(buf_len, sector_size);
828 				fidblk = (uint8_t *) buf->b_data + bpos;
829 				udf_fixup_fid_block(fidblk, sector_size,
830 					0, len, sectornr);
831 				sectornr++;
832 				bpos += len;
833 				buf_len -= len;
834 			}
835 #endif
836 		}
837 		udf_fixup_node_internals(ump, buf->b_data, buf->b_udf_c_type);
838 
839 		/* copy parts into the bufs and set for writing */
840 		buf_len = buf->b_bcount;
841 		sectornr = our_sectornr;
842 		eccline = udf_geteccline(ump, sectornr, 0);
843 		eccsect = sectornr - eccline->start_sector;
844 		bpos = 0;
845 		while (buf_len) {
846 			len = MIN(buf_len, sector_size);
847 			if (eccsect == ump->packet_size) {
848 				udf_puteccline(eccline);
849 				eccline = udf_geteccline(ump, sectornr, 0);
850 				eccsect = sectornr - eccline->start_sector;
851 			}
852 			bit = (uint64_t) 1 << eccsect;
853 			KASSERT((eccline->readin & bit) == 0);
854 			eccline->present |= bit;
855 			eccline->dirty   |= bit;
856 			if (eccline->bufs[eccsect]) {
857 				/* old callback still pending */
858 				nestiobuf_done(eccline->bufs[eccsect],
859 						eccline->bufs_len[eccsect],
860 						0);
861 				eccline->bufs[eccsect] = NULL;
862 			}
863 
864 			src = (uint8_t *) buf->b_data + bpos;
865 			dst = (uint8_t *) eccline->blob + eccsect * sector_size;
866 			if (len != sector_size)
867 				memset(dst, 0, sector_size);
868 			memcpy(dst, src, len);
869 
870 			/* note that its finished for this extent */
871 			eccline->bufs[eccsect] = NULL;
872 			nestiobuf_done(buf, len, 0);
873 
874 			bpos += sector_size;
875 			eccsect++;
876 			sectornr++;
877 			buf_len -= len;
878 		}
879 		udf_puteccline(eccline);
880 		return;
881 
882 	}
883 
884 	/* sequential writing */
885 	KASSERT(queue == UDF_SHED_SEQWRITING);
886 	DPRINTF(SHEDULE, ("\nudf_queuebuf_rmw SEQWRITE %p : sector XXXX "
887 		"type %d, b_resid %d, b_bcount %d, b_bufsize %d\n",
888 		buf, buf->b_udf_c_type, buf->b_resid, buf->b_bcount,
889 		buf->b_bufsize));
890 	/*
891 	 * Buffers should not have been allocated to disc addresses yet on
892 	 * this queue. Note that a buffer can get multiple extents allocated.
893 	 * Note that it *looks* like the normal writing but its different in
894 	 * the details.
895 	 *
896 	 * lmapping contains lb_num relative to base partition.
897 	 *
898 	 * XXX should we try to claim/organize the allocated memory to
899 	 * block-aligned pieces?
900 	 */
901 	mutex_enter(&priv->seqwrite_mutex);
902 
903 	lmapping    = ump->la_lmapping;
904 	node_ad_cpy = ump->la_node_ad_cpy;
905 
906 	/* logically allocate buf and map it in the file */
907 	udf_late_allocate_buf(ump, buf, lmapping, node_ad_cpy, &vpart_num);
908 
909 	/* if we have FIDs, fixup using the new allocation table */
910 	if (buf->b_udf_c_type == UDF_C_FIDS) {
911 		buf_len = buf->b_bcount;
912 		bpos = 0;
913 		lmappos = lmapping;
914 		while (buf_len) {
915 			sectornr = *lmappos++;
916 			len = MIN(buf_len, sector_size);
917 			fidblk = (uint8_t *) buf->b_data + bpos;
918 			udf_fixup_fid_block(fidblk, sector_size,
919 				0, len, sectornr);
920 			bpos += len;
921 			buf_len -= len;
922 		}
923 	}
924 	if (buf->b_udf_c_type == UDF_C_METADATA_SBM) {
925 		if (buf->b_lblkno == 0) {
926 			/* update the tag location inside */
927 			tag = (struct desc_tag *) buf->b_data;
928 			tag->tag_loc = udf_rw32(buf->b_blkno);
929 			udf_validate_tag_and_crc_sums(buf->b_data);
930 		}
931 	}
932 	udf_fixup_node_internals(ump, buf->b_data, buf->b_udf_c_type);
933 
934 	/*
935 	 * Translate new mappings in lmapping to pmappings.
936 	 * pmapping to contain lb_nums as used for disc adressing.
937 	 */
938 	pmapping = ump->la_pmapping;
939 	sectors  = (buf->b_bcount + sector_size -1) / sector_size;
940 	udf_translate_vtop_list(ump, sectors, vpart_num, lmapping, pmapping);
941 
942 	/* copy parts into the bufs and set for writing */
943 	pmappos = pmapping;
944 	buf_len = buf->b_bcount;
945 	sectornr = *pmappos++;
946 	eccline = udf_geteccline(ump, sectornr, ECC_SEQWRITING);
947 	eccsect = sectornr - eccline->start_sector;
948 	bpos = 0;
949 	while (buf_len) {
950 		len = MIN(buf_len, sector_size);
951 		eccsect = sectornr - eccline->start_sector;
952 		if ((eccsect < 0) || (eccsect >= ump->packet_size)) {
953 			eccline->flags |= ECC_SEQWRITING;
954 			udf_puteccline(eccline);
955 			eccline = udf_geteccline(ump, sectornr, ECC_SEQWRITING);
956 			eccsect = sectornr - eccline->start_sector;
957 		}
958 		bit = (uint64_t) 1 << eccsect;
959 		KASSERT((eccline->readin & bit) == 0);
960 		eccline->present |= bit;
961 		eccline->dirty   |= bit;
962 		eccline->bufs[eccsect] = NULL;
963 
964 		src = (uint8_t *) buf->b_data + bpos;
965 		dst = (uint8_t *)
966 			eccline->blob + eccsect * sector_size;
967 		if (len != sector_size)
968 			memset(dst, 0, sector_size);
969 		memcpy(dst, src, len);
970 
971 		/* note that its finished for this extent */
972 		nestiobuf_done(buf, len, 0);
973 
974 		bpos += sector_size;
975 		sectornr = *pmappos++;
976 		buf_len -= len;
977 	}
978 	eccline->flags |= ECC_SEQWRITING;
979 	udf_puteccline(eccline);
980 	mutex_exit(&priv->seqwrite_mutex);
981 }
982 
983 /* --------------------------------------------------------------------- */
984 
985 static void
986 udf_shedule_read_callback(struct buf *buf)
987 {
988 	struct udf_eccline *eccline = BTOE(buf);
989 	struct udf_mount *ump = eccline->ump;
990 	uint64_t bit;
991 	uint8_t *src, *dst;
992 	int sector_size = ump->discinfo.sector_size;
993 	int error, i, len;
994 
995 	DPRINTF(ECCLINE, ("read callback called\n"));
996 	/* post process read action */
997 	error = buf->b_error;
998 	for (i = 0; i < ump->packet_size; i++) {
999 		bit = (uint64_t) 1 << i;
1000 		src = (uint8_t *) buf->b_data +   i * sector_size;
1001 		dst = (uint8_t *) eccline->blob + i * sector_size;
1002 		if (eccline->present & bit)
1003 			continue;
1004 		if (error) {
1005 			eccline->error |= bit;
1006 		} else {
1007 			eccline->present |= bit;
1008 		}
1009 		if (eccline->bufs[i]) {
1010 			dst = (uint8_t *) eccline->bufs[i]->b_data +
1011 				eccline->bufs_bpos[i];
1012 			len = eccline->bufs_len[i];
1013 			if (!error)
1014 				memcpy(dst, src, len);
1015 			nestiobuf_done(eccline->bufs[i], len, error);
1016 			eccline->bufs[i] = NULL;
1017 		}
1018 
1019 	}
1020 	KASSERT(buf->b_data == eccline->blob);
1021 	KASSERT(eccline->present == ((uint64_t) 1 << ump->packet_size)-1);
1022 
1023 	/*
1024 	 * XXX TODO what to do on read errors? read in all sectors
1025 	 * synchronously and allocate a sparable entry?
1026 	 */
1027 
1028 	wakeup(eccline);
1029 	udf_puteccline(eccline);
1030 	DPRINTF(ECCLINE, ("read callback finished\n"));
1031 }
1032 
1033 
1034 static void
1035 udf_shedule_write_callback(struct buf *buf)
1036 {
1037 	struct udf_eccline *eccline = BTOE(buf);
1038 	struct udf_mount *ump = eccline->ump;
1039 	uint64_t bit;
1040 	int error, i, len;
1041 
1042 	DPRINTF(ECCLINE, ("write callback called\n"));
1043 	/* post process write action */
1044 	error = buf->b_error;
1045 	for (i = 0; i < ump->packet_size; i++) {
1046 		bit = (uint64_t) 1 << i;
1047 		if ((eccline->dirty & bit) == 0)
1048 			continue;
1049 		if (error) {
1050 			eccline->error |= bit;
1051 		} else {
1052 			eccline->dirty &= ~bit;
1053 		}
1054 		if (eccline->bufs[i]) {
1055 			len = eccline->bufs_len[i];
1056 			nestiobuf_done(eccline->bufs[i], len, error);
1057 			eccline->bufs[i] = NULL;
1058 		}
1059 	}
1060 	KASSERT(eccline->dirty == 0);
1061 
1062 	KASSERT(error == 0);
1063 	/*
1064 	 * XXX TODO on write errors allocate a sparable entry
1065 	 */
1066 
1067 	wakeup(eccline);
1068 	udf_puteccline(eccline);
1069 }
1070 
1071 
1072 static void
1073 udf_issue_eccline(struct udf_eccline *eccline, int queued_on)
1074 {
1075 	struct udf_mount *ump = eccline->ump;
1076 	struct strat_private *priv = PRIV(ump);
1077 	struct buf *buf, *nestbuf;
1078 	uint64_t bit, allbits = ((uint64_t) 1 << ump->packet_size)-1;
1079 	uint32_t start;
1080 	int sector_size = ump->discinfo.sector_size;
1081 	int blks = sector_size / DEV_BSIZE;
1082 	int i;
1083 
1084 	if (queued_on == UDF_SHED_READING) {
1085 		DPRINTF(SHEDULE, ("udf_issue_eccline reading : "));
1086 		/* read all bits that are not yet present */
1087 		eccline->readin = (~eccline->present) & allbits;
1088 		KASSERT(eccline->readin);
1089 		start = eccline->start_sector;
1090 		buf = eccline->buf;
1091 		buf_init(buf);
1092 		buf->b_flags    = B_READ | B_ASYNC;
1093 		SET(buf->b_cflags, BC_BUSY);	/* mark buffer busy */
1094 		buf->b_oflags   = 0;
1095 		buf->b_iodone   = udf_shedule_read_callback;
1096 		buf->b_data     = eccline->blob;
1097 		buf->b_bcount   = ump->packet_size * sector_size;
1098 		buf->b_resid    = buf->b_bcount;
1099 		buf->b_bufsize  = buf->b_bcount;
1100 		buf->b_private  = eccline;
1101 		BIO_SETPRIO(buf, BPRIO_DEFAULT);
1102 		buf->b_lblkno   = buf->b_blkno = buf->b_rawblkno = start * blks;
1103 		buf->b_proc     = NULL;
1104 
1105 		if (eccline->present != 0) {
1106 			for (i = 0; i < ump->packet_size; i++) {
1107 				bit = (uint64_t) 1 << i;
1108 				if (eccline->present & bit) {
1109 					nestiobuf_done(buf, sector_size, 0);
1110 					continue;
1111 				}
1112 				nestbuf = getiobuf(NULL, true);
1113 				nestiobuf_setup(buf, nestbuf, i * sector_size,
1114 					sector_size);
1115 				/* adjust blocknumber to read */
1116 				nestbuf->b_blkno = buf->b_blkno + i*blks;
1117 				nestbuf->b_rawblkno = buf->b_rawblkno + i*blks;
1118 
1119 				DPRINTF(SHEDULE, ("sector %d ",
1120 					start + i));
1121 				/* call asynchronous */
1122 				VOP_STRATEGY(ump->devvp, nestbuf);
1123 			}
1124 			DPRINTF(SHEDULE, ("\n"));
1125 			return;
1126 		}
1127 	} else {
1128 		/* write or seqwrite */
1129 		DPRINTF(SHEDULE, ("udf_issue_eccline writing or seqwriting : "));
1130 		if (eccline->present != allbits) {
1131 			/* requeue to read-only */
1132 			DPRINTF(SHEDULE, ("\n\t-> not complete, requeue to "
1133 				"reading\n"));
1134 			udf_push_eccline(eccline, UDF_SHED_READING);
1135 			return;
1136 		}
1137 		start = eccline->start_sector;
1138 		buf = eccline->buf;
1139 		buf_init(buf);
1140 		buf->b_flags    = B_WRITE | B_ASYNC;
1141 		SET(buf->b_cflags, BC_BUSY);	/* mark buffer busy */
1142 		buf->b_oflags   = 0;
1143 		buf->b_iodone   = udf_shedule_write_callback;
1144 		buf->b_data     = eccline->blob;
1145 		buf->b_bcount   = ump->packet_size * sector_size;
1146 		buf->b_resid    = buf->b_bcount;
1147 		buf->b_bufsize  = buf->b_bcount;
1148 		buf->b_private  = eccline;
1149 		BIO_SETPRIO(buf, BPRIO_DEFAULT);
1150 		buf->b_lblkno   = buf->b_blkno = buf->b_rawblkno = start * blks;
1151 		buf->b_proc     = NULL;
1152 	}
1153 
1154 	mutex_exit(&priv->discstrat_mutex);
1155 		/* call asynchronous */
1156 		DPRINTF(SHEDULE, ("sector %d for %d\n",
1157 			start, ump->packet_size));
1158 		VOP_STRATEGY(ump->devvp, buf);
1159 	mutex_enter(&priv->discstrat_mutex);
1160 }
1161 
1162 
1163 static void
1164 udf_discstrat_thread(void *arg)
1165 {
1166 	struct udf_mount *ump = (struct udf_mount *) arg;
1167 	struct strat_private *priv = PRIV(ump);
1168 	struct udf_eccline *eccline;
1169 	struct timespec now, *last;
1170 	int new_queue, wait, work;
1171 
1172 	work = 1;
1173 	mutex_enter(&priv->discstrat_mutex);
1174 	priv->num_floating = 0;
1175 	while (priv->run_thread || work || priv->num_floating) {
1176 		/* process the current selected queue */
1177 		/* maintenance: free exess ecclines */
1178 		while (priv->num_queued[UDF_SHED_FREE] > UDF_ECCLINE_MAXFREE) {
1179 			eccline = udf_pop_eccline(priv, UDF_SHED_FREE);
1180 			KASSERT(eccline);
1181 			KASSERT(eccline->refcnt == 0);
1182 			DPRINTF(ECCLINE, ("Removing entry from free list\n"));
1183 			udf_dispose_eccline(eccline);
1184 		}
1185 
1186 		/* get our time */
1187 		vfs_timestamp(&now);
1188 		last = &priv->last_queued[priv->cur_queue];
1189 
1190 		/* don't shedule too quickly when there is only one */
1191 		if (priv->cur_queue == UDF_SHED_WRITING) {
1192 			if (priv->num_queued[priv->cur_queue] <= 2) {
1193 				if (now.tv_sec - last->tv_sec < 2) {
1194 					/* wait some time */
1195 					cv_timedwait(&priv->discstrat_cv,
1196 						&priv->discstrat_mutex, hz);
1197 				}
1198 			}
1199 		}
1200 
1201 		/* get our line */
1202 		eccline = udf_pop_eccline(priv, priv->cur_queue);
1203 		if (eccline) {
1204 			wait = 0;
1205 			new_queue = priv->cur_queue;
1206 			DPRINTF(ECCLINE, ("UDF_ISSUE_ECCLINE\n"));
1207 
1208 			/* complete the `get' by locking and refcounting it */
1209 			UDF_LOCK_ECCLINE(eccline);
1210 			eccline->refcnt++;
1211 
1212 			udf_issue_eccline(eccline, priv->cur_queue);
1213 		} else {
1214 			wait = 1;
1215 			/* check if we can/should switch */
1216 			new_queue = priv->cur_queue;
1217 			if (BUFQ_PEEK(priv->queues[UDF_SHED_READING]))
1218 				new_queue = UDF_SHED_READING;
1219 			if (BUFQ_PEEK(priv->queues[UDF_SHED_WRITING]))
1220 				new_queue = UDF_SHED_WRITING;
1221 			if (BUFQ_PEEK(priv->queues[UDF_SHED_SEQWRITING]))
1222 				new_queue = UDF_SHED_SEQWRITING;
1223 
1224 			/* dont switch seqwriting too fast */
1225 			if (priv->cur_queue == UDF_SHED_READING) {
1226 				if (now.tv_sec - last->tv_sec < 1)
1227 					new_queue = priv->cur_queue;
1228 			}
1229 			if (priv->cur_queue == UDF_SHED_WRITING) {
1230 				if (now.tv_sec - last->tv_sec < 2)
1231 					new_queue = priv->cur_queue;
1232 			}
1233 			if (priv->cur_queue == UDF_SHED_SEQWRITING) {
1234 				if (now.tv_sec - last->tv_sec < 4)
1235 					new_queue = priv->cur_queue;
1236 			}
1237 		}
1238 
1239 		/* give room */
1240 		mutex_exit(&priv->discstrat_mutex);
1241 
1242 		if (new_queue != priv->cur_queue) {
1243 			wait = 0;
1244 			DPRINTF(SHEDULE, ("switching from %d to %d\n",
1245 				priv->cur_queue, new_queue));
1246 			priv->cur_queue = new_queue;
1247 		}
1248 		mutex_enter(&priv->discstrat_mutex);
1249 
1250 		/* wait for more if needed */
1251 		if (wait)
1252 			cv_timedwait(&priv->discstrat_cv,
1253 				&priv->discstrat_mutex, hz);	/* /8 */
1254 
1255 		work  = (BUFQ_PEEK(priv->queues[UDF_SHED_READING]) != NULL);
1256 		work |= (BUFQ_PEEK(priv->queues[UDF_SHED_WRITING]) != NULL);
1257 		work |= (BUFQ_PEEK(priv->queues[UDF_SHED_SEQWRITING]) != NULL);
1258 
1259 		DPRINTF(PARANOIA, ("work : (%d, %d, %d) -> work %d, float %d\n",
1260 			(BUFQ_PEEK(priv->queues[UDF_SHED_READING]) != NULL),
1261 			(BUFQ_PEEK(priv->queues[UDF_SHED_WRITING]) != NULL),
1262 			(BUFQ_PEEK(priv->queues[UDF_SHED_SEQWRITING]) != NULL),
1263 			work, priv->num_floating));
1264 	}
1265 
1266 	mutex_exit(&priv->discstrat_mutex);
1267 
1268 	/* tear down remaining ecclines */
1269 	mutex_enter(&priv->discstrat_mutex);
1270 	KASSERT(priv->num_queued[UDF_SHED_IDLE] == 0);
1271 	KASSERT(priv->num_queued[UDF_SHED_READING] == 0);
1272 	KASSERT(priv->num_queued[UDF_SHED_WRITING] == 0);
1273 	KASSERT(priv->num_queued[UDF_SHED_SEQWRITING] == 0);
1274 
1275 	KASSERT(BUFQ_PEEK(priv->queues[UDF_SHED_IDLE]) == NULL);
1276 	KASSERT(BUFQ_PEEK(priv->queues[UDF_SHED_READING]) == NULL);
1277 	KASSERT(BUFQ_PEEK(priv->queues[UDF_SHED_WRITING]) == NULL);
1278 	KASSERT(BUFQ_PEEK(priv->queues[UDF_SHED_SEQWRITING]) == NULL);
1279 	eccline = udf_pop_eccline(priv, UDF_SHED_FREE);
1280 	while (eccline) {
1281 		udf_dispose_eccline(eccline);
1282 		eccline = udf_pop_eccline(priv, UDF_SHED_FREE);
1283 	}
1284 	KASSERT(priv->num_queued[UDF_SHED_FREE] == 0);
1285 	mutex_exit(&priv->discstrat_mutex);
1286 
1287 	priv->thread_finished = 1;
1288 	wakeup(&priv->run_thread);
1289 	kthread_exit(0);
1290 	/* not reached */
1291 }
1292 
1293 /* --------------------------------------------------------------------- */
1294 
1295 /*
1296  * Buffer memory pool allocator.
1297  */
1298 
1299 static void *
1300 ecclinepool_page_alloc(struct pool *pp, int flags)
1301 {
1302         return (void *)uvm_km_alloc(kernel_map,
1303             MAXBSIZE, MAXBSIZE,
1304             ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
1305 	    	| UVM_KMF_WIRED /* UVM_KMF_PAGABLE? */);
1306 }
1307 
1308 static void
1309 ecclinepool_page_free(struct pool *pp, void *v)
1310 {
1311         uvm_km_free(kernel_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED);
1312 }
1313 
1314 static struct pool_allocator ecclinepool_allocator = {
1315         .pa_alloc = ecclinepool_page_alloc,
1316         .pa_free  = ecclinepool_page_free,
1317         .pa_pagesz = MAXBSIZE,
1318 };
1319 
1320 
1321 static void
1322 udf_discstrat_init_rmw(struct udf_strat_args *args)
1323 {
1324 	struct udf_mount *ump = args->ump;
1325 	struct strat_private *priv = PRIV(ump);
1326 	uint32_t lb_size, blobsize, hashline;
1327 	int i;
1328 
1329 	KASSERT(ump);
1330 	KASSERT(ump->logical_vol);
1331 	KASSERT(priv == NULL);
1332 
1333 	lb_size = udf_rw32(ump->logical_vol->lb_size);
1334 	blobsize = ump->packet_size * lb_size;
1335 	KASSERT(lb_size > 0);
1336 	KASSERT(ump->packet_size <= 64);
1337 
1338 	/* initialise our memory space */
1339 	ump->strategy_private = malloc(sizeof(struct strat_private),
1340 		M_UDFTEMP, M_WAITOK);
1341 	priv = ump->strategy_private;
1342 	memset(priv, 0 , sizeof(struct strat_private));
1343 
1344 	/* initialise locks */
1345 	cv_init(&priv->discstrat_cv, "udfstrat");
1346 	mutex_init(&priv->discstrat_mutex, MUTEX_DRIVER, IPL_BIO);
1347 	mutex_init(&priv->seqwrite_mutex, MUTEX_DEFAULT, IPL_NONE);
1348 
1349 	/* initialise struct eccline pool */
1350 	pool_init(&priv->eccline_pool, sizeof(struct udf_eccline),
1351 		0, 0, 0, "udf_eccline_pool", NULL, IPL_NONE);
1352 
1353 	/* initialise eccline blob pool */
1354 	pool_init(&priv->ecclineblob_pool, blobsize,
1355 		0,0,0, "udf_eccline_blob", &ecclinepool_allocator, IPL_NONE);
1356 
1357 	/* initialise main queues */
1358 	for (i = 0; i < UDF_SHED_MAX; i++) {
1359 		priv->num_queued[i] = 0;
1360 		vfs_timestamp(&priv->last_queued[i]);
1361 	}
1362 	bufq_alloc(&priv->queues[UDF_SHED_READING], "disksort",
1363 		BUFQ_SORT_RAWBLOCK);
1364 	bufq_alloc(&priv->queues[UDF_SHED_WRITING], "disksort",
1365 		BUFQ_SORT_RAWBLOCK);
1366 	bufq_alloc(&priv->queues[UDF_SHED_SEQWRITING], "disksort", 0);
1367 
1368 	/* initialise administrative queues */
1369 	bufq_alloc(&priv->queues[UDF_SHED_IDLE], "fcfs", 0);
1370 	bufq_alloc(&priv->queues[UDF_SHED_FREE], "fcfs", 0);
1371 
1372 	for (hashline = 0; hashline < UDF_ECCBUF_HASHSIZE; hashline++) {
1373 		LIST_INIT(&priv->eccline_hash[hashline]);
1374 	}
1375 
1376 	/* create our disk strategy thread */
1377 	priv->cur_queue = UDF_SHED_READING;
1378 	priv->thread_finished = 0;
1379 	priv->run_thread      = 1;
1380 	if (kthread_create(PRI_NONE, 0 /* KTHREAD_MPSAFE*/, NULL /* cpu_info*/,
1381 		udf_discstrat_thread, ump, &priv->queue_lwp,
1382 		"%s", "udf_rw")) {
1383 		panic("fork udf_rw");
1384 	}
1385 }
1386 
1387 
1388 static void
1389 udf_discstrat_finish_rmw(struct udf_strat_args *args)
1390 {
1391 	struct udf_mount *ump = args->ump;
1392 	struct strat_private *priv = PRIV(ump);
1393 	int error;
1394 
1395 	if (ump == NULL)
1396 		return;
1397 
1398 	/* stop our sheduling thread */
1399 	KASSERT(priv->run_thread == 1);
1400 	priv->run_thread = 0;
1401 	wakeup(priv->queue_lwp);
1402 	while (!priv->thread_finished) {
1403 		error = tsleep(&priv->run_thread, PRIBIO+1,
1404 			"udfshedfin", hz);
1405 	}
1406 	/* kthread should be finished now */
1407 
1408 	/* cleanup our pools */
1409 	pool_destroy(&priv->eccline_pool);
1410 	pool_destroy(&priv->ecclineblob_pool);
1411 
1412 	cv_destroy(&priv->discstrat_cv);
1413 	mutex_destroy(&priv->discstrat_mutex);
1414 	mutex_destroy(&priv->seqwrite_mutex);
1415 
1416 	/* free our private space */
1417 	free(ump->strategy_private, M_UDFTEMP);
1418 	ump->strategy_private = NULL;
1419 }
1420 
1421 /* --------------------------------------------------------------------- */
1422 
1423 struct udf_strategy udf_strat_rmw =
1424 {
1425 	udf_create_nodedscr_rmw,
1426 	udf_free_nodedscr_rmw,
1427 	udf_read_nodedscr_rmw,
1428 	udf_write_nodedscr_rmw,
1429 	udf_queuebuf_rmw,
1430 	udf_discstrat_init_rmw,
1431 	udf_discstrat_finish_rmw
1432 };
1433 
1434