xref: /netbsd-src/sys/fs/udf/udf_strat_rmw.c (revision 404fbe5fb94ca1e054339640cabb2801ce52dd30)
1 /* $NetBSD: udf_strat_rmw.c,v 1.16 2008/12/16 16:18:25 pooka Exp $ */
2 
3 /*
4  * Copyright (c) 2006, 2008 Reinoud Zandijk
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  */
28 
29 #include <sys/cdefs.h>
30 #ifndef lint
31 __KERNEL_RCSID(0, "$NetBSD: udf_strat_rmw.c,v 1.16 2008/12/16 16:18:25 pooka Exp $");
32 #endif /* not lint */
33 
34 
35 #if defined(_KERNEL_OPT)
36 #include "opt_compat_netbsd.h"
37 #endif
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/sysctl.h>
42 #include <sys/namei.h>
43 #include <sys/proc.h>
44 #include <sys/kernel.h>
45 #include <sys/vnode.h>
46 #include <miscfs/genfs/genfs_node.h>
47 #include <sys/mount.h>
48 #include <sys/buf.h>
49 #include <sys/file.h>
50 #include <sys/device.h>
51 #include <sys/disklabel.h>
52 #include <sys/ioctl.h>
53 #include <sys/malloc.h>
54 #include <sys/dirent.h>
55 #include <sys/stat.h>
56 #include <sys/conf.h>
57 #include <sys/kauth.h>
58 #include <sys/kthread.h>
59 #include <dev/clock_subr.h>
60 
61 #include <fs/udf/ecma167-udf.h>
62 #include <fs/udf/udf_mount.h>
63 
64 #include "udf.h"
65 #include "udf_subr.h"
66 #include "udf_bswap.h"
67 
68 
/* Fetch the UDF node stored in a vnode's v_data pointer. */
#define VTOI(vnode) ((struct udf_node *) (vnode)->v_data)
/* Fetch this strategy's private state from a mount's strategy_private. */
#define PRIV(ump) ((struct strat_private *) (ump)->strategy_private)
/* Fetch the eccline a buffer belongs to through its b_private pointer. */
#define BTOE(buf) ((struct udf_eccline *) ((buf)->b_private))
72 
73 /* --------------------------------------------------------------------- */
74 
/*
 * Upper bound on sectors per packet; the per-sector bitmaps in
 * struct udf_eccline are 64 bits wide.
 */
#define UDF_MAX_PACKET_SIZE	64			/* DONT change this */

/*
 * Scheduler states.  An eccline's queued_on field holds one of these and
 * they index the per-state arrays in struct strat_private; a queued_on of
 * 0 means the line is not on any queue.
 */
#define UDF_SHED_WAITING	1			/* waiting on timeout */
#define UDF_SHED_READING	2
#define UDF_SHED_WRITING	3
#define UDF_SHED_SEQWRITING	4
#define UDF_SHED_IDLE		5			/* resting */
#define UDF_SHED_FREE		6			/* recyclable */
/* parenthesised so the value survives use inside larger expressions */
#define UDF_SHED_MAX		(6+1)

/* eccline flags */
#define ECC_LOCKED		0x01			/* prevent access   */
#define ECC_WANTED		0x02			/* trying access    */
#define ECC_SEQWRITING		0x04			/* sequential queue */
#define ECC_FLOATING		0x08			/* not queued yet   */

/* seconds added to a line's wait_time when it is put back */
#define ECC_WAITTIME		10
93 
94 
TAILQ_HEAD(ecclineq, udf_eccline);
/*
 * One cached ECC line (packet): ump->packet_size consecutive sectors
 * starting at start_sector.  The four bitmaps carry one bit per sector of
 * the line, bit i describing sector start_sector + i.
 */
struct udf_eccline {
	struct udf_mount	 *ump;			/* owning mount */
	uint64_t		  present;		/* preserve these */
	uint64_t		  readin;		/* bitmap */
	uint64_t		  dirty;		/* bitmap */
	uint64_t		  error;		/* bitmap */
	uint32_t		  refcnt;		/* must be 0 on dispose */

	struct timespec		  wait_time;		/* now + ECC_WAITTIME */
	uint32_t		  flags;		/* ECC_* flags */
	uint32_t		  start_sector;		/* physical */

	struct buf		 *buf;			/* queue/io buffer; its
							 * b_private points
							 * back to this line */
	void			 *blob;			/* sector data of line */

	/*
	 * Per sector: nested buffer waiting for that sector, the byte
	 * offset into its data, and the length to complete it with.
	 */
	struct buf		 *bufs[UDF_MAX_PACKET_SIZE];
	uint32_t		  bufs_bpos[UDF_MAX_PACKET_SIZE];
	int			  bufs_len[UDF_MAX_PACKET_SIZE];

	int			  queued_on;		/* on which BUFQ list */
	LIST_ENTRY(udf_eccline)   hashchain;		/* on sector lookup  */
};
118 
119 
/*
 * Per-mount private state of this strategy, reached through PRIV(ump).
 * The arrays sized UDF_SHED_MAX are indexed by UDF_SHED_* queue number.
 */
struct strat_private {
	lwp_t			 *queue_lwp;		/* presumably the
							 * scheduler thread;
							 * not used in the
							 * code shown here */
	kcondvar_t		  discstrat_cv;		/* to wait on       */
	kmutex_t		  discstrat_mutex;	/* disc strategy    */
	kmutex_t		  seqwrite_mutex;	/* protect mappings */

	int			  thread_running;	/* thread control */
	int			  run_thread;		/* thread control */
	int			  thread_finished;	/* thread control */
	int			  cur_queue;

	int			  num_floating;		/* # ECC_FLOATING lines */
	int			  num_queued[UDF_SHED_MAX];	/* lines per queue */
	struct bufq_state	 *queues[UDF_SHED_MAX];
	struct timespec		  last_queued[UDF_SHED_MAX];	/* last push */
	struct disk_strategy	  old_strategy_setting;

	struct pool		  eccline_pool;		/* struct udf_eccline */
	struct pool		  ecclineblob_pool;	/* eccline->blob data */
	/* lookup by start sector; see the hash in udf_geteccline() */
	LIST_HEAD(, udf_eccline)  eccline_hash[UDF_ECCBUF_HASHSIZE];
};
141 
142 /* --------------------------------------------------------------------- */
143 
/* Thin wrappers around the eccline lock/unlock functions. */
#define UDF_LOCK_ECCLINE(eccline) udf_lock_eccline(eccline)
#define UDF_UNLOCK_ECCLINE(eccline) udf_unlock_eccline(eccline)
146 
147 /* can be called with or without discstrat lock */
148 static void
149 udf_lock_eccline(struct udf_eccline *eccline)
150 {
151 	struct strat_private *priv = PRIV(eccline->ump);
152 	int waslocked, ret;
153 
154 	waslocked = mutex_owned(&priv->discstrat_mutex);
155 	if (!waslocked)
156 		mutex_enter(&priv->discstrat_mutex);
157 
158 	/* wait until its unlocked first */
159 	while (eccline->flags & ECC_LOCKED) {
160 		eccline->flags |= ECC_WANTED;
161 		ret = cv_timedwait(&priv->discstrat_cv, &priv->discstrat_mutex,
162 			hz/8);
163 		if (ret == EWOULDBLOCK)
164 			DPRINTF(LOCKING, ("eccline lock helt, waiting for "
165 				"release"));
166 	}
167 	eccline->flags |= ECC_LOCKED;
168 	eccline->flags &= ~ECC_WANTED;
169 
170 	if (!waslocked)
171 		mutex_exit(&priv->discstrat_mutex);
172 }
173 
174 
175 /* can be called with or without discstrat lock */
176 static void
177 udf_unlock_eccline(struct udf_eccline *eccline)
178 {
179 	struct strat_private *priv = PRIV(eccline->ump);
180 	int waslocked;
181 
182 	waslocked = mutex_owned(&priv->discstrat_mutex);
183 	if (!waslocked)
184 		mutex_enter(&priv->discstrat_mutex);
185 
186 	eccline->flags &= ~ECC_LOCKED;
187 	cv_broadcast(&priv->discstrat_cv);
188 
189 	if (!waslocked)
190 		mutex_exit(&priv->discstrat_mutex);
191 }
192 
193 
194 /* NOTE discstrat_mutex should be held! */
195 static void
196 udf_dispose_eccline(struct udf_eccline *eccline)
197 {
198 	struct strat_private *priv = PRIV(eccline->ump);
199 	struct buf *ret;
200 
201 	KASSERT(mutex_owned(&priv->discstrat_mutex));
202 
203 	KASSERT(eccline->refcnt == 0);
204 	KASSERT(eccline->dirty  == 0);
205 
206 	DPRINTF(ECCLINE, ("dispose eccline with start sector %d, "
207 		"present %0"PRIx64"\n", eccline->start_sector,
208 		eccline->present));
209 
210 	if (eccline->queued_on) {
211 		ret = BUFQ_CANCEL(priv->queues[eccline->queued_on], eccline->buf);
212 		KASSERT(ret == eccline->buf);
213 		priv->num_queued[eccline->queued_on]--;
214 	}
215 	LIST_REMOVE(eccline, hashchain);
216 
217 	if (eccline->flags & ECC_FLOATING) {
218 		eccline->flags &= ~ECC_FLOATING;
219 		priv->num_floating--;
220 	}
221 
222 	putiobuf(eccline->buf);
223 	pool_put(&priv->ecclineblob_pool, eccline->blob);
224 	pool_put(&priv->eccline_pool, eccline);
225 }
226 
227 
/*
 * Put an eccline on queue `newqueue', first removing it from the queue it
 * currently sits on (if any).  The per-queue counters and the queue's
 * last_queued timestamp are updated, a floating line stops being floating,
 * and the condition variable is signalled unless the target is the idle
 * queue.
 *
 * NOTE discstrat_mutex should be held!
 */
static void
udf_push_eccline(struct udf_eccline *eccline, int newqueue)
{
	struct strat_private *priv = PRIV(eccline->ump);
	struct buf *ret;
	int curqueue;

	KASSERT(mutex_owned(&priv->discstrat_mutex));

	DPRINTF(PARANOIA, ("DEBUG: buf %p pushed on queue %d\n", eccline->buf, newqueue));

	/* requeue */
	curqueue = eccline->queued_on;
	if (curqueue) {
		ret = BUFQ_CANCEL(priv->queues[curqueue], eccline->buf);

		DPRINTF(PARANOIA, ("push_eccline BUFQ_CANCEL returned %p when "
			"requested to remove %p from queue %d\n", ret,
			eccline->buf, curqueue));
#ifdef DIAGNOSTIC
		if (ret == NULL) {
			int i;

			/*
			 * The bookkeeping says the line is queued but the
			 * queue does not have it.  Dump all queues; this
			 * drains them with BUFQ_GET, which is acceptable
			 * only because we panic right after.
			 */
			printf("udf_push_eccline: bufq_cancel can't find "
				"buffer; dumping queues\n");
			for (i = 1; i < UDF_SHED_MAX; i++) {
				printf("queue %d\n\t", i);
				ret = BUFQ_GET(priv->queues[i]);
				while (ret) {
					printf("%p ", ret);
					if (ret == eccline->buf)
						printf("[<-] ");
					ret = BUFQ_GET(priv->queues[i]);
				}
				printf("\n");
			}
			panic("fatal queue bug; exit");
		}
#endif

		KASSERT(ret == eccline->buf);
		priv->num_queued[curqueue]--;
	}

	/* set buffer block numbers to make sure its queued correctly */
	eccline->buf->b_lblkno   = eccline->start_sector;
	eccline->buf->b_blkno    = eccline->start_sector;
	eccline->buf->b_rawblkno = eccline->start_sector;

	BUFQ_PUT(priv->queues[newqueue], eccline->buf);
	eccline->queued_on = newqueue;
	priv->num_queued[newqueue]++;
	vfs_timestamp(&priv->last_queued[newqueue]);

	/* once queued the line is no longer floating */
	if (eccline->flags & ECC_FLOATING) {
		eccline->flags &= ~ECC_FLOATING;
		priv->num_floating--;
	}

	/* tickle disc strategy statemachine */
	if (newqueue != UDF_SHED_IDLE)
		cv_signal(&priv->discstrat_cv);
}
292 
293 
294 static struct udf_eccline *
295 udf_pop_eccline(struct strat_private *priv, int queued_on)
296 {
297 	struct udf_eccline *eccline;
298 	struct buf *buf;
299 
300 	KASSERT(mutex_owned(&priv->discstrat_mutex));
301 
302 	buf = BUFQ_GET(priv->queues[queued_on]);
303 	if (!buf) {
304 		KASSERT(priv->num_queued[queued_on] == 0);
305 		return NULL;
306 	}
307 
308 	eccline = BTOE(buf);
309 	KASSERT(eccline->queued_on == queued_on);
310 	eccline->queued_on = 0;
311 	priv->num_queued[queued_on]--;
312 
313 	if (eccline->flags & ECC_FLOATING)
314 		panic("popping already marked floating eccline");
315 	eccline->flags |= ECC_FLOATING;
316 	priv->num_floating++;
317 
318 	DPRINTF(PARANOIA, ("DEBUG: buf %p popped from queue %d\n",
319 		eccline->buf, queued_on));
320 
321 	return eccline;
322 }
323 
324 
325 static struct udf_eccline *
326 udf_geteccline(struct udf_mount *ump, uint32_t sector, int flags)
327 {
328 	struct strat_private *priv = PRIV(ump);
329 	struct udf_eccline *eccline;
330 	uint32_t start_sector, lb_size, blobsize;
331 	uint8_t *eccline_blob;
332 	int line, line_offset;
333 	int num_busy, ret;
334 
335 	line_offset  = sector % ump->packet_size;
336 	start_sector = sector - line_offset;
337 	line = (start_sector/ump->packet_size) & UDF_ECCBUF_HASHMASK;
338 
339 	mutex_enter(&priv->discstrat_mutex);
340 	KASSERT(priv->thread_running);
341 
342 retry:
343 	DPRINTF(ECCLINE, ("get line sector %d, line %d\n", sector, line));
344 	LIST_FOREACH(eccline, &priv->eccline_hash[line], hashchain) {
345 		if (eccline->start_sector == start_sector) {
346 			DPRINTF(ECCLINE, ("\tfound eccline, start_sector %d\n",
347 				eccline->start_sector));
348 
349 			UDF_LOCK_ECCLINE(eccline);
350 			/* move from freelist (!) */
351 			if (eccline->queued_on == UDF_SHED_FREE) {
352 				DPRINTF(ECCLINE, ("was on freelist\n"));
353 				KASSERT(eccline->refcnt == 0);
354 				udf_push_eccline(eccline, UDF_SHED_IDLE);
355 			}
356 			eccline->refcnt++;
357 			mutex_exit(&priv->discstrat_mutex);
358 			return eccline;
359 		}
360 	}
361 
362 	DPRINTF(ECCLINE, ("\tnot found in eccline cache\n"));
363 	/* not found in eccline cache */
364 
365 	lb_size  = udf_rw32(ump->logical_vol->lb_size);
366 	blobsize = ump->packet_size * lb_size;
367 
368 	/* dont allow too many pending requests */
369 	DPRINTF(ECCLINE, ("\tallocating new eccline\n"));
370 	num_busy = (priv->num_queued[UDF_SHED_SEQWRITING] + priv->num_floating);
371 	if ((flags & ECC_SEQWRITING) && (num_busy > UDF_ECCLINE_MAXBUSY)) {
372 		ret = cv_timedwait(&priv->discstrat_cv,
373 			&priv->discstrat_mutex, hz/8);
374 		goto retry;
375 	}
376 
377 	eccline_blob = pool_get(&priv->ecclineblob_pool, PR_NOWAIT);
378 	eccline = pool_get(&priv->eccline_pool, PR_NOWAIT);
379 	if ((eccline_blob == NULL) || (eccline == NULL)) {
380 		if (eccline_blob)
381 			pool_put(&priv->ecclineblob_pool, eccline_blob);
382 		if (eccline)
383 			pool_put(&priv->eccline_pool, eccline);
384 
385 		/* out of memory for now; canibalise freelist */
386 		eccline = udf_pop_eccline(priv, UDF_SHED_FREE);
387 		if (eccline == NULL) {
388 			/* serious trouble; wait and retry */
389 			cv_timedwait(&priv->discstrat_cv,
390 				&priv->discstrat_mutex, hz/8);
391 			goto retry;
392 		}
393 		/* push back line if we're waiting for it */
394 		if (eccline->flags & ECC_WANTED) {
395 			udf_push_eccline(eccline, UDF_SHED_IDLE);
396 			goto retry;
397 		}
398 
399 		/* unlink this entry */
400 		LIST_REMOVE(eccline, hashchain);
401 
402 		KASSERT(eccline->flags & ECC_FLOATING);
403 
404 		eccline_blob = eccline->blob;
405 		memset(eccline, 0, sizeof(struct udf_eccline));
406 		eccline->flags = ECC_FLOATING;
407 	} else {
408 		memset(eccline, 0, sizeof(struct udf_eccline));
409 		eccline->flags = ECC_FLOATING;
410 		priv->num_floating++;
411 	}
412 
413 	eccline->queued_on = 0;
414 	eccline->blob = eccline_blob;
415 	eccline->buf  = getiobuf(NULL, true);
416 	eccline->buf->b_private = eccline;	/* IMPORTANT */
417 
418 	/* initialise eccline blob */
419 	memset(eccline->blob, 0, blobsize);
420 
421 	eccline->ump = ump;
422 	eccline->present = eccline->readin = eccline->dirty = 0;
423 	eccline->error = 0;
424 	eccline->refcnt = 0;
425 
426 	eccline->start_sector    = start_sector;
427 	eccline->buf->b_lblkno   = start_sector;
428 	eccline->buf->b_blkno    = start_sector;
429 	eccline->buf->b_rawblkno = start_sector;
430 
431 	LIST_INSERT_HEAD(&priv->eccline_hash[line], eccline, hashchain);
432 
433 	/*
434 	 * TODO possible optimalisation for checking overlap with partitions
435 	 * to get a clue on future eccline usage
436 	 */
437 	eccline->refcnt++;
438 	UDF_LOCK_ECCLINE(eccline);
439 
440 	mutex_exit(&priv->discstrat_mutex);
441 
442 	return eccline;
443 }
444 
445 
446 static void
447 udf_puteccline(struct udf_eccline *eccline)
448 {
449 	struct strat_private *priv = PRIV(eccline->ump);
450 	struct udf_mount *ump = eccline->ump;
451 	uint64_t allbits = ((uint64_t) 1 << ump->packet_size)-1;
452 
453 	mutex_enter(&priv->discstrat_mutex);
454 
455 	/* clear directly all readin requests from present ones */
456 	if (eccline->readin & eccline->present) {
457 		/* clear all read bits that are already read in */
458 		eccline->readin &= (~eccline->present) & allbits;
459 		wakeup(eccline);
460 	}
461 
462 	DPRINTF(ECCLINE, ("put eccline start sector %d, refcnt %d\n",
463 		eccline->start_sector, eccline->refcnt));
464 
465 	/* if we have active nodes we dont set it on seqwriting */
466 	if (eccline->refcnt > 1)
467 		eccline->flags &= ~ECC_SEQWRITING;
468 
469 	vfs_timestamp(&eccline->wait_time);
470 	eccline->wait_time.tv_sec += ECC_WAITTIME;
471 	udf_push_eccline(eccline, UDF_SHED_WAITING);
472 
473 	KASSERT(eccline->refcnt >= 1);
474 	eccline->refcnt--;
475 	UDF_UNLOCK_ECCLINE(eccline);
476 
477 	wakeup(eccline);
478 	mutex_exit(&priv->discstrat_mutex);
479 }
480 
481 /* --------------------------------------------------------------------- */
482 
483 static int
484 udf_create_nodedscr_rmw(struct udf_strat_args *args)
485 {
486 	union dscrptr   **dscrptr  = &args->dscr;
487 	struct udf_mount *ump      = args->ump;
488 	struct long_ad   *icb      = args->icb;
489 	struct udf_eccline *eccline;
490 	uint64_t bit;
491 	uint32_t sectornr, lb_size, dummy;
492 	uint8_t *mem;
493 	int error, eccsect;
494 
495 	error = udf_translate_vtop(ump, icb, &sectornr, &dummy);
496 	if (error)
497 		return error;
498 
499 	lb_size  = udf_rw32(ump->logical_vol->lb_size);
500 
501 	/* get our eccline */
502 	eccline = udf_geteccline(ump, sectornr, 0);
503 	eccsect = sectornr - eccline->start_sector;
504 
505 	bit = (uint64_t) 1 << eccsect;
506 	eccline->readin  &= ~bit;	/* just in case */
507 	eccline->present |=  bit;
508 	eccline->dirty   &= ~bit;	/* Err... euhm... clean? */
509 
510 	eccline->refcnt++;
511 
512 	/* clear space */
513 	mem = ((uint8_t *) eccline->blob) + eccsect * lb_size;
514 	memset(mem, 0, lb_size);
515 
516 	udf_puteccline(eccline);
517 
518 	*dscrptr = (union dscrptr *) mem;
519 	return 0;
520 }
521 
522 
523 static void
524 udf_free_nodedscr_rmw(struct udf_strat_args *args)
525 {
526 	struct udf_mount *ump  = args->ump;
527 	struct long_ad   *icb  = args->icb;
528 	struct udf_eccline *eccline;
529 	uint64_t bit;
530 	uint32_t sectornr, dummy;
531 	int error, eccsect;
532 
533 	error = udf_translate_vtop(ump, icb, &sectornr, &dummy);
534 	if (error)
535 		return;
536 
537 	/* get our eccline */
538 	eccline = udf_geteccline(ump, sectornr, 0);
539 	eccsect = sectornr - eccline->start_sector;
540 
541 	bit = (uint64_t) 1 << eccsect;
542 	eccline->readin &= ~bit;	/* just in case */
543 
544 	KASSERT(eccline->refcnt >= 1);
545 	eccline->refcnt--;
546 
547 	udf_puteccline(eccline);
548 }
549 
550 
/*
 * Read the node descriptor at args->icb through the eccline cache.
 *
 * If the sector is not present yet, a read is requested and we sleep
 * until the sector shows up or is flagged in the line's error bitmap.
 * The descriptor is then validated inside the eccline blob.
 *
 * Returns 0 with args->dscr pointing into the blob (an extra reference
 * is left on the line), 0 with args->dscr == NULL for an all-zero sector,
 * or an error with args->dscr == NULL.  A failed address translation
 * returns its error without touching args->dscr.
 */
static int
udf_read_nodedscr_rmw(struct udf_strat_args *args)
{
	union dscrptr   **dscrptr = &args->dscr;
	struct udf_mount *ump = args->ump;
	struct long_ad   *icb = args->icb;
	struct udf_eccline *eccline;
	uint64_t bit;
	uint32_t sectornr, dummy;
	uint8_t *pos;
	int sector_size = ump->discinfo.sector_size;
	int lb_size = udf_rw32(ump->logical_vol->lb_size);
	int i, error, dscrlen, eccsect;

	/* self-assignment; lb_size is otherwise only used in the KASSERT */
	lb_size = lb_size;
	KASSERT(sector_size == lb_size);
	error = udf_translate_vtop(ump, icb, &sectornr, &dummy);
	if (error)
		return error;

	/* get our eccline */
	eccline = udf_geteccline(ump, sectornr, 0);
	eccsect = sectornr - eccline->start_sector;

	bit = (uint64_t) 1 << eccsect;
	if ((eccline->present & bit) == 0) {
		/* mark bit for readin */
		eccline->readin |= bit;
		eccline->refcnt++;	/* prevent recycling */
		KASSERT(eccline->bufs[eccsect] == NULL);
		udf_puteccline(eccline);

		/* wait for completion; XXX remodel to lock bit code */
		error = 0;
		while ((eccline->present & bit) == 0) {
			tsleep(eccline, PRIBIO+1, "udflvdrd", hz/8);
			if (eccline->error & bit) {
				/*
				 * NOTE(review): refcnt is changed here
				 * without the eccline lock or
				 * discstrat_mutex held -- confirm this is
				 * intended.
				 */
				KASSERT(eccline->refcnt >= 1);
				eccline->refcnt--;	/* undo temp refcnt */
				*dscrptr = NULL;
				return EIO;		/* XXX error code */
			}
		}

		/* reget our line */
		eccline = udf_geteccline(ump, sectornr, 0);
		KASSERT(eccline->refcnt >= 1);
		eccline->refcnt--;	/* undo refcnt */
	}

	*dscrptr = (union dscrptr *)
		(((uint8_t *) eccline->blob) + eccsect * sector_size);

	/* code from read_phys_descr */
	/* check if its a valid tag */
	error = udf_check_tag(*dscrptr);
	if (error) {
		/* check if its an empty block */
		pos = (uint8_t *) *dscrptr;
		for (i = 0; i < sector_size; i++, pos++) {
			if (*pos) break;
		}
		if (i == sector_size) {
			/* return no error but with no dscrptr */
			error = 0;
		}
		*dscrptr = NULL;
		udf_puteccline(eccline);
		return error;
	}

	/* calculate descriptor size */
	dscrlen = udf_tagsize(*dscrptr, sector_size);
	error = udf_check_tag_payload(*dscrptr, dscrlen);
	if (error) {
		*dscrptr = NULL;
		udf_puteccline(eccline);
		return error;
	}

	/* this reference stays behind for the returned descriptor */
	eccline->refcnt++;
	udf_puteccline(eccline);

	return 0;
}
636 
637 
/*
 * Mark the node descriptor in args->dscr for writing.
 *
 * The descriptor is expected to live inside the eccline blob already
 * (see the KASSERT on args->dscr below).  Its tag location and checksums
 * are updated in place and the sector is flagged dirty; the actual write
 * is not issued here.  Afterwards the node's outstanding_nodedscr count is
 * lowered and, when it reaches zero, the node is unlocked and sleepers on
 * the counter are woken.
 *
 * Returns 0, or the error of a failed address translation.
 */
static int
udf_write_nodedscr_rmw(struct udf_strat_args *args)
{
	union dscrptr    *dscrptr = args->dscr;
	struct udf_mount *ump = args->ump;
	struct long_ad   *icb = args->icb;
	struct udf_node *udf_node = args->udf_node;
	struct udf_eccline *eccline;
	uint64_t bit;
	uint32_t sectornr, logsectornr, dummy;
	// int waitfor  = args->waitfor;
	int sector_size = ump->discinfo.sector_size;
	int lb_size = udf_rw32(ump->logical_vol->lb_size);
	int error, eccsect;

	/* self-assignment; lb_size is otherwise only used in the KASSERT */
	lb_size = lb_size;
	KASSERT(sector_size == lb_size);
	sectornr    = 0;
	error = udf_translate_vtop(ump, icb, &sectornr, &dummy);
	if (error)
		return error;

	/* add reference to the vnode to prevent recycling */
	vhold(udf_node->vnode);

	/* get our eccline */
	eccline = udf_geteccline(ump, sectornr, 0);
	eccsect = sectornr - eccline->start_sector;

	bit = (uint64_t) 1 << eccsect;

	/* old callback still pending? */
	if (eccline->bufs[eccsect]) {
		DPRINTF(WRITE, ("udf_write_nodedscr_rmw: writing descriptor"
					" over buffer?\n"));
		/* complete the waiting nested buffer without error */
		nestiobuf_done(eccline->bufs[eccsect],
				eccline->bufs_len[eccsect],
				0);
		eccline->bufs[eccsect] = NULL;
	}

	/* set sector number in the descriptor and validate */
	dscrptr = (union dscrptr *)
		(((uint8_t *) eccline->blob) + eccsect * sector_size);
	KASSERT(dscrptr == args->dscr);

	logsectornr = udf_rw32(icb->loc.lb_num);
	dscrptr->tag.tag_loc = udf_rw32(logsectornr);
	udf_validate_tag_and_crc_sums(dscrptr);

	udf_fixup_node_internals(ump, (uint8_t *) dscrptr, UDF_C_NODE);

	/* set our flags */
	KASSERT(eccline->present & bit);
	eccline->dirty |= bit;

	KASSERT(udf_tagsize(dscrptr, sector_size) <= sector_size);

	udf_puteccline(eccline);

	/* drop the vnode hold taken above and account for this descriptor */
	holdrele(udf_node->vnode);
	udf_node->outstanding_nodedscr--;
	if (udf_node->outstanding_nodedscr == 0) {
		UDF_UNLOCK_NODE(udf_node, udf_node->i_flags & IN_CALLBACK_ULK);
		wakeup(&udf_node->outstanding_nodedscr);
	}

	/* XXX waitfor not used */
	return 0;
}
708 
709 
/*
 * Queue the nested buffer args->nestbuf for this strategy.
 *
 * The buffer is handled sector by sector against the eccline cache in one
 * of three ways:
 *  - reads: sectors already present are copied out of the blob and
 *    completed at once; for the others a read is requested and the buffer
 *    is remembered in the eccline for completion by the read callback;
 *  - writes of type UDF_C_DSCR or UDF_C_NODE: data is copied into the blob
 *    at the address the buffer already carries and marked dirty;
 *  - all other writes: space is allocated late, the new mappings are
 *    translated, and the data is copied into the blob and marked dirty.
 * For both write paths the nested buffer is completed as soon as its data
 * has been copied; the write to disc is not issued here.
 */
static void
udf_queuebuf_rmw(struct udf_strat_args *args)
{
	struct udf_mount *ump = args->ump;
	struct buf *buf = args->nestbuf;
	struct desc_tag *tag;
	struct strat_private *priv = PRIV(ump);
	struct udf_eccline *eccline;
	struct long_ad *node_ad_cpy;
	uint64_t bit, *lmapping, *pmapping, *lmappos, *pmappos, blknr;
	uint32_t buf_len, len, sectors, sectornr, our_sectornr;
	uint32_t bpos;
	uint16_t vpart_num;
	uint8_t *fidblk, *src, *dst;
	int sector_size = ump->discinfo.sector_size;
	int blks = sector_size / DEV_BSIZE;
	int eccsect, what, queue, error;

	KASSERT(ump);
	KASSERT(buf);
	KASSERT(buf->b_iodone == nestiobuf_iodone);

	/* b_blkno counts DEV_BSIZE units; convert to sectors */
	blknr        = buf->b_blkno;
	our_sectornr = blknr / blks;

	/* pick the handling from the direction and the content type */
	what = buf->b_udf_c_type;
	queue = UDF_SHED_READING;
	if ((buf->b_flags & B_READ) == 0) {
		/* writing */
		queue = UDF_SHED_SEQWRITING;
		if (what == UDF_C_DSCR)
			queue = UDF_SHED_WRITING;
		if (what == UDF_C_NODE)
			queue = UDF_SHED_WRITING;
	}

	if (queue == UDF_SHED_READING) {
		DPRINTF(SHEDULE, ("\nudf_queuebuf_rmw READ %p : sector %d type %d,"
			"b_resid %d, b_bcount %d, b_bufsize %d\n",
			buf, (uint32_t) buf->b_blkno / blks, buf->b_udf_c_type,
			buf->b_resid, buf->b_bcount, buf->b_bufsize));

		/* mark bits for reading */
		buf_len = buf->b_bcount;
		sectornr = our_sectornr;
		eccline = udf_geteccline(ump, sectornr, 0);
		eccsect = sectornr - eccline->start_sector;
		bpos = 0;
		while (buf_len) {
			len = MIN(buf_len, sector_size);
			/* ran off the end of this line; move to the next */
			if (eccsect == ump->packet_size) {
				udf_puteccline(eccline);
				eccline = udf_geteccline(ump, sectornr, 0);
				eccsect = sectornr - eccline->start_sector;
			}
			bit = (uint64_t) 1 << eccsect;
			error = eccline->error & bit ? EIO : 0;
			if (eccline->present & bit) {
				/* cached; serve it straight from the blob */
				src = (uint8_t *) eccline->blob +
					eccsect * sector_size;
				dst = (uint8_t *) buf->b_data + bpos;
				if (!error)
					memcpy(dst, src, len);
				nestiobuf_done(buf, len, error);
			} else {
				/* request it; the read callback completes */
				eccline->readin |= bit;
				KASSERT(eccline->bufs[eccsect] == NULL);
				eccline->bufs[eccsect] = buf;
				eccline->bufs_bpos[eccsect] = bpos;
				eccline->bufs_len[eccsect] = len;
			}
			bpos += sector_size;
			eccsect++;
			sectornr++;
			buf_len -= len;
		}
		udf_puteccline(eccline);
		return;
	}

	if (queue == UDF_SHED_WRITING) {
		DPRINTF(SHEDULE, ("\nudf_queuebuf_rmw WRITE %p : sector %d "
			"type %d, b_resid %d, b_bcount %d, b_bufsize %d\n",
			buf, (uint32_t) buf->b_blkno / blks, buf->b_udf_c_type,
			buf->b_resid, buf->b_bcount, buf->b_bufsize));
		/* if we have FIDs fixup using buffer's sector number(s) */
		if (buf->b_udf_c_type == UDF_C_FIDS) {
			/* only DSCR and NODE types select this path above */
			panic("UDF_C_FIDS in SHED_WRITING!\n");
#if 0
			buf_len = buf->b_bcount;
			sectornr = our_sectornr;
			bpos = 0;
			while (buf_len) {
				len = MIN(buf_len, sector_size);
				fidblk = (uint8_t *) buf->b_data + bpos;
				udf_fixup_fid_block(fidblk, sector_size,
					0, len, sectornr);
				sectornr++;
				bpos += len;
				buf_len -= len;
			}
#endif
		}
		udf_fixup_node_internals(ump, buf->b_data, buf->b_udf_c_type);

		/* copy parts into the bufs and set for writing */
		buf_len = buf->b_bcount;
		sectornr = our_sectornr;
		eccline = udf_geteccline(ump, sectornr, 0);
		eccsect = sectornr - eccline->start_sector;
		bpos = 0;
		while (buf_len) {
			len = MIN(buf_len, sector_size);
			/* ran off the end of this line; move to the next */
			if (eccsect == ump->packet_size) {
				udf_puteccline(eccline);
				eccline = udf_geteccline(ump, sectornr, 0);
				eccsect = sectornr - eccline->start_sector;
			}
			bit = (uint64_t) 1 << eccsect;
			KASSERT((eccline->readin & bit) == 0);
			eccline->present |= bit;
			eccline->dirty   |= bit;
			if (eccline->bufs[eccsect]) {
				/* old callback still pending */
				nestiobuf_done(eccline->bufs[eccsect],
						eccline->bufs_len[eccsect],
						0);
				eccline->bufs[eccsect] = NULL;
			}

			/* a short last piece gets zero padded to a sector */
			src = (uint8_t *) buf->b_data + bpos;
			dst = (uint8_t *) eccline->blob + eccsect * sector_size;
			if (len != sector_size)
				memset(dst, 0, sector_size);
			memcpy(dst, src, len);

			/* note that its finished for this extent */
			eccline->bufs[eccsect] = NULL;
			nestiobuf_done(buf, len, 0);

			bpos += sector_size;
			eccsect++;
			sectornr++;
			buf_len -= len;
		}
		udf_puteccline(eccline);
		return;

	}

	/* sequential writing */
	KASSERT(queue == UDF_SHED_SEQWRITING);
	DPRINTF(SHEDULE, ("\nudf_queuebuf_rmw SEQWRITE %p : sector XXXX "
		"type %d, b_resid %d, b_bcount %d, b_bufsize %d\n",
		buf, buf->b_udf_c_type, buf->b_resid, buf->b_bcount,
		buf->b_bufsize));
	/*
	 * Buffers should not have been allocated to disc addresses yet on
	 * this queue. Note that a buffer can get multiple extents allocated.
	 * Note that it *looks* like the normal writing but its different in
	 * the details.
	 *
	 * lmapping contains lb_num relative to base partition.
	 *
	 * XXX should we try to claim/organize the allocated memory to
	 * block-aligned pieces?
	 */
	/* the mapping arrays hang off the mount and are shared */
	mutex_enter(&priv->seqwrite_mutex);

	lmapping    = ump->la_lmapping;
	node_ad_cpy = ump->la_node_ad_cpy;

	/* logically allocate buf and map it in the file */
	udf_late_allocate_buf(ump, buf, lmapping, node_ad_cpy, &vpart_num);

	/* if we have FIDs, fixup using the new allocation table */
	if (buf->b_udf_c_type == UDF_C_FIDS) {
		buf_len = buf->b_bcount;
		bpos = 0;
		lmappos = lmapping;
		while (buf_len) {
			sectornr = *lmappos++;
			len = MIN(buf_len, sector_size);
			fidblk = (uint8_t *) buf->b_data + bpos;
			udf_fixup_fid_block(fidblk, sector_size,
				0, len, sectornr);
			bpos += len;
			buf_len -= len;
		}
	}
	if (buf->b_udf_c_type == UDF_C_METADATA_SBM) {
		if (buf->b_lblkno == 0) {
			/* update the tag location inside */
			tag = (struct desc_tag *) buf->b_data;
			tag->tag_loc = udf_rw32(*lmapping);
			udf_validate_tag_and_crc_sums(buf->b_data);
		}
	}
	udf_fixup_node_internals(ump, buf->b_data, buf->b_udf_c_type);

	/*
	 * Translate new mappings in lmapping to pmappings.
	 * pmapping to contain lb_nums as used for disc addressing.
	 */
	pmapping = ump->la_pmapping;
	sectors  = (buf->b_bcount + sector_size -1) / sector_size;
	udf_translate_vtop_list(ump, sectors, vpart_num, lmapping, pmapping);

	/* copy parts into the bufs and set for writing */
	pmappos = pmapping;
	buf_len = buf->b_bcount;
	sectornr = *pmappos++;
	eccline = udf_geteccline(ump, sectornr, ECC_SEQWRITING);
	eccsect = sectornr - eccline->start_sector;
	bpos = 0;
	while (buf_len) {
		len = MIN(buf_len, sector_size);
		eccsect = sectornr - eccline->start_sector;
		/* mapped sectors need not be consecutive; switch lines */
		if ((eccsect < 0) || (eccsect >= ump->packet_size)) {
			eccline->flags |= ECC_SEQWRITING;
			udf_puteccline(eccline);
			eccline = udf_geteccline(ump, sectornr, ECC_SEQWRITING);
			eccsect = sectornr - eccline->start_sector;
		}
		bit = (uint64_t) 1 << eccsect;
		KASSERT((eccline->readin & bit) == 0);
		eccline->present |= bit;
		eccline->dirty   |= bit;
		eccline->bufs[eccsect] = NULL;

		/* a short last piece gets zero padded to a sector */
		src = (uint8_t *) buf->b_data + bpos;
		dst = (uint8_t *)
			eccline->blob + eccsect * sector_size;
		if (len != sector_size)
			memset(dst, 0, sector_size);
		memcpy(dst, src, len);

		/* note that its finished for this extent */
		nestiobuf_done(buf, len, 0);

		bpos += sector_size;
		/*
		 * NOTE(review): on the last pass this fetches the entry
		 * after the last translated mapping; the value is not used
		 * but the read assumes la_pmapping has room for one more
		 * element -- confirm against its allocation.
		 */
		sectornr = *pmappos++;
		buf_len -= len;
	}
	eccline->flags |= ECC_SEQWRITING;
	udf_puteccline(eccline);
	mutex_exit(&priv->seqwrite_mutex);
}
958 
959 /* --------------------------------------------------------------------- */
960 
961 static void
962 udf_shedule_read_callback(struct buf *buf)
963 {
964 	struct udf_eccline *eccline = BTOE(buf);
965 	struct udf_mount *ump = eccline->ump;
966 	uint64_t bit;
967 	uint8_t *src, *dst;
968 	int sector_size = ump->discinfo.sector_size;
969 	int error, i, len;
970 
971 	DPRINTF(ECCLINE, ("read callback called\n"));
972 	/* post process read action */
973 	error = buf->b_error;
974 	for (i = 0; i < ump->packet_size; i++) {
975 		bit = (uint64_t) 1 << i;
976 		src = (uint8_t *) buf->b_data +   i * sector_size;
977 		dst = (uint8_t *) eccline->blob + i * sector_size;
978 		if (eccline->present & bit)
979 			continue;
980 		eccline->present |= bit;
981 		if (error)
982 			eccline->error |= bit;
983 		if (eccline->bufs[i]) {
984 			dst = (uint8_t *) eccline->bufs[i]->b_data +
985 				eccline->bufs_bpos[i];
986 			len = eccline->bufs_len[i];
987 			if (!error)
988 				memcpy(dst, src, len);
989 			nestiobuf_done(eccline->bufs[i], len, error);
990 			eccline->bufs[i] = NULL;
991 		}
992 
993 	}
994 	KASSERT(buf->b_data == eccline->blob);
995 	KASSERT(eccline->present == ((uint64_t) 1 << ump->packet_size)-1);
996 
997 	/*
998 	 * XXX TODO what to do on read errors? read in all sectors
999 	 * synchronously and allocate a sparable entry?
1000 	 */
1001 
1002 	udf_puteccline(eccline);
1003 	DPRINTF(ECCLINE, ("read callback finished\n"));
1004 }
1005 
1006 
1007 static void
1008 udf_shedule_write_callback(struct buf *buf)
1009 {
1010 	struct udf_eccline *eccline = BTOE(buf);
1011 	struct udf_mount *ump = eccline->ump;
1012 	uint64_t bit;
1013 	int error, i, len;
1014 
1015 	DPRINTF(ECCLINE, ("write callback called\n"));
1016 	/* post process write action */
1017 	error = buf->b_error;
1018 	for (i = 0; i < ump->packet_size; i++) {
1019 		bit = (uint64_t) 1 << i;
1020 		if ((eccline->dirty & bit) == 0)
1021 			continue;
1022 		if (error) {
1023 			eccline->error |= bit;
1024 		} else {
1025 			eccline->dirty &= ~bit;
1026 		}
1027 		if (eccline->bufs[i]) {
1028 			len = eccline->bufs_len[i];
1029 			nestiobuf_done(eccline->bufs[i], len, error);
1030 			eccline->bufs[i] = NULL;
1031 		}
1032 	}
1033 	KASSERT(eccline->dirty == 0);
1034 
1035 	KASSERT(error == 0);
1036 	/*
1037 	 * XXX TODO on write errors allocate a sparable entry and reissue
1038 	 */
1039 
1040 	udf_puteccline(eccline);
1041 }
1042 
1043 
1044 static void
1045 udf_issue_eccline(struct udf_eccline *eccline, int queued_on)
1046 {
1047 	struct udf_mount *ump = eccline->ump;
1048 	struct strat_private *priv = PRIV(ump);
1049 	struct buf *buf, *nestbuf;
1050 	uint64_t bit, allbits = ((uint64_t) 1 << ump->packet_size)-1;
1051 	uint32_t start;
1052 	int sector_size = ump->discinfo.sector_size;
1053 	int blks = sector_size / DEV_BSIZE;
1054 	int i;
1055 
1056 	if (queued_on == UDF_SHED_READING) {
1057 		DPRINTF(SHEDULE, ("udf_issue_eccline reading : "));
1058 		/* read all bits that are not yet present */
1059 		eccline->readin = (~eccline->present) & allbits;
1060 		KASSERT(eccline->readin);
1061 		start = eccline->start_sector;
1062 		buf = eccline->buf;
1063 		buf->b_flags    = B_READ | B_ASYNC;
1064 		SET(buf->b_cflags, BC_BUSY);	/* mark buffer busy */
1065 		buf->b_oflags   = 0;
1066 		buf->b_iodone   = udf_shedule_read_callback;
1067 		buf->b_data     = eccline->blob;
1068 		buf->b_bcount   = ump->packet_size * sector_size;
1069 		buf->b_resid    = buf->b_bcount;
1070 		buf->b_bufsize  = buf->b_bcount;
1071 		buf->b_private  = eccline;
1072 		BIO_SETPRIO(buf, BPRIO_DEFAULT);
1073 		buf->b_lblkno   = buf->b_blkno = buf->b_rawblkno = start * blks;
1074 		buf->b_proc     = NULL;
1075 
1076 		if (eccline->present != 0) {
1077 			for (i = 0; i < ump->packet_size; i++) {
1078 				bit = (uint64_t) 1 << i;
1079 				if (eccline->present & bit) {
1080 					nestiobuf_done(buf, sector_size, 0);
1081 					continue;
1082 				}
1083 				nestbuf = getiobuf(NULL, true);
1084 				nestiobuf_setup(buf, nestbuf, i * sector_size,
1085 					sector_size);
1086 				/* adjust blocknumber to read */
1087 				nestbuf->b_blkno = buf->b_blkno + i*blks;
1088 				nestbuf->b_rawblkno = buf->b_rawblkno + i*blks;
1089 
1090 				DPRINTF(SHEDULE, ("sector %d ",
1091 					start + i));
1092 				/* call asynchronous */
1093 				VOP_STRATEGY(ump->devvp, nestbuf);
1094 			}
1095 			DPRINTF(SHEDULE, ("\n"));
1096 			return;
1097 		}
1098 	} else {
1099 		/* write or seqwrite */
1100 		DPRINTF(SHEDULE, ("udf_issue_eccline writing or seqwriting : "));
1101 		DPRINTF(SHEDULE, ("\n\tpresent %"PRIx64", readin %"PRIx64", "
1102 			"dirty %"PRIx64"\n\t", eccline->present, eccline->readin,
1103 			eccline->dirty));
1104 		if (eccline->present != allbits) {
1105 			/* requeue to read-only */
1106 			DPRINTF(SHEDULE, ("\n\t-> not complete, requeue to "
1107 				"reading\n"));
1108 			udf_push_eccline(eccline, UDF_SHED_READING);
1109 			return;
1110 		}
1111 		start = eccline->start_sector;
1112 		buf = eccline->buf;
1113 		buf->b_flags    = B_WRITE | B_ASYNC;
1114 		SET(buf->b_cflags, BC_BUSY);	/* mark buffer busy */
1115 		buf->b_oflags   = 0;
1116 		buf->b_iodone   = udf_shedule_write_callback;
1117 		buf->b_data     = eccline->blob;
1118 		buf->b_bcount   = ump->packet_size * sector_size;
1119 		buf->b_resid    = buf->b_bcount;
1120 		buf->b_bufsize  = buf->b_bcount;
1121 		buf->b_private  = eccline;
1122 		BIO_SETPRIO(buf, BPRIO_DEFAULT);
1123 		buf->b_lblkno   = buf->b_blkno = buf->b_rawblkno = start * blks;
1124 		buf->b_proc     = NULL;
1125 	}
1126 
1127 	mutex_exit(&priv->discstrat_mutex);
1128 		/* call asynchronous */
1129 		DPRINTF(SHEDULE, ("sector %d for %d\n",
1130 			start, ump->packet_size));
1131 		VOP_STRATEGY(ump->devvp, buf);
1132 	mutex_enter(&priv->discstrat_mutex);
1133 }
1134 
1135 
1136 static void
1137 udf_discstrat_thread(void *arg)
1138 {
1139 	struct udf_mount *ump = (struct udf_mount *) arg;
1140 	struct strat_private *priv = PRIV(ump);
1141 	struct udf_eccline *eccline;
1142 	struct timespec now, *last;
1143 	uint64_t allbits = ((uint64_t) 1 << ump->packet_size)-1;
1144 	int new_queue, wait, work, num, cnt;
1145 
1146 	work = 1;
1147 	priv->thread_running = 1;
1148 	mutex_enter(&priv->discstrat_mutex);
1149 	priv->num_floating = 0;
1150 	while (priv->run_thread || work || priv->num_floating) {
1151 		/* get our time */
1152 		vfs_timestamp(&now);
1153 
1154 		/* maintenance: handle eccline state machine */
1155 		num = priv->num_queued[UDF_SHED_WAITING];
1156 		cnt = 0;
1157 		while (cnt < num) {
1158 			eccline = udf_pop_eccline(priv, UDF_SHED_WAITING);
1159 			/* requeue */
1160 			new_queue = UDF_SHED_FREE;
1161 			if (eccline->refcnt > 0)
1162 				new_queue = UDF_SHED_IDLE;
1163 			if (eccline->flags & ECC_WANTED)
1164 				new_queue = UDF_SHED_IDLE;
1165 			if (eccline->readin)
1166 				new_queue = UDF_SHED_READING;
1167 			if (eccline->dirty) {
1168 				new_queue = UDF_SHED_WAITING;
1169 				if ((eccline->wait_time.tv_sec - now.tv_sec <= 0) ||
1170 				   ((eccline->present == allbits) &&
1171 				    (eccline->flags & ECC_SEQWRITING)))
1172 				{
1173 					new_queue = UDF_SHED_WRITING;
1174 					if (eccline->flags & ECC_SEQWRITING)
1175 						new_queue = UDF_SHED_SEQWRITING;
1176 					if (eccline->present != allbits)
1177 						new_queue = UDF_SHED_READING;
1178 				}
1179 			}
1180 			udf_push_eccline(eccline, new_queue);
1181 			cnt++;
1182 		}
1183 
1184 		/* maintenance: free exess ecclines */
1185 		while (priv->num_queued[UDF_SHED_FREE] > UDF_ECCLINE_MAXFREE) {
1186 			eccline = udf_pop_eccline(priv, UDF_SHED_FREE);
1187 			KASSERT(eccline);
1188 			KASSERT(eccline->refcnt == 0);
1189 			if (eccline->flags & ECC_WANTED) {
1190 				udf_push_eccline(eccline, UDF_SHED_IDLE);
1191 				DPRINTF(ECCLINE, ("Tried removing, pushed back to free list\n"));
1192 			} else {
1193 				DPRINTF(ECCLINE, ("Removing entry from free list\n"));
1194 				udf_dispose_eccline(eccline);
1195 			}
1196 		}
1197 
1198 		/* process the current selected queue */
1199 		/* get our time */
1200 		vfs_timestamp(&now);
1201 		last = &priv->last_queued[priv->cur_queue];
1202 
1203 		/* get our line */
1204 		eccline = udf_pop_eccline(priv, priv->cur_queue);
1205 		if (eccline) {
1206 			wait = 0;
1207 			new_queue = priv->cur_queue;
1208 			DPRINTF(ECCLINE, ("UDF_ISSUE_ECCLINE\n"));
1209 
1210 			/* complete the `get' by locking and refcounting it */
1211 			UDF_LOCK_ECCLINE(eccline);
1212 			eccline->refcnt++;
1213 
1214 			udf_issue_eccline(eccline, priv->cur_queue);
1215 		} else {
1216 			/* don't switch too quickly */
1217 			if (now.tv_sec - last->tv_sec < 2) {
1218 				/* wait some time */
1219 				cv_timedwait(&priv->discstrat_cv,
1220 					&priv->discstrat_mutex, hz);
1221 				/* we assume there is work to be done */
1222 				work = 1;
1223 				continue;
1224 			}
1225 
1226 			/* XXX select on queue lengths ? */
1227 			wait = 1;
1228 			/* check if we can/should switch */
1229 			new_queue = priv->cur_queue;
1230 			if (BUFQ_PEEK(priv->queues[UDF_SHED_READING]))
1231 				new_queue = UDF_SHED_READING;
1232 			if (BUFQ_PEEK(priv->queues[UDF_SHED_WRITING]))
1233 				new_queue = UDF_SHED_WRITING;
1234 			if (BUFQ_PEEK(priv->queues[UDF_SHED_SEQWRITING]))
1235 				new_queue = UDF_SHED_SEQWRITING;
1236 		}
1237 
1238 		/* give room */
1239 		mutex_exit(&priv->discstrat_mutex);
1240 
1241 		if (new_queue != priv->cur_queue) {
1242 			wait = 0;
1243 			DPRINTF(SHEDULE, ("switching from %d to %d\n",
1244 				priv->cur_queue, new_queue));
1245 			priv->cur_queue = new_queue;
1246 		}
1247 		mutex_enter(&priv->discstrat_mutex);
1248 
1249 		/* wait for more if needed */
1250 		if (wait)
1251 			cv_timedwait(&priv->discstrat_cv,
1252 				&priv->discstrat_mutex, hz/4);	/* /8 */
1253 
1254 		work  = (BUFQ_PEEK(priv->queues[UDF_SHED_WAITING]) != NULL);
1255 		work |= (BUFQ_PEEK(priv->queues[UDF_SHED_READING]) != NULL);
1256 		work |= (BUFQ_PEEK(priv->queues[UDF_SHED_WRITING]) != NULL);
1257 		work |= (BUFQ_PEEK(priv->queues[UDF_SHED_SEQWRITING]) != NULL);
1258 
1259 		DPRINTF(PARANOIA, ("work : (%d, %d, %d) -> work %d, float %d\n",
1260 			(BUFQ_PEEK(priv->queues[UDF_SHED_READING]) != NULL),
1261 			(BUFQ_PEEK(priv->queues[UDF_SHED_WRITING]) != NULL),
1262 			(BUFQ_PEEK(priv->queues[UDF_SHED_SEQWRITING]) != NULL),
1263 			work, priv->num_floating));
1264 	}
1265 
1266 	mutex_exit(&priv->discstrat_mutex);
1267 
1268 	/* tear down remaining ecclines */
1269 	mutex_enter(&priv->discstrat_mutex);
1270 	KASSERT(priv->num_queued[UDF_SHED_WAITING] == 0);
1271 	KASSERT(priv->num_queued[UDF_SHED_IDLE] == 0);
1272 	KASSERT(priv->num_queued[UDF_SHED_READING] == 0);
1273 	KASSERT(priv->num_queued[UDF_SHED_WRITING] == 0);
1274 	KASSERT(priv->num_queued[UDF_SHED_SEQWRITING] == 0);
1275 
1276 	KASSERT(BUFQ_PEEK(priv->queues[UDF_SHED_WAITING]) == NULL);
1277 	KASSERT(BUFQ_PEEK(priv->queues[UDF_SHED_IDLE]) == NULL);
1278 	KASSERT(BUFQ_PEEK(priv->queues[UDF_SHED_READING]) == NULL);
1279 	KASSERT(BUFQ_PEEK(priv->queues[UDF_SHED_WRITING]) == NULL);
1280 	KASSERT(BUFQ_PEEK(priv->queues[UDF_SHED_SEQWRITING]) == NULL);
1281 	eccline = udf_pop_eccline(priv, UDF_SHED_FREE);
1282 	while (eccline) {
1283 		udf_dispose_eccline(eccline);
1284 		eccline = udf_pop_eccline(priv, UDF_SHED_FREE);
1285 	}
1286 	KASSERT(priv->num_queued[UDF_SHED_FREE] == 0);
1287 	mutex_exit(&priv->discstrat_mutex);
1288 
1289 	priv->thread_running  = 0;
1290 	priv->thread_finished = 1;
1291 	wakeup(&priv->run_thread);
1292 	kthread_exit(0);
1293 	/* not reached */
1294 }
1295 
1296 /* --------------------------------------------------------------------- */
1297 
1298 /*
1299  * Buffer memory pool allocator.
1300  */
1301 
1302 static void *
1303 ecclinepool_page_alloc(struct pool *pp, int flags)
1304 {
1305         return (void *)uvm_km_alloc(kernel_map,
1306             MAXBSIZE, MAXBSIZE,
1307             ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
1308 	    	| UVM_KMF_WIRED /* UVM_KMF_PAGABLE? */);
1309 }
1310 
1311 static void
1312 ecclinepool_page_free(struct pool *pp, void *v)
1313 {
1314         uvm_km_free(kernel_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED);
1315 }
1316 
1317 static struct pool_allocator ecclinepool_allocator = {
1318         .pa_alloc = ecclinepool_page_alloc,
1319         .pa_free  = ecclinepool_page_free,
1320         .pa_pagesz = MAXBSIZE,
1321 };
1322 
1323 
1324 static void
1325 udf_discstrat_init_rmw(struct udf_strat_args *args)
1326 {
1327 	struct udf_mount *ump = args->ump;
1328 	struct strat_private *priv = PRIV(ump);
1329 	uint32_t lb_size, blobsize, hashline;
1330 	int i;
1331 
1332 	KASSERT(ump);
1333 	KASSERT(ump->logical_vol);
1334 	KASSERT(priv == NULL);
1335 
1336 	lb_size = udf_rw32(ump->logical_vol->lb_size);
1337 	blobsize = ump->packet_size * lb_size;
1338 	KASSERT(lb_size > 0);
1339 	KASSERT(ump->packet_size <= 64);
1340 
1341 	/* initialise our memory space */
1342 	ump->strategy_private = malloc(sizeof(struct strat_private),
1343 		M_UDFTEMP, M_WAITOK);
1344 	priv = ump->strategy_private;
1345 	memset(priv, 0 , sizeof(struct strat_private));
1346 
1347 	/* initialise locks */
1348 	cv_init(&priv->discstrat_cv, "udfstrat");
1349 	mutex_init(&priv->discstrat_mutex, MUTEX_DEFAULT, IPL_NONE);
1350 	mutex_init(&priv->seqwrite_mutex, MUTEX_DEFAULT, IPL_NONE);
1351 
1352 	/* initialise struct eccline pool */
1353 	pool_init(&priv->eccline_pool, sizeof(struct udf_eccline),
1354 		0, 0, 0, "udf_eccline_pool", NULL, IPL_NONE);
1355 
1356 	/* initialise eccline blob pool */
1357         ecclinepool_allocator.pa_pagesz = blobsize;
1358 	pool_init(&priv->ecclineblob_pool, blobsize,
1359 		0, 0, 0, "udf_eccline_blob", &ecclinepool_allocator, IPL_NONE);
1360 
1361 	/* initialise main queues */
1362 	for (i = 0; i < UDF_SHED_MAX; i++) {
1363 		priv->num_queued[i] = 0;
1364 		vfs_timestamp(&priv->last_queued[i]);
1365 	}
1366 	bufq_alloc(&priv->queues[UDF_SHED_WAITING], "fcfs",
1367 		BUFQ_SORT_RAWBLOCK);
1368 	bufq_alloc(&priv->queues[UDF_SHED_READING], "disksort",
1369 		BUFQ_SORT_RAWBLOCK);
1370 	bufq_alloc(&priv->queues[UDF_SHED_WRITING], "disksort",
1371 		BUFQ_SORT_RAWBLOCK);
1372 	bufq_alloc(&priv->queues[UDF_SHED_SEQWRITING], "disksort", 0);
1373 
1374 	/* initialise administrative queues */
1375 	bufq_alloc(&priv->queues[UDF_SHED_IDLE], "fcfs", 0);
1376 	bufq_alloc(&priv->queues[UDF_SHED_FREE], "fcfs", 0);
1377 
1378 	for (hashline = 0; hashline < UDF_ECCBUF_HASHSIZE; hashline++) {
1379 		LIST_INIT(&priv->eccline_hash[hashline]);
1380 	}
1381 
1382 	/* create our disk strategy thread */
1383 	priv->cur_queue = UDF_SHED_READING;
1384 	priv->thread_finished = 0;
1385 	priv->thread_running  = 0;
1386 	priv->run_thread      = 1;
1387 	if (kthread_create(PRI_NONE, 0 /* KTHREAD_MPSAFE*/, NULL /* cpu_info*/,
1388 		udf_discstrat_thread, ump, &priv->queue_lwp,
1389 		"%s", "udf_rw")) {
1390 		panic("fork udf_rw");
1391 	}
1392 
1393 	/* wait for thread to spin up */
1394 	while (!priv->thread_running) {
1395 		tsleep(&priv->thread_running, PRIBIO+1, "udfshedstart", hz);
1396 	}
1397 }
1398 
1399 
1400 static void
1401 udf_discstrat_finish_rmw(struct udf_strat_args *args)
1402 {
1403 	struct udf_mount *ump = args->ump;
1404 	struct strat_private *priv = PRIV(ump);
1405 	int error;
1406 
1407 	if (ump == NULL)
1408 		return;
1409 
1410 	/* stop our sheduling thread */
1411 	KASSERT(priv->run_thread == 1);
1412 	priv->run_thread = 0;
1413 	wakeup(priv->queue_lwp);
1414 	while (!priv->thread_finished) {
1415 		error = tsleep(&priv->run_thread, PRIBIO+1,
1416 			"udfshedfin", hz);
1417 	}
1418 	/* kthread should be finished now */
1419 
1420 	/* cleanup our pools */
1421 	pool_destroy(&priv->eccline_pool);
1422 	pool_destroy(&priv->ecclineblob_pool);
1423 
1424 	cv_destroy(&priv->discstrat_cv);
1425 	mutex_destroy(&priv->discstrat_mutex);
1426 	mutex_destroy(&priv->seqwrite_mutex);
1427 
1428 	/* free our private space */
1429 	free(ump->strategy_private, M_UDFTEMP);
1430 	ump->strategy_private = NULL;
1431 }
1432 
1433 /* --------------------------------------------------------------------- */
1434 
1435 struct udf_strategy udf_strat_rmw =
1436 {
1437 	udf_create_nodedscr_rmw,
1438 	udf_free_nodedscr_rmw,
1439 	udf_read_nodedscr_rmw,
1440 	udf_write_nodedscr_rmw,
1441 	udf_queuebuf_rmw,
1442 	udf_discstrat_init_rmw,
1443 	udf_discstrat_finish_rmw
1444 };
1445 
1446