/*	$NetBSD: bufq_priocscan.c,v 1.18 2014/01/28 12:50:54 martin Exp $	*/

/*-
 * Copyright (c)2004,2005,2006,2008,2009,2011,2012 YAMAMOTO Takashi,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: bufq_priocscan.c,v 1.18 2014/01/28 12:50:54 martin Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/bufq_impl.h>
#include <sys/kmem.h>
#include <sys/rbtree.h>

#undef	PRIOCSCAN_USE_GLOBAL_POSITION

/*
 * Cyclical scan (CSCAN)
 */
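/*
 * requests are kept sorted by disk address and served in ascending
 * order; when no request lies at or beyond the current scan position,
 * the scan wraps around to the lowest pending address and starts over.
 * unlike a plain elevator, the return sweep serves nothing, which
 * keeps waiting times more uniform across the disk.
 */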

struct cscan_key {
	daddr_t	k_rawblkno;
	int k_cylinder;
};

struct cscan_queue {
	rb_tree_t cq_buffers;		/* ordered list of buffers */
#if !defined(PRIOCSCAN_USE_GLOBAL_POSITION)
	struct cscan_key cq_lastkey;	/* key of last request */
#endif /* !defined(PRIOCSCAN_USE_GLOBAL_POSITION) */
	int cq_sortby;			/* BUFQ_SORT_MASK */
	rb_tree_ops_t cq_ops;
};

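/*
 * buf_cmp:
 *
 * three-way comparison of two buffers in the given sort order,
 * built on buf_inorder().
 */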
static signed int
buf_cmp(const struct buf *b1, const struct buf *b2, int sortby)
{

	if (buf_inorder(b2, b1, sortby)) {
		return 1;	/* b1 > b2 */
	}
	if (buf_inorder(b1, b2, sortby)) {
		return -1;	/* b1 < b2 */
	}
	return 0;
}

/* return positive if n1 > n2 */
static signed int
cscan_tree_compare_nodes(void *context, const void *n1, const void *n2)
{
	const struct cscan_queue * const q = context;
	const struct buf * const b1 = n1;
	const struct buf * const b2 = n2;
	const int sortby = q->cq_sortby;
	const int diff = buf_cmp(b1, b2, sortby);

	if (diff != 0) {
		return diff;
	}

	/*
	 * XXX rawblkno/cylinder might not be unique.  eg. unbuffered i/o.
	 * fall back to comparing the buffer addresses so that every node
	 * has a distinct place in the tree.
	 */
	if (b1 > b2) {
		return 1;
	}
	if (b1 < b2) {
		return -1;
	}
	return 0;
}

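/*
 * compare a tree node against a search key; a temporary struct buf is
 * filled in from the key so that buf_cmp() can be reused for the
 * comparison.
 */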
/* return positive if n1 > k2 */
static signed int
cscan_tree_compare_key(void *context, const void *n1, const void *k2)
{
	const struct cscan_queue * const q = context;
	const struct buf * const b1 = n1;
	const struct cscan_key * const key = k2;
	const struct buf tmp = {
		.b_rawblkno = key->k_rawblkno,
		.b_cylinder = key->k_cylinder,
	};
	const struct buf *b2 = &tmp;
	const int sortby = q->cq_sortby;

	return buf_cmp(b1, b2, sortby);
}

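/*
 * cscan_dump:
 *
 * debug helper; print the queued requests in tree order.
 * not called anywhere by default, hence __unused.
 */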
static void __unused
cscan_dump(struct cscan_queue *cq)
{
	const int sortby = cq->cq_sortby;
	struct buf *bp;

	RB_TREE_FOREACH(bp, &cq->cq_buffers) {
		if (sortby == BUFQ_SORT_RAWBLOCK) {
			printf(" %jd", (intmax_t)bp->b_rawblkno);
		} else {
			printf(" %jd/%jd",
			    (intmax_t)bp->b_cylinder, (intmax_t)bp->b_rawblkno);
		}
	}
}

static inline bool
cscan_empty(struct cscan_queue *q)
{

	/* XXX this might do more work than necessary */
	return rb_tree_iterate(&q->cq_buffers, NULL, RB_DIR_LEFT) == NULL;
}

static void
cscan_put(struct cscan_queue *q, struct buf *bp)
{
	struct buf *obp __diagused;

	obp = rb_tree_insert_node(&q->cq_buffers, bp);
	KASSERT(obp == bp); /* see cscan_tree_compare_nodes */
}

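/*
 * cscan_get:
 *
 * return the first buffer at or after *key in the sort order, wrapping
 * around to the smallest key when the scan runs off the end of the
 * tree.  if "remove" is set, also dequeue the buffer and advance *key
 * past it.
 */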
static struct buf *
cscan_get(struct cscan_queue *q, int remove, struct cscan_key *key)
{
	struct buf *bp;

	bp = rb_tree_find_node_geq(&q->cq_buffers, key);
	KDASSERT(bp == NULL || cscan_tree_compare_key(q, bp, key) >= 0);
	if (bp == NULL) {
		/* nothing at or after the key; wrap around to the start */
		bp = rb_tree_iterate(&q->cq_buffers, NULL, RB_DIR_LEFT);
		KDASSERT(cscan_tree_compare_key(q, bp, key) < 0);
	}
	if (bp != NULL && remove) {
#if defined(DEBUG)
		struct buf *nbp;
#endif /* defined(DEBUG) */

		rb_tree_remove_node(&q->cq_buffers, bp);
		/*
		 * remember the head position, i.e. advance the scan
		 * position past the request we are about to serve.
		 */
		key->k_cylinder = bp->b_cylinder;
		key->k_rawblkno = bp->b_rawblkno + (bp->b_bcount >> DEV_BSHIFT);
#if defined(DEBUG)
		nbp = rb_tree_find_node_geq(&q->cq_buffers, key);
		if (nbp != NULL && cscan_tree_compare_nodes(q, nbp, bp) < 0) {
			panic("%s: wrong order %p < %p\n", __func__,
			    nbp, bp);
		}
#endif /* defined(DEBUG) */
	}
	return bp;
}

static void
cscan_init(struct cscan_queue *q, int sortby)
{
	static const rb_tree_ops_t cscan_tree_ops = {
		.rbto_compare_nodes = cscan_tree_compare_nodes,
		.rbto_compare_key = cscan_tree_compare_key,
		.rbto_node_offset = offsetof(struct buf, b_u.u_rbnode),
		.rbto_context = NULL,
	};

	q->cq_sortby = sortby;
	/* XXX copy ops to work around an rbtree.h API limitation */
	q->cq_ops = cscan_tree_ops;
	q->cq_ops.rbto_context = q;
	rb_tree_init(&q->cq_buffers, &q->cq_ops);
}

/*
 * Per-priority CSCAN.
 *
 * XXX probably we should have a way to raise the
 * priority of on-queue requests.
 */
#define	PRIOCSCAN_NQUEUE	3

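/*
 * each priority level gets its own cscan queue plus the remainder of
 * its current burst; queue 0 holds the BPRIO_TIMECRITICAL requests and
 * is served first (see bufq_priocscan_selectqueue below).
 */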
struct priocscan_queue {
	struct cscan_queue q_queue;
	unsigned int q_burst;
};

struct bufq_priocscan {
	struct priocscan_queue bq_queue[PRIOCSCAN_NQUEUE];

#if defined(PRIOCSCAN_USE_GLOBAL_POSITION)
	/*
	 * XXX using a "global" head position can reduce positioning time
	 * when switching between queues, although it might hurt fairness.
	 */
	struct cscan_key bq_lastkey;
#endif
};

/*
 * how many requests to serve from a queue while requests are pending
 * on the other queues.
 *
 * XXX tune
 * be careful: making these values larger likely increases the total
 * throughput, but it can also increase latencies for some workloads.
 */
const int priocscan_burst[] = {
	64, 16, 4
};
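
/*
 * e.g. with all three queues continuously backlogged, a full round
 * serves up to 64 requests from queue 0, then up to 16 from queue 1
 * and 4 from queue 2 before the counts are reset; i.e. bandwidth is
 * split roughly 64:16:4 by request count.
 */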

static void bufq_priocscan_init(struct bufq_state *);
static void bufq_priocscan_put(struct bufq_state *, struct buf *);
static struct buf *bufq_priocscan_get(struct bufq_state *, int);

BUFQ_DEFINE(priocscan, 40, bufq_priocscan_init);
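
/*
 * the line above registers the strategy under the name "priocscan"
 * (the 40 is the strategy's rank when a default is chosen; see
 * sys/bufq_impl.h).  a driver selects it by name when allocating its
 * queue.  a minimal sketch, assuming the standard bufq(9) interface
 * and a hypothetical request "bp":
 *
 *	struct bufq_state *bufq;
 *
 *	bufq_alloc(&bufq, "priocscan", BUFQ_SORT_RAWBLOCK);
 *	bufq_put(bufq, bp);
 *	bp = bufq_get(bufq);
 *	...
 *	bufq_free(bufq);
 */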

static inline struct cscan_queue *bufq_priocscan_selectqueue(
    struct bufq_priocscan *, const struct buf *);

static inline struct cscan_queue *
bufq_priocscan_selectqueue(struct bufq_priocscan *q, const struct buf *bp)
{
	static const int priocscan_priomap[] = {
		[BPRIO_TIMENONCRITICAL] = 2,
		[BPRIO_TIMELIMITED] = 1,
		[BPRIO_TIMECRITICAL] = 0
	};

	return &q->bq_queue[priocscan_priomap[BIO_GETPRIO(bp)]].q_queue;
}

static void
bufq_priocscan_put(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_priocscan *q = bufq->bq_private;
	struct cscan_queue *cq;

	cq = bufq_priocscan_selectqueue(q, bp);
	cscan_put(cq, bp);
}

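/*
 * bufq_priocscan_get:
 *
 * select the queue to serve and return its next request, dequeueing
 * it if "remove" is set.  the highest priority non-empty queue wins,
 * but once it has used up its burst count, lower priority queues get
 * a turn so that they are not starved.
 */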
static struct buf *
bufq_priocscan_get(struct bufq_state *bufq, int remove)
{
	struct bufq_priocscan *q = bufq->bq_private;
	struct priocscan_queue *pq, *npq;
	struct priocscan_queue *first; /* highest priority non-empty queue */
	const struct priocscan_queue *epq;
	struct buf *bp;
	bool single; /* true if there's only one non-empty queue */

	/*
	 * find the highest priority non-empty queue.
	 */
	pq = &q->bq_queue[0];
	epq = pq + PRIOCSCAN_NQUEUE;
	for (; pq < epq; pq++) {
		if (!cscan_empty(&pq->q_queue)) {
			break;
		}
	}
	if (pq == epq) {
		/*
		 * all our queues are empty.  there's nothing to serve.
		 */
		return NULL;
	}
	first = pq;

	/*
	 * scan the rest of the queues.
	 *
	 * if we have two or more non-empty queues, we serve the highest
	 * priority one with a non-zero burst count.
	 */
	single = true;
	for (npq = pq + 1; npq < epq; npq++) {
		if (!cscan_empty(&npq->q_queue)) {
			/*
			 * we found another non-empty queue.
			 * it means that a queue needs to consume its burst
			 * count to be served.
			 */
			single = false;

			/*
			 * check if our current candidate queue has already
			 * exhausted its burst count.
			 */
			if (pq->q_burst > 0) {
				break;
			}
			pq = npq;
		}
	}
	if (single) {
		/*
		 * there's only one non-empty queue.
		 * just serve it without consuming its burst count.
		 */
		KASSERT(pq == first);
	} else {
		/*
		 * there are two or more non-empty queues.
		 */
		if (pq->q_burst == 0) {
			/*
			 * no queue can be served; they have all exhausted
			 * their burst counts.
			 */
			unsigned int i;
#ifdef DEBUG
			for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
				pq = &q->bq_queue[i];
				if (!cscan_empty(&pq->q_queue) && pq->q_burst) {
					panic("%s: inconsistent", __func__);
				}
			}
#endif /* DEBUG */
			/*
			 * reset the burst counts.
			 */
			if (remove) {
				for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
					pq = &q->bq_queue[i];
					pq->q_burst = priocscan_burst[i];
				}
			}

			/*
			 * serve the highest priority non-empty queue.
			 */
			pq = first;
		}
		/*
		 * consume the burst count.
		 *
		 * XXX account only by number of requests.  is it good enough?
		 */
		if (remove) {
			KASSERT(pq->q_burst > 0);
			pq->q_burst--;
		}
	}

	/*
	 * finally, get a request from the selected queue.
	 */
	KDASSERT(!cscan_empty(&pq->q_queue));
	bp = cscan_get(&pq->q_queue, remove,
#if defined(PRIOCSCAN_USE_GLOBAL_POSITION)
	    &q->bq_lastkey
#else /* defined(PRIOCSCAN_USE_GLOBAL_POSITION) */
	    &pq->q_queue.cq_lastkey
#endif /* defined(PRIOCSCAN_USE_GLOBAL_POSITION) */
	    );
	KDASSERT(bp != NULL);
	KDASSERT(&pq->q_queue == bufq_priocscan_selectqueue(q, bp));

	return bp;
}

static struct buf *
bufq_priocscan_cancel(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_priocscan * const q = bufq->bq_private;
	unsigned int i;

	for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
		struct cscan_queue * const cq = &q->bq_queue[i].q_queue;
		struct buf *it;

		/*
		 * XXX probably could be faster but the cancel functionality
		 * is not widely used anyway.
		 */
		RB_TREE_FOREACH(it, &cq->cq_buffers) {
			if (it == bp) {
				rb_tree_remove_node(&cq->cq_buffers, bp);
				return bp;
			}
		}
	}
	return NULL;
}

static void
bufq_priocscan_fini(struct bufq_state *bufq)
{

	KASSERT(bufq->bq_private != NULL);
	kmem_free(bufq->bq_private, sizeof(struct bufq_priocscan));
}

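/*
 * bufq_priocscan_init:
 *
 * strategy initialization; fill in the method table and set up the
 * per-priority cscan queues.
 */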
static void
bufq_priocscan_init(struct bufq_state *bufq)
{
	struct bufq_priocscan *q;
	const int sortby = bufq->bq_flags & BUFQ_SORT_MASK;
	unsigned int i;

	bufq->bq_get = bufq_priocscan_get;
	bufq->bq_put = bufq_priocscan_put;
	bufq->bq_cancel = bufq_priocscan_cancel;
	bufq->bq_fini = bufq_priocscan_fini;
	bufq->bq_private = kmem_zalloc(sizeof(struct bufq_priocscan), KM_SLEEP);

	q = bufq->bq_private;
	for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
		struct cscan_queue *cq = &q->bq_queue[i].q_queue;

		cscan_init(cq, sortby);
	}
}
445