/*	$NetBSD: bufq_priocscan.c,v 1.21 2017/05/04 11:03:27 kamil Exp $	*/

/*-
 * Copyright (c)2004,2005,2006,2008,2009,2011,2012 YAMAMOTO Takashi,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: bufq_priocscan.c,v 1.21 2017/05/04 11:03:27 kamil Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/bufq_impl.h>
#include <sys/kmem.h>
#include <sys/rbtree.h>
#include <sys/module.h>

#undef	PRIOCSCAN_USE_GLOBAL_POSITION

/*
 * Cyclical scan (CSCAN)
 */
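
/*
 * requests are kept in a red-black tree, ordered by cylinder/rawblkno
 * according to cq_sortby, and are served in ascending order starting
 * from the position of the last served request; when no request lies at
 * or beyond that position, service wraps around to the smallest key.
 */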

struct cscan_key {
	daddr_t	k_rawblkno;
	int k_cylinder;
};

struct cscan_queue {
	rb_tree_t cq_buffers;		/* ordered list of buffers */
#if !defined(PRIOCSCAN_USE_GLOBAL_POSITION)
	struct cscan_key cq_lastkey;	/* key of last request */
#endif /* !defined(PRIOCSCAN_USE_GLOBAL_POSITION) */
	int cq_sortby;			/* BUFQ_SORT_MASK */
	rb_tree_ops_t cq_ops;
};

static signed int
buf_cmp(const struct buf *b1, const struct buf *b2, int sortby)
{

	if (buf_inorder(b2, b1, sortby)) {
		return 1;	/* b1 > b2 */
	}
	if (buf_inorder(b1, b2, sortby)) {
		return -1;	/* b1 < b2 */
	}
	return 0;
}

/* return positive if n1 > n2 */
static signed int
cscan_tree_compare_nodes(void *context, const void *n1, const void *n2)
{
	const struct cscan_queue * const q = context;
	const struct buf * const b1 = n1;
	const struct buf * const b2 = n2;
	const int sortby = q->cq_sortby;
	const int diff = buf_cmp(b1, b2, sortby);

	if (diff != 0) {
		return diff;
	}

	/*
	 * XXX rawblkno/cylinder might not be unique.  eg. unbuffered i/o
	 */
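	/*
	 * fall back to comparing the buffer addresses themselves so that
	 * buffers with identical keys still have a total order; cscan_put()
	 * relies on rb_tree_insert_node() never finding an existing node.
	 */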
	if (b1 > b2) {
		return 1;
	}
	if (b1 < b2) {
		return -1;
	}
	return 0;
}

/* return positive if n1 > k2 */
static signed int
cscan_tree_compare_key(void *context, const void *n1, const void *k2)
{
	const struct cscan_queue * const q = context;
	const struct buf * const b1 = n1;
	const struct cscan_key * const key = k2;
	const struct buf tmp = {
		.b_rawblkno = key->k_rawblkno,
		.b_cylinder = key->k_cylinder,
	};
	const struct buf *b2 = &tmp;
	const int sortby = q->cq_sortby;

	return buf_cmp(b1, b2, sortby);
}

static void __unused
cscan_dump(struct cscan_queue *cq)
{
	const int sortby = cq->cq_sortby;
	struct buf *bp;

	RB_TREE_FOREACH(bp, &cq->cq_buffers) {
		if (sortby == BUFQ_SORT_RAWBLOCK) {
			printf(" %jd", (intmax_t)bp->b_rawblkno);
		} else {
			printf(" %jd/%jd",
			    (intmax_t)bp->b_cylinder, (intmax_t)bp->b_rawblkno);
		}
	}
}

static inline bool
cscan_empty(struct cscan_queue *q)
{

	/* XXX this might do more work than necessary */
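	/* rb_tree_iterate(t, NULL, RB_DIR_LEFT) is RB_TREE_MIN(t) */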
	return rb_tree_iterate(&q->cq_buffers, NULL, RB_DIR_LEFT) == NULL;
}

static void
cscan_put(struct cscan_queue *q, struct buf *bp)
{
	struct buf *obp __diagused;

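	/*
	 * rb_tree_insert_node() returns the already-present node if the
	 * new one collides with it; the pointer tie-break in
	 * cscan_tree_compare_nodes() makes collisions impossible, hence
	 * the assertion below.
	 */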
	obp = rb_tree_insert_node(&q->cq_buffers, bp);
	KASSERT(obp == bp); /* see cscan_tree_compare_nodes */
}

static struct buf *
cscan_get(struct cscan_queue *q, int remove, struct cscan_key *key)
{
	struct buf *bp;

	bp = rb_tree_find_node_geq(&q->cq_buffers, key);
	KDASSERT(bp == NULL || cscan_tree_compare_key(q, bp, key) >= 0);
	if (bp == NULL) {
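		/*
		 * nothing at or beyond the last position: wrap around and
		 * take the buffer with the smallest key instead.
		 */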
		bp = rb_tree_iterate(&q->cq_buffers, NULL, RB_DIR_LEFT);
		KDASSERT(cscan_tree_compare_key(q, bp, key) < 0);
	}
	if (bp != NULL && remove) {
#if defined(DEBUG)
		struct buf *nbp;
#endif /* defined(DEBUG) */

		rb_tree_remove_node(&q->cq_buffers, bp);
		/*
		 * remember the position just past this request; the next
		 * lookup starts here.  b_bcount is in bytes while the key
		 * is in DEV_BSIZE units.
		 */
		key->k_cylinder = bp->b_cylinder;
		key->k_rawblkno = bp->b_rawblkno + (bp->b_bcount >> DEV_BSHIFT);
#if defined(DEBUG)
		nbp = rb_tree_find_node_geq(&q->cq_buffers, key);
		if (nbp != NULL && cscan_tree_compare_nodes(q, nbp, bp) < 0) {
			panic("%s: wrong order %p < %p\n", __func__,
			    nbp, bp);
		}
#endif /* defined(DEBUG) */
	}
	return bp;
}

static void
cscan_init(struct cscan_queue *q, int sortby)
{
	static const rb_tree_ops_t cscan_tree_ops = {
		.rbto_compare_nodes = cscan_tree_compare_nodes,
		.rbto_compare_key = cscan_tree_compare_key,
		.rbto_node_offset = offsetof(struct buf, b_u.u_rbnode),
		.rbto_context = NULL,
	};

	q->cq_sortby = sortby;
	/* XXX copy ops to workaround rbtree.h API limitation */
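	/*
	 * each queue carries its own copy so that rbto_context can point
	 * back at the queue; the compare callbacks fetch cq_sortby from it.
	 */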
	q->cq_ops = cscan_tree_ops;
	q->cq_ops.rbto_context = q;
	rb_tree_init(&q->cq_buffers, &q->cq_ops);
}

/*
 * Per-priority CSCAN.
 *
 * XXX we should probably have a way to raise the priority of
 * requests that are already on a queue.
 */
#define	PRIOCSCAN_NQUEUE	3
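
/*
 * one CSCAN queue per buffer priority: BPRIO_TIMECRITICAL,
 * BPRIO_TIMELIMITED and BPRIO_TIMENONCRITICAL map to queues 0, 1 and 2
 * respectively (see priocscan_priomap below).
 */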

struct priocscan_queue {
	struct cscan_queue q_queue;
	unsigned int q_burst;
};

struct bufq_priocscan {
	struct priocscan_queue bq_queue[PRIOCSCAN_NQUEUE];

#if defined(PRIOCSCAN_USE_GLOBAL_POSITION)
	/*
	 * XXX using a "global" head position can reduce positioning time
	 * when switching between queues, although it might hurt fairness.
	 */
	struct cscan_key bq_lastkey;
#endif
};

/*
 * how many requests to serve from a queue while other queues still have
 * pending requests.
 *
 * XXX tune
 * be careful: while making these values larger is likely to increase
 * the total throughput, it can also increase latencies
 * for some workloads.
 */
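/*
 * indexed per queue: 64 requests per burst for the time-critical queue,
 * 16 for the time-limited queue and 4 for the time-noncritical queue
 * (cf. priocscan_priomap in bufq_priocscan_selectqueue()).
 */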
const int priocscan_burst[] = {
	64, 16, 4
};

static void bufq_priocscan_init(struct bufq_state *);
static void bufq_priocscan_put(struct bufq_state *, struct buf *);
static struct buf *bufq_priocscan_get(struct bufq_state *, int);

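/*
 * the second argument (40) ranks this strategy relative to the other
 * registered bufq strategies; bufq_alloc() can use it to pick a
 * fallback when the requested strategy is unavailable.
 */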
BUFQ_DEFINE(priocscan, 40, bufq_priocscan_init);

static inline struct cscan_queue *bufq_priocscan_selectqueue(
    struct bufq_priocscan *, const struct buf *);

static inline struct cscan_queue *
bufq_priocscan_selectqueue(struct bufq_priocscan *q, const struct buf *bp)
{
	static const int priocscan_priomap[] = {
		[BPRIO_TIMENONCRITICAL] = 2,
		[BPRIO_TIMELIMITED] = 1,
		[BPRIO_TIMECRITICAL] = 0
	};

	return &q->bq_queue[priocscan_priomap[BIO_GETPRIO(bp)]].q_queue;
}

static void
bufq_priocscan_put(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_priocscan *q = bufq_private(bufq);
	struct cscan_queue *cq;

	cq = bufq_priocscan_selectqueue(q, bp);
	cscan_put(cq, bp);
}

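/*
 * bufq_priocscan_get:
 *
 * serve the highest priority non-empty queue as long as it still has
 * burst budget left; once every non-empty queue has used up its budget,
 * refill all budgets from priocscan_burst[] and start over from the
 * highest priority queue.  a lone non-empty queue is served without
 * touching its budget at all.
 */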
static struct buf *
bufq_priocscan_get(struct bufq_state *bufq, int remove)
{
	struct bufq_priocscan *q = bufq_private(bufq);
	struct priocscan_queue *pq, *npq;
	struct priocscan_queue *first; /* highest priority non-empty queue */
	const struct priocscan_queue *epq;
	struct buf *bp;
	bool single; /* true if there's only one non-empty queue */

	/*
	 * find the highest priority non-empty queue.
	 */
	pq = &q->bq_queue[0];
	epq = pq + PRIOCSCAN_NQUEUE;
	for (; pq < epq; pq++) {
		if (!cscan_empty(&pq->q_queue)) {
			break;
		}
	}
	if (pq == epq) {
		/*
		 * all our queues are empty.  there's nothing to serve.
		 */
		return NULL;
	}
	first = pq;

	/*
	 * scan the rest of the queues.
	 *
	 * if we have two or more non-empty queues, we serve the highest
	 * priority one that still has a non-zero burst count.
	 */
	single = true;
	for (npq = pq + 1; npq < epq; npq++) {
		if (!cscan_empty(&npq->q_queue)) {
			/*
			 * we found another non-empty queue.
			 * it means that a queue needs to consume its burst
			 * count to be served.
			 */
			single = false;

			/*
			 * check if our current candidate queue has already
			 * exhausted its burst count.
			 */
			if (pq->q_burst > 0) {
				break;
			}
			pq = npq;
		}
	}
	if (single) {
		/*
		 * there's only one non-empty queue.
		 * just serve it without consuming its burst count.
		 */
		KASSERT(pq == first);
	} else {
		/*
		 * there are two or more non-empty queues.
		 */
		if (pq->q_burst == 0) {
			/*
			 * no queue can be served because every non-empty
			 * queue has already exhausted its burst count.
			 */
			unsigned int i;
#ifdef DEBUG
			for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
				pq = &q->bq_queue[i];
				if (!cscan_empty(&pq->q_queue) && pq->q_burst) {
					panic("%s: inconsist", __func__);
				}
			}
#endif /* DEBUG */
			/*
			 * reset burst counts.
			 */
			if (remove) {
				for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
					pq = &q->bq_queue[i];
					pq->q_burst = priocscan_burst[i];
				}
			}

			/*
			 * serve the highest priority non-empty queue.
			 */
			pq = first;
		}
		/*
		 * consume the burst count.
		 *
		 * XXX account only by number of requests.  is it good enough?
		 */
		if (remove) {
			KASSERT(pq->q_burst > 0);
			pq->q_burst--;
		}
	}

	/*
	 * finally, get a request from the selected queue.
	 */
	KDASSERT(!cscan_empty(&pq->q_queue));
	bp = cscan_get(&pq->q_queue, remove,
#if defined(PRIOCSCAN_USE_GLOBAL_POSITION)
	    &q->bq_lastkey
#else /* defined(PRIOCSCAN_USE_GLOBAL_POSITION) */
	    &pq->q_queue.cq_lastkey
#endif /* defined(PRIOCSCAN_USE_GLOBAL_POSITION) */
	    );
	KDASSERT(bp != NULL);
	KDASSERT(&pq->q_queue == bufq_priocscan_selectqueue(q, bp));

	return bp;
}

static struct buf *
bufq_priocscan_cancel(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_priocscan * const q = bufq_private(bufq);
	unsigned int i;

	for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
		struct cscan_queue * const cq = &q->bq_queue[i].q_queue;
		struct buf *it;

		/*
		 * XXX probably could be faster but the cancel functionality
		 * is not widely used anyway.
		 */
		RB_TREE_FOREACH(it, &cq->cq_buffers) {
			if (it == bp) {
				rb_tree_remove_node(&cq->cq_buffers, bp);
				return bp;
			}
		}
	}
	return NULL;
}

static void
bufq_priocscan_fini(struct bufq_state *bufq)
{

	KASSERT(bufq->bq_private != NULL);
	kmem_free(bufq->bq_private, sizeof(struct bufq_priocscan));
}

static void
bufq_priocscan_init(struct bufq_state *bufq)
{
	struct bufq_priocscan *q;
	const int sortby = bufq->bq_flags & BUFQ_SORT_MASK;
	unsigned int i;

	bufq->bq_get = bufq_priocscan_get;
	bufq->bq_put = bufq_priocscan_put;
	bufq->bq_cancel = bufq_priocscan_cancel;
	bufq->bq_fini = bufq_priocscan_fini;
	bufq->bq_private = kmem_zalloc(sizeof(struct bufq_priocscan), KM_SLEEP);

	q = bufq->bq_private;
	for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
		struct cscan_queue *cq = &q->bq_queue[i].q_queue;

		cscan_init(cq, sortby);
	}
}

MODULE(MODULE_CLASS_BUFQ, bufq_priocscan, NULL);

static int
bufq_priocscan_modcmd(modcmd_t cmd, void *opaque)
{

	switch (cmd) {
	case MODULE_CMD_INIT:
		return bufq_register(&bufq_strat_priocscan);
	case MODULE_CMD_FINI:
		return bufq_unregister(&bufq_strat_priocscan);
	default:
		return ENOTTY;
	}
}