/*	$NetBSD: bufq_priocscan.c,v 1.21 2017/05/04 11:03:27 kamil Exp $	*/

/*-
 * Copyright (c)2004,2005,2006,2008,2009,2011,2012 YAMAMOTO Takashi,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: bufq_priocscan.c,v 1.21 2017/05/04 11:03:27 kamil Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/bufq_impl.h>
#include <sys/kmem.h>
#include <sys/rbtree.h>
#include <sys/module.h>

#undef	PRIOCSCAN_USE_GLOBAL_POSITION

/*
 * Cyclical scan (CSCAN)
 */
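
/*
 * Requests are kept sorted by disk address and served in ascending
 * order; once the scan passes the highest pending address, it wraps
 * around to the lowest one.  Sweeping in one direction only keeps the
 * service latency more uniform across the disk than a plain elevator.
 */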

struct cscan_key {
	daddr_t	k_rawblkno;
	int k_cylinder;
};

struct cscan_queue {
	rb_tree_t cq_buffers;		/* ordered list of buffers */
#if !defined(PRIOCSCAN_USE_GLOBAL_POSITION)
	struct cscan_key cq_lastkey;	/* key of last request */
#endif /* !defined(PRIOCSCAN_USE_GLOBAL_POSITION) */
	int cq_sortby;			/* BUFQ_SORT_MASK */
	rb_tree_ops_t cq_ops;
};

static signed int
buf_cmp(const struct buf *b1, const struct buf *b2, int sortby)
{

	if (buf_inorder(b2, b1, sortby)) {
		return 1;	/* b1 > b2 */
	}
	if (buf_inorder(b1, b2, sortby)) {
		return -1;	/* b1 < b2 */
	}
	return 0;
}

/* return positive if n1 > n2 */
static signed int
cscan_tree_compare_nodes(void *context, const void *n1, const void *n2)
{
	const struct cscan_queue * const q = context;
	const struct buf * const b1 = n1;
	const struct buf * const b2 = n2;
	const int sortby = q->cq_sortby;
	const int diff = buf_cmp(b1, b2, sortby);

	if (diff != 0) {
		return diff;
	}

	/*
	 * XXX rawblkno/cylinder might not be unique.  e.g. unbuffered i/o.
	 * fall back to comparing pointers so that the tree order is total.
	 */
	if (b1 > b2) {
		return 1;
	}
	if (b1 < b2) {
		return -1;
	}
	return 0;
}

/* return positive if n1 > k2 */
static signed int
cscan_tree_compare_key(void *context, const void *n1, const void *k2)
{
	const struct cscan_queue * const q = context;
	const struct buf * const b1 = n1;
	const struct cscan_key * const key = k2;
	/* synthesize a temporary buf from the key so buf_cmp can be reused */
	const struct buf tmp = {
		.b_rawblkno = key->k_rawblkno,
		.b_cylinder = key->k_cylinder,
	};
	const struct buf *b2 = &tmp;
	const int sortby = q->cq_sortby;

	return buf_cmp(b1, b2, sortby);
}

static void __unused
cscan_dump(struct cscan_queue *cq)
{
	const int sortby = cq->cq_sortby;
	struct buf *bp;

	RB_TREE_FOREACH(bp, &cq->cq_buffers) {
		if (sortby == BUFQ_SORT_RAWBLOCK) {
			printf(" %jd", (intmax_t)bp->b_rawblkno);
		} else {
			printf(" %jd/%jd",
			    (intmax_t)bp->b_cylinder, (intmax_t)bp->b_rawblkno);
		}
	}
}

static inline bool
cscan_empty(struct cscan_queue *q)
{

	/* XXX this might do more work than necessary */
	return rb_tree_iterate(&q->cq_buffers, NULL, RB_DIR_LEFT) == NULL;
}

static void
cscan_put(struct cscan_queue *q, struct buf *bp)
{
	struct buf *obp __diagused;

	obp = rb_tree_insert_node(&q->cq_buffers, bp);
	KASSERT(obp == bp); /* see cscan_tree_compare_nodes */
}

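/*
 * cscan_get: return the buffer at or after the position in *key,
 * wrapping around to the start of the scan if there is none.  if
 * "remove" is set, also take the buffer off the queue and advance
 * *key past it so that the next call continues the sweep.
 */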
static struct buf *
cscan_get(struct cscan_queue *q, int remove, struct cscan_key *key)
{
	struct buf *bp;

	bp = rb_tree_find_node_geq(&q->cq_buffers, key);
	KDASSERT(bp == NULL || cscan_tree_compare_key(q, bp, key) >= 0);
	if (bp == NULL) {
		bp = rb_tree_iterate(&q->cq_buffers, NULL, RB_DIR_LEFT);
		KDASSERT(bp == NULL ||
		    cscan_tree_compare_key(q, bp, key) < 0);
	}
	if (bp != NULL && remove) {
#if defined(DEBUG)
		struct buf *nbp;
#endif /* defined(DEBUG) */

		rb_tree_remove_node(&q->cq_buffers, bp);
		/*
		 * remember the head position.
		 */
		key->k_cylinder = bp->b_cylinder;
		key->k_rawblkno = bp->b_rawblkno + (bp->b_bcount >> DEV_BSHIFT);
#if defined(DEBUG)
		nbp = rb_tree_find_node_geq(&q->cq_buffers, key);
		if (nbp != NULL && cscan_tree_compare_nodes(q, nbp, bp) < 0) {
			panic("%s: wrong order %p < %p\n", __func__,
			    nbp, bp);
		}
#endif /* defined(DEBUG) */
	}
	return bp;
}

static void
cscan_init(struct cscan_queue *q, int sortby)
{
	static const rb_tree_ops_t cscan_tree_ops = {
		.rbto_compare_nodes = cscan_tree_compare_nodes,
		.rbto_compare_key = cscan_tree_compare_key,
		.rbto_node_offset = offsetof(struct buf, b_u.u_rbnode),
		.rbto_context = NULL,
	};

	q->cq_sortby = sortby;
	/*
	 * XXX copy the ops to work around an rbtree.h API limitation:
	 * the context pointer lives in the ops, so each queue needs its
	 * own copy pointing back at itself.
	 */
	q->cq_ops = cscan_tree_ops;
	q->cq_ops.rbto_context = q;
	rb_tree_init(&q->cq_buffers, &q->cq_ops);
}

/*
 * Per-priority CSCAN.
 *
 * XXX probably we should have a way to raise
 * the priority of on-queue requests.
 */
#define	PRIOCSCAN_NQUEUE	3

struct priocscan_queue {
	struct cscan_queue q_queue;	/* requests of this priority */
	unsigned int q_burst;		/* remaining burst quota */
};

struct bufq_priocscan {
	struct priocscan_queue bq_queue[PRIOCSCAN_NQUEUE];

#if defined(PRIOCSCAN_USE_GLOBAL_POSITION)
	/*
	 * XXX using a "global" head position can reduce positioning time
	 * when switching between queues, although it might hurt fairness.
	 */
	struct cscan_key bq_lastkey;
#endif
};

/*
 * how many requests to serve from a queue while requests are pending
 * on the other queues.
 *
 * XXX tune
 * be careful: while making these values larger likely increases
 * total throughput, it can also increase latencies for some workloads.
 */
const int priocscan_burst[] = {
	64, 16, 4
};
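
/*
 * e.g. with bursts of {64, 16, 4} and all three queues continuously
 * backed up, one full cycle serves 64 time-critical, 16 time-limited
 * and 4 non-critical requests before the counts are reset, i.e. the
 * highest priority class gets roughly 76% of the request slots.
 */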

static void bufq_priocscan_init(struct bufq_state *);
static void bufq_priocscan_put(struct bufq_state *, struct buf *);
static struct buf *bufq_priocscan_get(struct bufq_state *, int);

BUFQ_DEFINE(priocscan, 40, bufq_priocscan_init);
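
/*
 * note: the second argument to BUFQ_DEFINE ranks this strategy when a
 * default strategy is chosen at bufq_alloc() time.
 */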

static inline struct cscan_queue *bufq_priocscan_selectqueue(
    struct bufq_priocscan *, const struct buf *);

static inline struct cscan_queue *
bufq_priocscan_selectqueue(struct bufq_priocscan *q, const struct buf *bp)
{
	static const int priocscan_priomap[] = {
		[BPRIO_TIMENONCRITICAL] = 2,
		[BPRIO_TIMELIMITED] = 1,
		[BPRIO_TIMECRITICAL] = 0
	};

	return &q->bq_queue[priocscan_priomap[BIO_GETPRIO(bp)]].q_queue;
}

static void
bufq_priocscan_put(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_priocscan *q = bufq_private(bufq);
	struct cscan_queue *cq;

	cq = bufq_priocscan_selectqueue(q, bp);
	cscan_put(cq, bp);
}

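/*
 * bufq_priocscan_get: choose which priority queue to serve, consuming
 * burst counts when several queues are backed up, and then hand out
 * the next request from that queue in CSCAN order.
 */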
static struct buf *
bufq_priocscan_get(struct bufq_state *bufq, int remove)
{
	struct bufq_priocscan *q = bufq_private(bufq);
	struct priocscan_queue *pq, *npq;
	struct priocscan_queue *first; /* highest priority non-empty queue */
	const struct priocscan_queue *epq;
	struct buf *bp;
	bool single; /* true if there's only one non-empty queue */

	/*
	 * find the highest priority non-empty queue.
	 */
	pq = &q->bq_queue[0];
	epq = pq + PRIOCSCAN_NQUEUE;
	for (; pq < epq; pq++) {
		if (!cscan_empty(&pq->q_queue)) {
			break;
		}
	}
	if (pq == epq) {
		/*
		 * all our queues are empty.  there's nothing to serve.
		 */
		return NULL;
	}
	first = pq;

	/*
	 * scan the rest of the queues.
	 *
	 * if we have two or more non-empty queues, we serve the highest
	 * priority one that still has a non-zero burst count.
	 */
	single = true;
	for (npq = pq + 1; npq < epq; npq++) {
		if (!cscan_empty(&npq->q_queue)) {
			/*
			 * we found another non-empty queue.
			 * it means that a queue needs to consume its burst
			 * count to be served.
			 */
			single = false;

			/*
			 * check if our current candidate queue has already
			 * exhausted its burst count.
			 */
			if (pq->q_burst > 0) {
				break;
			}
			pq = npq;
		}
	}
	if (single) {
		/*
		 * there's only one non-empty queue.
		 * just serve it without consuming its burst count.
		 */
		KASSERT(pq == first);
	} else {
		/*
		 * there are two or more non-empty queues.
		 */
		if (pq->q_burst == 0) {
			/*
			 * no queue can be served because they have all
			 * exhausted their burst counts.
			 */
			unsigned int i;
#ifdef DEBUG
			for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
				pq = &q->bq_queue[i];
				if (!cscan_empty(&pq->q_queue) && pq->q_burst) {
					panic("%s: inconsistent burst counts",
					    __func__);
				}
			}
#endif /* DEBUG */
			/*
			 * reset burst counts.
			 */
			if (remove) {
				for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
					pq = &q->bq_queue[i];
					pq->q_burst = priocscan_burst[i];
				}
			}

			/*
			 * serve the highest priority non-empty queue.
			 */
			pq = first;
		}
		/*
		 * consume the burst count.
		 *
		 * XXX account only by number of requests.  is it good enough?
		 */
		if (remove) {
			KASSERT(pq->q_burst > 0);
			pq->q_burst--;
		}
	}

	/*
	 * finally, get a request from the selected queue.
	 */
	KDASSERT(!cscan_empty(&pq->q_queue));
	bp = cscan_get(&pq->q_queue, remove,
#if defined(PRIOCSCAN_USE_GLOBAL_POSITION)
	    &q->bq_lastkey
#else /* defined(PRIOCSCAN_USE_GLOBAL_POSITION) */
	    &pq->q_queue.cq_lastkey
#endif /* defined(PRIOCSCAN_USE_GLOBAL_POSITION) */
	    );
	KDASSERT(bp != NULL);
	KDASSERT(&pq->q_queue == bufq_priocscan_selectqueue(q, bp));

	return bp;
}


static struct buf *
bufq_priocscan_cancel(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_priocscan * const q = bufq_private(bufq);
	unsigned int i;

	for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
		struct cscan_queue * const cq = &q->bq_queue[i].q_queue;
		struct buf *it;

		/*
		 * XXX probably could be faster but the cancel functionality
		 * is not widely used anyway.
		 */
		RB_TREE_FOREACH(it, &cq->cq_buffers) {
			if (it == bp) {
				rb_tree_remove_node(&cq->cq_buffers, bp);
				return bp;
			}
		}
	}
	return NULL;
}

static void
bufq_priocscan_fini(struct bufq_state *bufq)
{

	KASSERT(bufq->bq_private != NULL);
	kmem_free(bufq->bq_private, sizeof(struct bufq_priocscan));
}

static void
bufq_priocscan_init(struct bufq_state *bufq)
{
	struct bufq_priocscan *q;
	const int sortby = bufq->bq_flags & BUFQ_SORT_MASK;
	unsigned int i;

	bufq->bq_get = bufq_priocscan_get;
	bufq->bq_put = bufq_priocscan_put;
	bufq->bq_cancel = bufq_priocscan_cancel;
	bufq->bq_fini = bufq_priocscan_fini;
	bufq->bq_private = kmem_zalloc(sizeof(struct bufq_priocscan), KM_SLEEP);

	q = bufq->bq_private;
	for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
		struct cscan_queue *cq = &q->bq_queue[i].q_queue;

		cscan_init(cq, sortby);
	}
}

MODULE(MODULE_CLASS_BUFQ, bufq_priocscan, NULL);

static int
bufq_priocscan_modcmd(modcmd_t cmd, void *opaque)
{

	switch (cmd) {
	case MODULE_CMD_INIT:
		return bufq_register(&bufq_strat_priocscan);
	case MODULE_CMD_FINI:
		return bufq_unregister(&bufq_strat_priocscan);
	default:
		return ENOTTY;
	}
}