xref: /dflybsd-src/sys/net/altq/altq_fairq.c (revision ca86d83e7d8d6bfef814ef3683c37d99ad62f11c)
1 /*
2  * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/net/altq/altq_fairq.c,v 1.2 2008/05/14 11:59:23 sephe Exp $
35  */
36 /*
37  * Matt: I gutted altq_priq.c and used it as a skeleton on which to build
38  * fairq.  The fairq algorithm is completely different than priq, of course,
39  * but because I used priq's skeleton I believe I should include priq's
40  * copyright.
41  *
42  * Copyright (C) 2000-2003
43  *	Sony Computer Science Laboratories Inc.  All rights reserved.
44  *
45  * Redistribution and use in source and binary forms, with or without
46  * modification, are permitted provided that the following conditions
47  * are met:
48  * 1. Redistributions of source code must retain the above copyright
49  *    notice, this list of conditions and the following disclaimer.
50  * 2. Redistributions in binary form must reproduce the above copyright
51  *    notice, this list of conditions and the following disclaimer in the
52  *    documentation and/or other materials provided with the distribution.
53  *
54  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  */
66 
67 /*
68  * FAIRQ - take traffic classified by keep state (hashed into
69  *	   pf->state_hash) and bucketize it.  Fairly extract
70  *	   the first packet from each bucket in a round-robin fashion.
71  *
72  * TODO - better overall qlimit support (right now it is per-bucket).
73  *	- NOTE: red etc is per bucket, not overall.
74  *	- better service curve support.
75  *
76  * EXAMPLE:
77  *
78  *  altq on em0 fairq bandwidth 650Kb queue { std, bulk }
79  *  queue std  priority 3 bandwidth 200Kb \
80  *	fairq (buckets 64, default, hogs 1Kb) qlimit 50
81  *  queue bulk priority 2 bandwidth 100Kb \
82  *	fairq (buckets 64, hogs 1Kb) qlimit 50
83  *
84  *	NOTE: When the aggregate bandwidth is less than the link bandwidth
85  *	      any remaining bandwidth is dynamically assigned using the
86  *	      existing bandwidth specs as weightings.
87  *
88  *  pass out on em0 from any to any keep state queue std
89  *  pass out on em0 inet proto tcp ..... port ... keep state queue bulk
90  */
91 #include "opt_altq.h"
92 #include "opt_inet.h"
93 #include "opt_inet6.h"
94 
95 #ifdef ALTQ_FAIRQ  /* fairq is enabled in the kernel conf */
96 
97 #include <sys/param.h>
98 #include <sys/malloc.h>
99 #include <sys/mbuf.h>
100 #include <sys/socket.h>
101 #include <sys/sockio.h>
102 #include <sys/systm.h>
103 #include <sys/proc.h>
104 #include <sys/errno.h>
105 #include <sys/kernel.h>
106 #include <sys/queue.h>
107 #include <sys/thread.h>
108 
109 #include <net/if.h>
110 #include <net/ifq_var.h>
111 #include <netinet/in.h>
112 
113 #include <net/pf/pfvar.h>
114 #include <net/altq/altq.h>
115 #include <net/altq/altq_fairq.h>
116 
117 #include <sys/thread2.h>
118 
/*
 * fairq only operates on one subqueue of the interface send queue,
 * the one at index ALTQ_SUBQ_INDEX_DEFAULT.  The enqueue, dequeue and
 * request handlers in this file check the subqueue index against
 * FAIRQ_SUBQ_INDEX and fall back to a classic purge for any other one.
 */
#define FAIRQ_SUBQ_INDEX	ALTQ_SUBQ_INDEX_DEFAULT
/*
 * Apply ALTQ_SQ_LOCK / ALTQ_SQ_UNLOCK to the fairq subqueue of the
 * struct ifaltq pointed to by 'ifq'.
 */
#define FAIRQ_LOCK(ifq) \
    ALTQ_SQ_LOCK(&(ifq)->altq_subq[FAIRQ_SUBQ_INDEX])
#define FAIRQ_UNLOCK(ifq) \
    ALTQ_SQ_UNLOCK(&(ifq)->altq_subq[FAIRQ_SUBQ_INDEX])
124 
/*
 * Prototypes for the file-local functions.
 */

/* interface / class management and the handlers given to altq_attach() */
static int	fairq_clear_interface(struct fairq_if *);
static int	fairq_request(struct ifaltq_subque *, int, void *);
static void	fairq_purge(struct fairq_if *);
static struct fairq_class *fairq_class_create(struct fairq_if *, int,
					int, u_int, struct fairq_opts *, int);
static int	fairq_class_destroy(struct fairq_class *);
static int	fairq_enqueue(struct ifaltq_subque *, struct mbuf *,
					struct altq_pktattr *);
static struct mbuf *fairq_dequeue(struct ifaltq_subque *, struct mbuf *, int);

/* per-class bucket handling */
static int	fairq_addq(struct fairq_class *, struct mbuf *, int hash);
static struct mbuf *fairq_getq(struct fairq_class *, uint64_t);
static struct mbuf *fairq_pollq(struct fairq_class *, uint64_t, int *);
static fairq_bucket_t *fairq_selectq(struct fairq_class *, int);
static void	fairq_purgeq(struct fairq_class *);

/* statistics export and class-handle lookup */
static void	get_class_stats(struct fairq_classstats *,
					struct fairq_class *);
static struct fairq_class *clh_to_clp(struct fairq_if *, uint32_t);
147 
148 int
149 fairq_pfattach(struct pf_altq *a, struct ifaltq *ifq)
150 {
151 	return altq_attach(ifq, ALTQT_FAIRQ, a->altq_disc, ifq_mapsubq_default,
152 	    fairq_enqueue, fairq_dequeue, fairq_request, NULL, NULL);
153 }
154 
155 int
156 fairq_add_altq(struct pf_altq *a)
157 {
158 	struct fairq_if *pif;
159 	struct ifnet *ifp;
160 
161 	if ((ifp = ifunit(a->ifname)) == NULL)
162 		return (EINVAL);
163 	if (!ifq_is_ready(&ifp->if_snd))
164 		return (ENODEV);
165 
166 	pif = kmalloc(sizeof(*pif), M_ALTQ, M_WAITOK | M_ZERO);
167 	pif->pif_bandwidth = a->ifbandwidth;
168 	pif->pif_maxpri = -1;
169 	pif->pif_ifq = &ifp->if_snd;
170 	ifq_purge_all(&ifp->if_snd);
171 
172 	/* keep the state in pf_altq */
173 	a->altq_disc = pif;
174 
175 	return (0);
176 }
177 
178 int
179 fairq_remove_altq(struct pf_altq *a)
180 {
181 	struct fairq_if *pif;
182 
183 	if ((pif = a->altq_disc) == NULL)
184 		return (EINVAL);
185 	a->altq_disc = NULL;
186 
187 	fairq_clear_interface(pif);
188 
189 	kfree(pif, M_ALTQ);
190 	return (0);
191 }
192 
193 static int
194 fairq_add_queue_locked(struct pf_altq *a, struct fairq_if *pif)
195 {
196 	struct fairq_class *cl;
197 
198 	KKASSERT(a->priority < FAIRQ_MAXPRI);
199 	KKASSERT(a->qid != 0);
200 
201 	if (pif->pif_classes[a->priority] != NULL)
202 		return (EBUSY);
203 	if (clh_to_clp(pif, a->qid) != NULL)
204 		return (EBUSY);
205 
206 	cl = fairq_class_create(pif, a->priority, a->qlimit, a->bandwidth,
207 			       &a->pq_u.fairq_opts, a->qid);
208 	if (cl == NULL)
209 		return (ENOMEM);
210 
211 	return (0);
212 }
213 
214 int
215 fairq_add_queue(struct pf_altq *a)
216 {
217 	struct fairq_if *pif;
218 	struct ifaltq *ifq;
219 	int error;
220 
221 	/* check parameters */
222 	if (a->priority >= FAIRQ_MAXPRI)
223 		return (EINVAL);
224 	if (a->qid == 0)
225 		return (EINVAL);
226 
227 	/* XXX not MP safe */
228 	if ((pif = a->altq_disc) == NULL)
229 		return (EINVAL);
230 	ifq = pif->pif_ifq;
231 
232 	FAIRQ_LOCK(ifq);
233 	error = fairq_add_queue_locked(a, pif);
234 	FAIRQ_UNLOCK(ifq);
235 
236 	return error;
237 }
238 
239 static int
240 fairq_remove_queue_locked(struct pf_altq *a, struct fairq_if *pif)
241 {
242 	struct fairq_class *cl;
243 
244 	if ((cl = clh_to_clp(pif, a->qid)) == NULL)
245 		return (EINVAL);
246 
247 	return (fairq_class_destroy(cl));
248 }
249 
250 int
251 fairq_remove_queue(struct pf_altq *a)
252 {
253 	struct fairq_if *pif;
254 	struct ifaltq *ifq;
255 	int error;
256 
257 	/* XXX not MP safe */
258 	if ((pif = a->altq_disc) == NULL)
259 		return (EINVAL);
260 	ifq = pif->pif_ifq;
261 
262 	FAIRQ_LOCK(ifq);
263 	error = fairq_remove_queue_locked(a, pif);
264 	FAIRQ_UNLOCK(ifq);
265 
266 	return error;
267 }
268 
269 int
270 fairq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes)
271 {
272 	struct fairq_if *pif;
273 	struct fairq_class *cl;
274 	struct fairq_classstats stats;
275 	struct ifaltq *ifq;
276 	int error = 0;
277 
278 	if (*nbytes < sizeof(stats))
279 		return (EINVAL);
280 
281 	/* XXX not MP safe */
282 	if ((pif = altq_lookup(a->ifname, ALTQT_FAIRQ)) == NULL)
283 		return (EBADF);
284 	ifq = pif->pif_ifq;
285 
286 	FAIRQ_LOCK(ifq);
287 
288 	if ((cl = clh_to_clp(pif, a->qid)) == NULL) {
289 		FAIRQ_UNLOCK(ifq);
290 		return (EINVAL);
291 	}
292 
293 	get_class_stats(&stats, cl);
294 
295 	FAIRQ_UNLOCK(ifq);
296 
297 	if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0)
298 		return (error);
299 	*nbytes = sizeof(stats);
300 	return (0);
301 }
302 
303 /*
304  * bring the interface back to the initial state by discarding
305  * all the filters and classes.
306  */
307 static int
308 fairq_clear_interface(struct fairq_if *pif)
309 {
310 	struct fairq_class *cl;
311 	int pri;
312 
313 	/* clear out the classes */
314 	for (pri = 0; pri <= pif->pif_maxpri; pri++) {
315 		if ((cl = pif->pif_classes[pri]) != NULL)
316 			fairq_class_destroy(cl);
317 	}
318 
319 	return (0);
320 }
321 
322 static int
323 fairq_request(struct ifaltq_subque *ifsq, int req, void *arg)
324 {
325 	struct ifaltq *ifq = ifsq->ifsq_altq;
326 	struct fairq_if *pif = (struct fairq_if *)ifq->altq_disc;
327 
328 	crit_enter();
329 	switch (req) {
330 	case ALTRQ_PURGE:
331 		if (ifsq_get_index(ifsq) == FAIRQ_SUBQ_INDEX) {
332 			fairq_purge(pif);
333 		} else {
334 			/*
335 			 * Race happened, the unrelated subqueue was
336 			 * picked during the packet scheduler transition.
337 			 */
338 			ifsq_classic_request(ifsq, ALTRQ_PURGE, NULL);
339 		}
340 		break;
341 	}
342 	crit_exit();
343 	return (0);
344 }
345 
346 /* discard all the queued packets on the interface */
347 static void
348 fairq_purge(struct fairq_if *pif)
349 {
350 	struct fairq_class *cl;
351 	int pri;
352 
353 	for (pri = 0; pri <= pif->pif_maxpri; pri++) {
354 		if ((cl = pif->pif_classes[pri]) != NULL && cl->cl_head)
355 			fairq_purgeq(cl);
356 	}
357 	if (ifq_is_enabled(pif->pif_ifq))
358 		pif->pif_ifq->altq_subq[FAIRQ_SUBQ_INDEX].ifq_len = 0;
359 }
360 
361 static struct fairq_class *
362 fairq_class_create(struct fairq_if *pif, int pri, int qlimit,
363 		   u_int bandwidth, struct fairq_opts *opts, int qid)
364 {
365 	struct fairq_class *cl;
366 	int flags = opts->flags;
367 	u_int nbuckets = opts->nbuckets;
368 	int i;
369 
370 #ifndef ALTQ_RED
371 	if (flags & FARF_RED) {
372 #ifdef ALTQ_DEBUG
373 		kprintf("fairq_class_create: RED not configured for FAIRQ!\n");
374 #endif
375 		return (NULL);
376 	}
377 #endif
378 	if (nbuckets == 0)
379 		nbuckets = 256;
380 	if (nbuckets > FAIRQ_MAX_BUCKETS)
381 		nbuckets = FAIRQ_MAX_BUCKETS;
382 	/* enforce power-of-2 size */
383 	while ((nbuckets ^ (nbuckets - 1)) != ((nbuckets << 1) - 1))
384 		++nbuckets;
385 
386 	if ((cl = pif->pif_classes[pri]) != NULL) {
387 		/* modify the class instead of creating a new one */
388 		crit_enter();
389 		if (cl->cl_head)
390 			fairq_purgeq(cl);
391 		crit_exit();
392 #ifdef ALTQ_RIO
393 		if (cl->cl_qtype == Q_RIO)
394 			rio_destroy((rio_t *)cl->cl_red);
395 #endif
396 #ifdef ALTQ_RED
397 		if (cl->cl_qtype == Q_RED)
398 			red_destroy(cl->cl_red);
399 #endif
400 	} else {
401 		cl = kmalloc(sizeof(*cl), M_ALTQ, M_WAITOK | M_ZERO);
402 		cl->cl_nbuckets = nbuckets;
403 		cl->cl_nbucket_mask = nbuckets - 1;
404 
405 		cl->cl_buckets = kmalloc(sizeof(*cl->cl_buckets) *
406 					 cl->cl_nbuckets,
407 					 M_ALTQ, M_WAITOK | M_ZERO);
408 		cl->cl_head = NULL;
409 	}
410 
411 	pif->pif_classes[pri] = cl;
412 	if (flags & FARF_DEFAULTCLASS)
413 		pif->pif_default = cl;
414 	if (qlimit == 0)
415 		qlimit = 50;  /* use default */
416 	cl->cl_qlimit = qlimit;
417 	for (i = 0; i < cl->cl_nbuckets; ++i) {
418 		qlimit(&cl->cl_buckets[i].queue) = qlimit;
419 	}
420 	cl->cl_bandwidth = bandwidth / 8;	/* cvt to bytes per second */
421 	cl->cl_qtype = Q_DROPTAIL;
422 	cl->cl_flags = flags & FARF_USERFLAGS;
423 	cl->cl_pri = pri;
424 	if (pri > pif->pif_maxpri)
425 		pif->pif_maxpri = pri;
426 	cl->cl_pif = pif;
427 	cl->cl_handle = qid;
428 	cl->cl_hogs_m1 = opts->hogs_m1 / 8;
429 	cl->cl_lssc_m1 = opts->lssc_m1 / 8;	/* NOT YET USED */
430 	cl->cl_bw_current = 0;
431 
432 #ifdef ALTQ_RED
433 	if (flags & (FARF_RED|FARF_RIO)) {
434 		int red_flags, red_pkttime;
435 
436 		red_flags = 0;
437 		if (flags & FARF_ECN)
438 			red_flags |= REDF_ECN;
439 #ifdef ALTQ_RIO
440 		if (flags & FARF_CLEARDSCP)
441 			red_flags |= RIOF_CLEARDSCP;
442 #endif
443 		if (pif->pif_bandwidth < 8)
444 			red_pkttime = 1000 * 1000 * 1000; /* 1 sec */
445 		else
446 			red_pkttime = (int64_t)pif->pif_ifq->altq_ifp->if_mtu
447 			  * 1000 * 1000 * 1000 / (pif->pif_bandwidth / 8);
448 #ifdef ALTQ_RIO
449 		if (flags & FARF_RIO) {
450 			cl->cl_red = (red_t *)rio_alloc(0, NULL,
451 						red_flags, red_pkttime);
452 			if (cl->cl_red != NULL)
453 				cl->cl_qtype = Q_RIO;
454 		} else
455 #endif
456 		if (flags & FARF_RED) {
457 			cl->cl_red = red_alloc(0, 0,
458 			    cl->cl_qlimit * 10/100,
459 			    cl->cl_qlimit * 30/100,
460 			    red_flags, red_pkttime);
461 			if (cl->cl_red != NULL)
462 				cl->cl_qtype = Q_RED;
463 		}
464 	}
465 #endif /* ALTQ_RED */
466 
467 	return (cl);
468 }
469 
470 static int
471 fairq_class_destroy(struct fairq_class *cl)
472 {
473 	struct fairq_if *pif;
474 	int pri;
475 
476 	crit_enter();
477 
478 	if (cl->cl_head)
479 		fairq_purgeq(cl);
480 
481 	pif = cl->cl_pif;
482 	pif->pif_classes[cl->cl_pri] = NULL;
483 	if (pif->pif_poll_cache == cl)
484 		pif->pif_poll_cache = NULL;
485 	if (pif->pif_maxpri == cl->cl_pri) {
486 		for (pri = cl->cl_pri; pri >= 0; pri--)
487 			if (pif->pif_classes[pri] != NULL) {
488 				pif->pif_maxpri = pri;
489 				break;
490 			}
491 		if (pri < 0)
492 			pif->pif_maxpri = -1;
493 	}
494 	crit_exit();
495 
496 	if (cl->cl_red != NULL) {
497 #ifdef ALTQ_RIO
498 		if (cl->cl_qtype == Q_RIO)
499 			rio_destroy((rio_t *)cl->cl_red);
500 #endif
501 #ifdef ALTQ_RED
502 		if (cl->cl_qtype == Q_RED)
503 			red_destroy(cl->cl_red);
504 #endif
505 	}
506 	kfree(cl->cl_buckets, M_ALTQ);
507 	cl->cl_head = NULL;	/* sanity */
508 	cl->cl_polled = NULL;	/* sanity */
509 	cl->cl_buckets = NULL;	/* sanity */
510 	kfree(cl, M_ALTQ);
511 
512 	return (0);
513 }
514 
515 /*
516  * fairq_enqueue is an enqueue function to be registered to
517  * (*altq_enqueue) in struct ifaltq.
518  */
519 static int
520 fairq_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m,
521     struct altq_pktattr *pktattr)
522 {
523 	struct ifaltq *ifq = ifsq->ifsq_altq;
524 	struct fairq_if *pif = (struct fairq_if *)ifq->altq_disc;
525 	struct fairq_class *cl;
526 	int error;
527 	int len;
528 	int hash;
529 
530 	if (ifsq_get_index(ifsq) != FAIRQ_SUBQ_INDEX) {
531 		/*
532 		 * Race happened, the unrelated subqueue was
533 		 * picked during the packet scheduler transition.
534 		 */
535 		ifsq_classic_request(ifsq, ALTRQ_PURGE, NULL);
536 		m_freem(m);
537 		return ENOBUFS;
538 	}
539 
540 	crit_enter();
541 
542 	/* grab class set by classifier */
543 	if ((m->m_flags & M_PKTHDR) == 0) {
544 		/* should not happen */
545 		if_printf(ifq->altq_ifp, "altq: packet does not have pkthdr\n");
546 		m_freem(m);
547 		error = ENOBUFS;
548 		goto done;
549 	}
550 
551 	if (m->m_pkthdr.fw_flags & PF_MBUF_STRUCTURE) {
552 		cl = clh_to_clp(pif, m->m_pkthdr.pf.qid);
553 		if (m->m_pkthdr.pf.flags & PF_TAG_STATE_HASHED)
554 			hash = (int)m->m_pkthdr.pf.state_hash;
555 		else
556 			hash = 0;
557 	} else {
558 		cl = NULL;
559 		hash = 0;
560 	}
561 	if (cl == NULL) {
562 		cl = pif->pif_default;
563 		if (cl == NULL) {
564 			m_freem(m);
565 			error = ENOBUFS;
566 			goto done;
567 		}
568 	}
569 	cl->cl_flags |= FARF_HAS_PACKETS;
570 	cl->cl_pktattr = NULL;
571 	len = m_pktlen(m);
572 	if (fairq_addq(cl, m, hash) != 0) {
573 		/* drop occurred.  mbuf was freed in fairq_addq. */
574 		PKTCNTR_ADD(&cl->cl_dropcnt, len);
575 		error = ENOBUFS;
576 		goto done;
577 	}
578 	ifsq->ifq_len++;
579 	error = 0;
580 done:
581 	crit_exit();
582 	return (error);
583 }
584 
585 /*
586  * fairq_dequeue is a dequeue function to be registered to
587  * (*altq_dequeue) in struct ifaltq.
588  *
589  * note: ALTDQ_POLL returns the next packet without removing the packet
590  *	from the queue.  ALTDQ_REMOVE is a normal dequeue operation.
591  *	ALTDQ_REMOVE must return the same packet if called immediately
592  *	after ALTDQ_POLL.
593  */
594 static struct mbuf *
595 fairq_dequeue(struct ifaltq_subque *ifsq, struct mbuf *mpolled, int op)
596 {
597 	struct ifaltq *ifq = ifsq->ifsq_altq;
598 	struct fairq_if *pif = (struct fairq_if *)ifq->altq_disc;
599 	struct fairq_class *cl;
600 	struct fairq_class *best_cl;
601 	struct mbuf *best_m;
602 	struct mbuf *m;
603 	uint64_t cur_time = read_machclk();
604 	u_int best_scale;
605 	u_int scale;
606 	int pri;
607 	int hit_limit;
608 
609 	if (ifsq_get_index(ifsq) != FAIRQ_SUBQ_INDEX) {
610 		/*
611 		 * Race happened, the unrelated subqueue was
612 		 * picked during the packet scheduler transition.
613 		 */
614 		ifsq_classic_request(ifsq, ALTRQ_PURGE, NULL);
615 		return NULL;
616 	}
617 
618 	if (ifsq_is_empty(ifsq)) {
619 		/* no packet in the queue */
620 		KKASSERT(mpolled == NULL);
621 		return (NULL);
622 	}
623 
624 	crit_enter();
625 	if (pif->pif_poll_cache && op == ALTDQ_REMOVE) {
626 		best_cl = pif->pif_poll_cache;
627 		m = fairq_getq(best_cl, cur_time);
628 		pif->pif_poll_cache = NULL;
629 		if (m) {
630 			ifsq->ifq_len--;
631 			PKTCNTR_ADD(&best_cl->cl_xmitcnt, m_pktlen(m));
632 		}
633 	} else {
634 		best_cl = NULL;
635 		best_m = NULL;
636 		best_scale = 0xFFFFFFFFU;
637 
638 		for (pri = pif->pif_maxpri;  pri >= 0; pri--) {
639 			if ((cl = pif->pif_classes[pri]) == NULL)
640 				continue;
641 			if ((cl->cl_flags & FARF_HAS_PACKETS) == 0)
642 				continue;
643 			m = fairq_pollq(cl, cur_time, &hit_limit);
644 			if (m == NULL) {
645 				cl->cl_flags &= ~FARF_HAS_PACKETS;
646 				continue;
647 			}
648 
649 			/*
650 			 * We can halt the search immediately if the queue
651 			 * did not hit its bandwidth limit.
652 			 */
653 			if (hit_limit == 0) {
654 				best_cl = cl;
655 				best_m = m;
656 				break;
657 			}
658 
659 			/*
660 			 * Otherwise calculate the scale factor and select
661 			 * the queue with the lowest scale factor.  This
662 			 * apportions any unused bandwidth weighted by
663 			 * the relative bandwidth specification.
664 			 */
665 			scale = cl->cl_bw_current * 100 / cl->cl_bandwidth;
666 			if (scale < best_scale) {
667 				best_cl = cl;
668 				best_m = m;
669 				best_scale = scale;
670 			}
671 		}
672 
673 		if (op == ALTDQ_POLL) {
674 #ifdef foo
675 			/*
676 			 * Don't use poll cache; the poll/dequeue
677 			 * model is no longer applicable to SMP
678 			 * system.  e.g.
679 			 *    CPU-A            CPU-B
680 			 *      :                :
681 			 *    poll               :
682 			 *      :              poll
683 			 *    dequeue (+)        :
684 			 *
685 			 * The dequeue at (+) will hit the poll
686 			 * cache set by CPU-B.
687 			 */
688 			pif->pif_poll_cache = best_cl;
689 #endif
690 			m = best_m;
691 		} else if (best_cl) {
692 			m = fairq_getq(best_cl, cur_time);
693 			KKASSERT(best_m == m);
694 			ifsq->ifq_len--;
695 			PKTCNTR_ADD(&best_cl->cl_xmitcnt, m_pktlen(m));
696 		} else {
697 			m = NULL;
698 		}
699 	}
700 	crit_exit();
701 	KKASSERT(mpolled == NULL || mpolled == m);
702 	return (m);
703 }
704 
705 static int
706 fairq_addq(struct fairq_class *cl, struct mbuf *m, int hash)
707 {
708 	fairq_bucket_t *b;
709 	u_int hindex;
710 	uint64_t bw;
711 
712 	/*
713 	 * If the packet doesn't have any keep state put it on the end of
714 	 * our queue.  XXX this can result in out of order delivery.
715 	 */
716 	if (hash == 0) {
717 		if (cl->cl_head)
718 			b = cl->cl_head->prev;
719 		else
720 			b = &cl->cl_buckets[0];
721 	} else {
722 		hindex = hash & cl->cl_nbucket_mask;
723 		b = &cl->cl_buckets[hindex];
724 	}
725 
726 	/*
727 	 * Add the bucket to the end of the circular list of active buckets.
728 	 *
729 	 * As a special case we add the bucket to the beginning of the list
730 	 * instead of the end if it was not previously on the list and if
731 	 * its traffic is less then the hog level.
732 	 */
733 	if (b->in_use == 0) {
734 		b->in_use = 1;
735 		if (cl->cl_head == NULL) {
736 			cl->cl_head = b;
737 			b->next = b;
738 			b->prev = b;
739 		} else {
740 			b->next = cl->cl_head;
741 			b->prev = cl->cl_head->prev;
742 			b->prev->next = b;
743 			b->next->prev = b;
744 
745 			if (b->bw_delta && cl->cl_hogs_m1) {
746 				bw = b->bw_bytes * machclk_freq / b->bw_delta;
747 				if (bw < cl->cl_hogs_m1)
748 					cl->cl_head = b;
749 			}
750 		}
751 	}
752 
753 #ifdef ALTQ_RIO
754 	if (cl->cl_qtype == Q_RIO)
755 		return rio_addq((rio_t *)cl->cl_red, &b->queue, m, cl->cl_pktattr);
756 #endif
757 #ifdef ALTQ_RED
758 	if (cl->cl_qtype == Q_RED)
759 		return red_addq(cl->cl_red, &b->queue, m, cl->cl_pktattr);
760 #endif
761 	if (qlen(&b->queue) >= qlimit(&b->queue)) {
762 		m_freem(m);
763 		return (-1);
764 	}
765 
766 	if (cl->cl_flags & FARF_CLEARDSCP)
767 		write_dsfield(m, cl->cl_pktattr, 0);
768 
769 	_addq(&b->queue, m);
770 
771 	return (0);
772 }
773 
774 static struct mbuf *
775 fairq_getq(struct fairq_class *cl, uint64_t cur_time)
776 {
777 	fairq_bucket_t *b;
778 	struct mbuf *m;
779 
780 	b = fairq_selectq(cl, 0);
781 	if (b == NULL)
782 		m = NULL;
783 #ifdef ALTQ_RIO
784 	else if (cl->cl_qtype == Q_RIO)
785 		m = rio_getq((rio_t *)cl->cl_red, &b->queue);
786 #endif
787 #ifdef ALTQ_RED
788 	else if (cl->cl_qtype == Q_RED)
789 		m = red_getq(cl->cl_red, &b->queue);
790 #endif
791 	else
792 		m = _getq(&b->queue);
793 
794 	/*
795 	 * Calculate the BW change
796 	 */
797 	if (m != NULL) {
798 		uint64_t delta;
799 
800 		/*
801 		 * Per-class bandwidth calculation
802 		 */
803 		delta = (cur_time - cl->cl_last_time);
804 		if (delta > machclk_freq * 8)
805 			delta = machclk_freq * 8;
806 		cl->cl_bw_delta += delta;
807 		cl->cl_bw_bytes += m->m_pkthdr.len;
808 		cl->cl_last_time = cur_time;
809 		if (cl->cl_bw_delta > machclk_freq) {
810 			cl->cl_bw_delta -= cl->cl_bw_delta >> 2;
811 			cl->cl_bw_bytes -= cl->cl_bw_bytes >> 2;
812 		}
813 
814 		/*
815 		 * Per-bucket bandwidth calculation
816 		 */
817 		delta = (cur_time - b->last_time);
818 		if (delta > machclk_freq * 8)
819 			delta = machclk_freq * 8;
820 		b->bw_delta += delta;
821 		b->bw_bytes += m->m_pkthdr.len;
822 		b->last_time = cur_time;
823 		if (b->bw_delta > machclk_freq) {
824 			b->bw_delta -= b->bw_delta >> 2;
825 			b->bw_bytes -= b->bw_bytes >> 2;
826 		}
827 	}
828 	return(m);
829 }
830 
831 /*
832  * Figure out what the next packet would be if there were no limits.  If
833  * this class hits its bandwidth limit *hit_limit is set to no-zero, otherwise
834  * it is set to 0.  A non-NULL mbuf is returned either way.
835  */
836 static struct mbuf *
837 fairq_pollq(struct fairq_class *cl, uint64_t cur_time, int *hit_limit)
838 {
839 	fairq_bucket_t *b;
840 	struct mbuf *m;
841 	uint64_t delta;
842 	uint64_t bw;
843 
844 	*hit_limit = 0;
845 	b = fairq_selectq(cl, 1);
846 	if (b == NULL)
847 		return(NULL);
848 	m = qhead(&b->queue);
849 
850 	/*
851 	 * Did this packet exceed the class bandwidth?  Calculate the
852 	 * bandwidth component of the packet.
853 	 *
854 	 * - Calculate bytes per second
855 	 */
856 	delta = cur_time - cl->cl_last_time;
857 	if (delta > machclk_freq * 8)
858 		delta = machclk_freq * 8;
859 	cl->cl_bw_delta += delta;
860 	cl->cl_last_time = cur_time;
861 	if (cl->cl_bw_delta) {
862 		bw = cl->cl_bw_bytes * machclk_freq / cl->cl_bw_delta;
863 
864 		if (bw > cl->cl_bandwidth)
865 			*hit_limit = 1;
866 		cl->cl_bw_current = bw;
867 #if 0
868 		kprintf("BW %6lld relative to %6u %d queue %p\n",
869 			bw, cl->cl_bandwidth, *hit_limit, b);
870 #endif
871 	}
872 	return(m);
873 }
874 
875 /*
876  * Locate the next queue we want to pull a packet out of.  This code
877  * is also responsible for removing empty buckets from the circular list.
878  */
879 static
880 fairq_bucket_t *
881 fairq_selectq(struct fairq_class *cl, int ispoll)
882 {
883 	fairq_bucket_t *b;
884 	uint64_t bw;
885 
886 	if (ispoll == 0 && cl->cl_polled) {
887 		b = cl->cl_polled;
888 		cl->cl_polled = NULL;
889 		return(b);
890 	}
891 
892 	while ((b = cl->cl_head) != NULL) {
893 		/*
894 		 * Remove empty queues from consideration
895 		 */
896 		if (qempty(&b->queue)) {
897 			b->in_use = 0;
898 			cl->cl_head = b->next;
899 			if (cl->cl_head == b) {
900 				cl->cl_head = NULL;
901 			} else {
902 				b->next->prev = b->prev;
903 				b->prev->next = b->next;
904 			}
905 			continue;
906 		}
907 
908 		/*
909 		 * Advance the round robin.  Queues with bandwidths less
910 		 * then the hog bandwidth are allowed to burst.
911 		 */
912 		if (cl->cl_hogs_m1 == 0) {
913 			cl->cl_head = b->next;
914 		} else if (b->bw_delta) {
915 			bw = b->bw_bytes * machclk_freq / b->bw_delta;
916 			if (bw >= cl->cl_hogs_m1) {
917 				cl->cl_head = b->next;
918 			}
919 			/*
920 			 * XXX TODO -
921 			 */
922 		}
923 
924 		/*
925 		 * Return bucket b.
926 		 */
927 		break;
928 	}
929 	if (ispoll)
930 		cl->cl_polled = b;
931 	return(b);
932 }
933 
934 static void
935 fairq_purgeq(struct fairq_class *cl)
936 {
937 	fairq_bucket_t *b;
938 	struct mbuf *m;
939 
940 	while ((b = fairq_selectq(cl, 0)) != NULL) {
941 		while ((m = _getq(&b->queue)) != NULL) {
942 			PKTCNTR_ADD(&cl->cl_dropcnt, m_pktlen(m));
943 			m_freem(m);
944 		}
945 		KKASSERT(qlen(&b->queue) == 0);
946 	}
947 }
948 
949 static void
950 get_class_stats(struct fairq_classstats *sp, struct fairq_class *cl)
951 {
952 	fairq_bucket_t *b;
953 
954 	sp->class_handle = cl->cl_handle;
955 	sp->qlimit = cl->cl_qlimit;
956 	sp->xmit_cnt = cl->cl_xmitcnt;
957 	sp->drop_cnt = cl->cl_dropcnt;
958 	sp->qtype = cl->cl_qtype;
959 	sp->qlength = 0;
960 
961 	if (cl->cl_head) {
962 		b = cl->cl_head;
963 		do {
964 			sp->qlength += qlen(&b->queue);
965 			b = b->next;
966 		} while (b != cl->cl_head);
967 	}
968 
969 #ifdef ALTQ_RED
970 	if (cl->cl_qtype == Q_RED)
971 		red_getstats(cl->cl_red, &sp->red[0]);
972 #endif
973 #ifdef ALTQ_RIO
974 	if (cl->cl_qtype == Q_RIO)
975 		rio_getstats((rio_t *)cl->cl_red, &sp->red[0]);
976 #endif
977 }
978 
979 /* convert a class handle to the corresponding class pointer */
980 static struct fairq_class *
981 clh_to_clp(struct fairq_if *pif, uint32_t chandle)
982 {
983 	struct fairq_class *cl;
984 	int idx;
985 
986 	if (chandle == 0)
987 		return (NULL);
988 
989 	for (idx = pif->pif_maxpri; idx >= 0; idx--)
990 		if ((cl = pif->pif_classes[idx]) != NULL &&
991 		    cl->cl_handle == chandle)
992 			return (cl);
993 
994 	return (NULL);
995 }
996 
997 #endif /* ALTQ_FAIRQ */
998