1 /*	$NetBSD: altq_rmclass.c,v 1.21 2008/07/15 16:18:08 christos Exp $	*/
2 /*	$KAME: altq_rmclass.c,v 1.19 2005/04/13 03:44:25 suz Exp $	*/
3 
4 /*
5  * Copyright (c) 1991-1997 Regents of the University of California.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *      This product includes software developed by the Network Research
19  *      Group at Lawrence Berkeley Laboratory.
20  * 4. Neither the name of the University nor of the Laboratory may be used
21  *    to endorse or promote products derived from this software without
22  *    specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
 36  * LBL code modified by speer@eng.sun.com, May 1997.
37  * For questions and/or comments, please send mail to cbq@ee.lbl.gov
38  */
39 
40 #include <sys/cdefs.h>
41 __KERNEL_RCSID(0, "$NetBSD: altq_rmclass.c,v 1.21 2008/07/15 16:18:08 christos Exp $");
42 
43 /* #ident "@(#)rm_class.c  1.48     97/12/05 SMI" */
44 
45 #ifdef _KERNEL_OPT
46 #include "opt_altq.h"
47 #include "opt_inet.h"
48 #endif
49 
50 #ifdef ALTQ_CBQ	/* cbq is enabled by ALTQ_CBQ option in opt_altq.h */
51 
52 #include <sys/param.h>
53 #include <sys/malloc.h>
54 #include <sys/mbuf.h>
55 #include <sys/socket.h>
56 #include <sys/systm.h>
57 #include <sys/errno.h>
58 #include <sys/time.h>
59 #ifdef ALTQ3_COMPAT
60 #include <sys/kernel.h>
61 #endif
62 
63 #include <net/if.h>
64 #ifdef ALTQ3_COMPAT
65 #include <netinet/in.h>
66 #include <netinet/in_systm.h>
67 #include <netinet/ip.h>
68 #endif
69 
70 #include <altq/altq.h>
71 #include <altq/altq_rmclass.h>
72 #include <altq/altq_rmclass_debug.h>
73 #include <altq/altq_red.h>
74 #include <altq/altq_rio.h>
75 
76 /*
77  * Local Macros
78  */
79 
80 #define	reset_cutoff(ifd)	{ ifd->cutoff_ = RM_MAXDEPTH; }
81 
82 /*
83  * Local routines.
84  */
85 
86 static int	rmc_satisfied(struct rm_class *, struct timeval *);
87 static void	rmc_wrr_set_weights(struct rm_ifdat *);
88 static void	rmc_depth_compute(struct rm_class *);
89 static void	rmc_depth_recompute(rm_class_t *);
90 
91 static mbuf_t	*_rmc_wrr_dequeue_next(struct rm_ifdat *, int);
92 static mbuf_t	*_rmc_prr_dequeue_next(struct rm_ifdat *, int);
93 
94 static int	_rmc_addq(rm_class_t *, mbuf_t *);
95 static void	_rmc_dropq(rm_class_t *);
96 static mbuf_t	*_rmc_getq(rm_class_t *);
97 static mbuf_t	*_rmc_pollq(rm_class_t *);
98 
99 static int	rmc_under_limit(struct rm_class *, struct timeval *);
100 static void	rmc_tl_satisfied(struct rm_ifdat *, struct timeval *);
101 static void	rmc_drop_action(struct rm_class *);
102 static void	rmc_restart(struct rm_class *);
103 static void	rmc_root_overlimit(struct rm_class *, struct rm_class *);
104 
105 #define	BORROW_OFFTIME
106 /*
107  * BORROW_OFFTIME (experimental):
108  * borrow the offtime of the class borrowing from.
109  * the reason is that when its own offtime is set, the class is unable
110  * to borrow much, especially when cutoff is taking effect.
 111  * but when the borrowed class is overloaded (avgidle is close to minidle),
112  * use the borrowing class's offtime to avoid overload.
113  */
114 #define	ADJUST_CUTOFF
115 /*
116  * ADJUST_CUTOFF (experimental):
117  * if no underlimit class is found due to cutoff, increase cutoff and
118  * retry the scheduling loop.
119  * also, don't invoke delay_actions while cutoff is taking effect,
120  * since a sleeping class won't have a chance to be scheduled in the
121  * next loop.
122  *
 123  * now the heuristics for setting the top-level variable (cutoff_) become:
124  *	1. if a packet arrives for a not-overlimit class, set cutoff
125  *	   to the depth of the class.
126  *	2. if cutoff is i, and a packet arrives for an overlimit class
127  *	   with an underlimit ancestor at a lower level than i (say j),
128  *	   then set cutoff to j.
129  *	3. at scheduling a packet, if there is no underlimit class
130  *	   due to the current cutoff level, increase cutoff by 1 and
131  *	   then try to schedule again.
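 *
 *	e.g., if cutoff_ has been reset to RM_MAXDEPTH and a packet
 *	arrives for an overlimit leaf (depth 0) whose parent (depth 1)
 *	is underlimit, rule 2 lowers cutoff_ to 1: the leaf may then
 *	borrow from its parent, but from nothing higher in the tree.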
132  */
133 
134 /*
135  * rm_class_t *
136  * rmc_newclass(...) - Create a new resource management class at priority
137  * 'pri' on the interface given by 'ifd'.
138  *
139  * nsecPerByte  is the data rate of the interface in nanoseconds/byte.
140  *              E.g., 800 for a 10Mb/s ethernet.  If the class gets less
141  *              than 100% of the bandwidth, this number should be the
142  *              'effective' rate for the class.  Let f be the
143  *              bandwidth fraction allocated to this class, and let
144  *              nsPerByte be the data rate of the output link in
145  *              nanoseconds/byte.  Then nsecPerByte is set to
146  *              nsPerByte / f.  E.g., 1600 (= 800 / .5)
147  *              for a class that gets 50% of an ethernet's bandwidth.
148  *
149  * action       the routine to call when the class is over limit.
150  *
151  * maxq         max allowable queue size for class (in packets).
152  *
153  * parent       parent class pointer.
154  *
155  * borrow       class to borrow from (should be either 'parent' or null).
156  *
157  * maxidle      max value allowed for class 'idle' time estimate (this
158  *              parameter determines how large an initial burst of packets
 159  *              can be before overlimit action is invoked).
160  *
161  * offtime      how long 'delay' action will delay when class goes over
162  *              limit (this parameter determines the steady-state burst
163  *              size when a class is running over its limit).
164  *
165  * Maxidle and offtime have to be computed from the following:  If the
166  * average packet size is s, the bandwidth fraction allocated to this
167  * class is f, we want to allow b packet bursts, and the gain of the
168  * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
169  *
170  *   ptime = s * nsPerByte * (1 - f) / f
171  *   maxidle = ptime * (1 - g^b) / g^b
172  *   minidle = -ptime * (1 / (f - 1))
 173  *   offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1))
174  *
175  * Operationally, it's convenient to specify maxidle & offtime in units
176  * independent of the link bandwidth so the maxidle & offtime passed to
177  * this routine are the above values multiplied by 8*f/(1000*nsPerByte).
178  * (The constant factor is a scale factor needed to make the parameters
179  * integers.  This scaling also means that the 'unscaled' values of
180  * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
181  * not nanoseconds.)  Also note that the 'idle' filter computation keeps
182  * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
183  * maxidle also must be scaled upward by this value.  Thus, the passed
184  * values for maxidle and offtime can be computed as follows:
185  *
186  * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
187  * offtime = offtime * 8 / (1000 * nsecPerByte)
188  *
189  * When USE_HRTIME is employed, then maxidle and offtime become:
 190  * 	maxidle = maxidle * (8.0 / nsecPerByte);
191  * 	offtime = offtime * (8.0 / nsecPerByte);
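 *
 * As an illustrative sketch only (cbq_scale_maxidle is hypothetical and
 * does not exist in the ALTQ sources), the maxidle scaling above could
 * be coded as:
 *
 *	static u_int
 *	cbq_scale_maxidle(uint64_t maxidle_ns, u_int nsecPerByte)
 *	{
 *		return (u_int)((maxidle_ns << RM_FILTER_GAIN) * 8 /
 *		    (1000 * (uint64_t)nsecPerByte));
 *	}
 *
 * offtime would be scaled the same way, minus the 2^RM_FILTER_GAIN
 * factor.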
192  */
193 struct rm_class *
194 rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte,
195     void (*action)(rm_class_t *, rm_class_t *), int maxq,
196     struct rm_class *parent, struct rm_class *borrow, u_int maxidle,
197     int minidle, u_int offtime, int pktsize, int flags)
198 {
199 	struct rm_class	*cl;
200 	struct rm_class	*peer;
201 	int		 s;
202 
203 	if (pri >= RM_MAXPRIO)
204 		return (NULL);
205 #ifndef ALTQ_RED
206 	if (flags & RMCF_RED) {
207 #ifdef ALTQ_DEBUG
208 		printf("rmc_newclass: RED not configured for CBQ!\n");
209 #endif
210 		return (NULL);
211 	}
212 #endif
213 #ifndef ALTQ_RIO
214 	if (flags & RMCF_RIO) {
215 #ifdef ALTQ_DEBUG
216 		printf("rmc_newclass: RIO not configured for CBQ!\n");
217 #endif
218 		return (NULL);
219 	}
220 #endif
221 
222 	cl = malloc(sizeof(struct rm_class), M_DEVBUF, M_WAITOK|M_ZERO);
223 	if (cl == NULL)
224 		return (NULL);
225 	CALLOUT_INIT(&cl->callout_);
226 
227 	cl->q_ = malloc(sizeof(class_queue_t), M_DEVBUF, M_WAITOK|M_ZERO);
228 	if (cl->q_ == NULL) {
229 		free(cl, M_DEVBUF);
230 		return (NULL);
231 	}
232 
233 	/*
234 	 * Class initialization.
235 	 */
236 	cl->children_ = NULL;
237 	cl->parent_ = parent;
238 	cl->borrow_ = borrow;
239 	cl->leaf_ = 1;
240 	cl->ifdat_ = ifd;
241 	cl->pri_ = pri;
242 	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
243 	cl->depth_ = 0;
244 	cl->qthresh_ = 0;
245 	cl->ns_per_byte_ = nsecPerByte;
246 
247 	qlimit(cl->q_) = maxq;
248 	qtype(cl->q_) = Q_DROPHEAD;
249 	qlen(cl->q_) = 0;
250 	cl->flags_ = flags;
251 
252 #if 1 /* minidle is also scaled in ALTQ */
253 	cl->minidle_ = (minidle * (int)nsecPerByte) / 8;
254 	if (cl->minidle_ > 0)
255 		cl->minidle_ = 0;
256 #else
257 	cl->minidle_ = minidle;
258 #endif
259 	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
260 	if (cl->maxidle_ == 0)
261 		cl->maxidle_ = 1;
262 #if 1 /* offtime is also scaled in ALTQ */
263 	cl->avgidle_ = cl->maxidle_;
264 	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
265 	if (cl->offtime_ == 0)
266 		cl->offtime_ = 1;
267 #else
268 	cl->avgidle_ = 0;
269 	cl->offtime_ = (offtime * nsecPerByte) / 8;
270 #endif
271 	cl->overlimit = action;
272 
273 #ifdef ALTQ_RED
274 	if (flags & (RMCF_RED|RMCF_RIO)) {
275 		int red_flags, red_pkttime;
276 
277 		red_flags = 0;
278 		if (flags & RMCF_ECN)
279 			red_flags |= REDF_ECN;
280 		if (flags & RMCF_FLOWVALVE)
281 			red_flags |= REDF_FLOWVALVE;
282 #ifdef ALTQ_RIO
283 		if (flags & RMCF_CLEARDSCP)
284 			red_flags |= RIOF_CLEARDSCP;
285 #endif
286 		red_pkttime = nsecPerByte * pktsize  / 1000;
287 
288 		if (flags & RMCF_RED) {
289 			cl->red_ = red_alloc(0, 0,
290 			    qlimit(cl->q_) * 10/100,
291 			    qlimit(cl->q_) * 30/100,
292 			    red_flags, red_pkttime);
293 			if (cl->red_ != NULL)
294 				qtype(cl->q_) = Q_RED;
295 		}
296 #ifdef ALTQ_RIO
297 		else {
298 			cl->red_ = (red_t *)rio_alloc(0, NULL,
299 						      red_flags, red_pkttime);
300 			if (cl->red_ != NULL)
301 				qtype(cl->q_) = Q_RIO;
302 		}
303 #endif
304 	}
305 #endif /* ALTQ_RED */
306 
307 	/*
308 	 * put the class into the class tree
309 	 */
310 	s = splnet();
311 	if ((peer = ifd->active_[pri]) != NULL) {
312 		/* find the last class at this pri */
313 		cl->peer_ = peer;
314 		while (peer->peer_ != ifd->active_[pri])
315 			peer = peer->peer_;
316 		peer->peer_ = cl;
317 	} else {
318 		ifd->active_[pri] = cl;
319 		cl->peer_ = cl;
320 	}
321 
322 	if (cl->parent_) {
323 		cl->next_ = parent->children_;
324 		parent->children_ = cl;
325 		parent->leaf_ = 0;
326 	}
327 
328 	/*
329 	 * Compute the depth of this class and its ancestors in the class
330 	 * hierarchy.
331 	 */
332 	rmc_depth_compute(cl);
333 
334 	/*
335 	 * If CBQ's WRR is enabled, then initialize the class WRR state.
336 	 */
337 	if (ifd->wrr_) {
338 		ifd->num_[pri]++;
339 		ifd->alloc_[pri] += cl->allotment_;
340 		rmc_wrr_set_weights(ifd);
341 	}
342 	splx(s);
343 	return (cl);
344 }
345 
346 int
347 rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle,
348     int minidle, u_int offtime, int pktsize)
349 {
350 	struct rm_ifdat	*ifd;
351 	u_int		 old_allotment;
352 	int		 s;
353 
354 	ifd = cl->ifdat_;
355 	old_allotment = cl->allotment_;
356 
357 	s = splnet();
358 	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
359 	cl->qthresh_ = 0;
360 	cl->ns_per_byte_ = nsecPerByte;
361 
362 	qlimit(cl->q_) = maxq;
363 
364 #if 1 /* minidle is also scaled in ALTQ */
365 	cl->minidle_ = (minidle * nsecPerByte) / 8;
366 	if (cl->minidle_ > 0)
367 		cl->minidle_ = 0;
368 #else
369 	cl->minidle_ = minidle;
370 #endif
371 	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
372 	if (cl->maxidle_ == 0)
373 		cl->maxidle_ = 1;
374 #if 1 /* offtime is also scaled in ALTQ */
375 	cl->avgidle_ = cl->maxidle_;
376 	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
377 	if (cl->offtime_ == 0)
378 		cl->offtime_ = 1;
379 #else
380 	cl->avgidle_ = 0;
381 	cl->offtime_ = (offtime * nsecPerByte) / 8;
382 #endif
383 
384 	/*
385 	 * If CBQ's WRR is enabled, then initialize the class WRR state.
386 	 */
387 	if (ifd->wrr_) {
388 		ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment;
389 		rmc_wrr_set_weights(ifd);
390 	}
391 	splx(s);
392 	return (0);
393 }
394 
395 /*
396  * static void
397  * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes
 398  *	the appropriate round robin weights for the CBQ weighted round robin
399  *	algorithm.
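 *
 *	For instance (hypothetical numbers): with two classes at one
 *	priority, maxpkt_ = 1500 and alloc_[i] = 120000, M_[i] =
 *	120000 / (2 * 1500) = 40; a class with allotment_ = 60000 then
 *	gets w_allotment_ = 60000 / 40 = 1500 bytes, i.e., one
 *	maximum-sized packet per round.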
400  *
401  *	Returns: NONE
402  */
403 
404 static void
405 rmc_wrr_set_weights(struct rm_ifdat *ifd)
406 {
407 	int		i;
408 	struct rm_class	*cl, *clh;
409 
410 	for (i = 0; i < RM_MAXPRIO; i++) {
411 		/*
412 		 * This is inverted from that of the simulator to
413 		 * maintain precision.
414 		 */
415 		if (ifd->num_[i] == 0)
416 			ifd->M_[i] = 0;
417 		else
418 			ifd->M_[i] = ifd->alloc_[i] /
419 				(ifd->num_[i] * ifd->maxpkt_);
420 		/*
421 		 * Compute the weighted allotment for each class.
422 		 * This takes the expensive div instruction out
423 		 * of the main loop for the wrr scheduling path.
424 		 * These only get recomputed when a class comes or
425 		 * goes.
426 		 */
427 		if (ifd->active_[i] != NULL) {
428 			clh = cl = ifd->active_[i];
429 			do {
430 				/* safe-guard for slow link or alloc_ == 0 */
431 				if (ifd->M_[i] == 0)
432 					cl->w_allotment_ = 0;
433 				else
434 					cl->w_allotment_ = cl->allotment_ /
435 						ifd->M_[i];
436 				cl = cl->peer_;
437 			} while ((cl != NULL) && (cl != clh));
438 		}
439 	}
440 }
441 
442 int
443 rmc_get_weight(struct rm_ifdat *ifd, int pri)
444 {
445 	if ((pri >= 0) && (pri < RM_MAXPRIO))
446 		return (ifd->M_[pri]);
447 	else
448 		return (0);
449 }
450 
451 /*
452  * static void
453  * rmc_depth_compute(struct rm_class *cl) - This function computes the
454  *	appropriate depth of class 'cl' and its ancestors.
455  *
456  *	Returns:	NONE
457  */
458 
459 static void
460 rmc_depth_compute(struct rm_class *cl)
461 {
462 	rm_class_t	*t = cl, *p;
463 
464 	/*
465 	 * Recompute the depth for the branch of the tree.
466 	 */
467 	while (t != NULL) {
468 		p = t->parent_;
469 		if (p && (t->depth_ >= p->depth_)) {
470 			p->depth_ = t->depth_ + 1;
471 			t = p;
472 		} else
473 			t = NULL;
474 	}
475 }
476 
477 /*
478  * static void
479  * rmc_depth_recompute(struct rm_class *cl) - This function re-computes
480  *	the depth of the tree after a class has been deleted.
481  *
482  *	Returns:	NONE
483  */
484 
485 static void
486 rmc_depth_recompute(rm_class_t *cl)
487 {
488 #if 1 /* ALTQ */
489 	rm_class_t	*p, *t;
490 
491 	p = cl;
492 	while (p != NULL) {
493 		if ((t = p->children_) == NULL) {
494 			p->depth_ = 0;
495 		} else {
496 			int cdepth = 0;
497 
498 			while (t != NULL) {
499 				if (t->depth_ > cdepth)
500 					cdepth = t->depth_;
501 				t = t->next_;
502 			}
503 
504 			if (p->depth_ == cdepth + 1)
505 				/* no change to this parent */
506 				return;
507 
508 			p->depth_ = cdepth + 1;
509 		}
510 
511 		p = p->parent_;
512 	}
513 #else
514 	rm_class_t	*t;
515 
516 	if (cl->depth_ >= 1) {
517 		if (cl->children_ == NULL) {
518 			cl->depth_ = 0;
519 		} else if ((t = cl->children_) != NULL) {
520 			while (t != NULL) {
521 				if (t->children_ != NULL)
522 					rmc_depth_recompute(t);
523 				t = t->next_;
524 			}
525 		} else
526 			rmc_depth_compute(cl);
527 	}
528 #endif
529 }
530 
531 /*
532  * void
533  * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This
534  *	function deletes a class from the link-sharing structure and frees
535  *	all resources associated with the class.
536  *
537  *	Returns: NONE
538  */
539 
540 void
541 rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl)
542 {
543 	struct rm_class	*p, *head, *previous;
544 	int		 s;
545 
546 	ASSERT(cl->children_ == NULL);
547 
548 	if (cl->sleeping_)
549 		CALLOUT_STOP(&cl->callout_);
550 
551 	s = splnet();
552 	/*
553 	 * Free packets in the packet queue.
554 	 * XXX - this may not be a desired behavior.  Packets should be
555 	 *		re-queued.
556 	 */
557 	rmc_dropall(cl);
558 
559 	/*
 560 	 * If the class has a parent, then remove the class from the
 561 	 * parent's children chain.
562 	 */
563 	if (cl->parent_ != NULL) {
564 		head = cl->parent_->children_;
565 		p = previous = head;
566 		if (head->next_ == NULL) {
567 			ASSERT(head == cl);
568 			cl->parent_->children_ = NULL;
569 			cl->parent_->leaf_ = 1;
570 		} else while (p != NULL) {
571 			if (p == cl) {
572 				if (cl == head)
573 					cl->parent_->children_ = cl->next_;
574 				else
575 					previous->next_ = cl->next_;
576 				cl->next_ = NULL;
577 				p = NULL;
578 			} else {
579 				previous = p;
580 				p = p->next_;
581 			}
582 		}
583 	}
584 
585 	/*
586 	 * Delete class from class priority peer list.
587 	 */
588 	if ((p = ifd->active_[cl->pri_]) != NULL) {
589 		/*
590 		 * If there is more than one member of this priority
591 		 * level, then look for class(cl) in the priority level.
592 		 */
593 		if (p != p->peer_) {
594 			while (p->peer_ != cl)
595 				p = p->peer_;
596 			p->peer_ = cl->peer_;
597 
598 			if (ifd->active_[cl->pri_] == cl)
599 				ifd->active_[cl->pri_] = cl->peer_;
600 		} else {
601 			ASSERT(p == cl);
602 			ifd->active_[cl->pri_] = NULL;
603 		}
604 	}
605 
606 	/*
607 	 * Recompute the WRR weights.
608 	 */
609 	if (ifd->wrr_) {
610 		ifd->alloc_[cl->pri_] -= cl->allotment_;
611 		ifd->num_[cl->pri_]--;
612 		rmc_wrr_set_weights(ifd);
613 	}
614 
615 	/*
616 	 * Re-compute the depth of the tree.
617 	 */
618 #if 1 /* ALTQ */
619 	rmc_depth_recompute(cl->parent_);
620 #else
621 	rmc_depth_recompute(ifd->root_);
622 #endif
623 
624 	splx(s);
625 
626 	/*
627 	 * Free the class structure.
628 	 */
629 	if (cl->red_ != NULL) {
630 #ifdef ALTQ_RIO
631 		if (q_is_rio(cl->q_))
632 			rio_destroy((rio_t *)cl->red_);
633 #endif
634 #ifdef ALTQ_RED
635 		if (q_is_red(cl->q_))
636 			red_destroy(cl->red_);
637 #endif
638 	}
639 	free(cl->q_, M_DEVBUF);
640 	free(cl, M_DEVBUF);
641 }
642 
643 
644 /*
645  * int
646  * rmc_init(...) - Initialize the resource management data structures
647  *	associated with the output portion of interface 'ifp'.  'ifd' is
648  *	where the structures will be built (for backwards compatibility, the
649  *	structures aren't kept in the ifnet struct).  'nsecPerByte'
650  *	gives the link speed (inverse of bandwidth) in nanoseconds/byte.
651  *	'restart' is the driver-specific routine that the generic 'delay
652  *	until under limit' action will call to restart output.  `maxq'
653  *	is the queue size of the 'link' & 'default' classes.  'maxqueued'
654  *	is the maximum number of packets that the resource management
655  *	code will allow to be queued 'downstream' (this is typically 1).
656  *
657  *	Returns:	0 on success
658  */
659 
660 int
661 rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte,
662     void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle,
663     int minidle, u_int offtime, int flags)
664 {
665 	int i, mtu;
666 
667 	/*
668 	 * Initialize the CBQ tracing/debug facility.
669 	 */
670 	CBQTRACEINIT();
671 
672 	mtu = ifq->altq_ifp->if_mtu;
673 	if (mtu < 1) {
674 		printf("altq: %s: invalid MTU (interface not initialized?)\n",
675 		    ifq->altq_ifp->if_xname);
676 		return (EINVAL);
677 	}
678 
679 	(void)memset((char *)ifd, 0, sizeof (*ifd));
680 	ifd->ifq_ = ifq;
681 	ifd->restart = restart;
682 	ifd->maxqueued_ = maxqueued;
683 	ifd->ns_per_byte_ = nsecPerByte;
684 	ifd->maxpkt_ = mtu;
685 	ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0;
686 	ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0;
687 #if 1
688 	ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16;
689 	if (mtu * nsecPerByte > 10 * 1000000)
690 		ifd->maxiftime_ /= 4;
691 #endif
692 
693 	reset_cutoff(ifd);
694 	CBQTRACE(rmc_init, 'INIT', ifd->cutoff_);
695 
696 	/*
697 	 * Initialize the CBQ's WRR state.
698 	 */
699 	for (i = 0; i < RM_MAXPRIO; i++) {
700 		ifd->alloc_[i] = 0;
701 		ifd->M_[i] = 0;
702 		ifd->num_[i] = 0;
703 		ifd->na_[i] = 0;
704 		ifd->active_[i] = NULL;
705 	}
706 
707 	/*
708 	 * Initialize current packet state.
709 	 */
710 	ifd->qi_ = 0;
711 	ifd->qo_ = 0;
712 	for (i = 0; i < RM_MAXQUEUED; i++) {
713 		ifd->class_[i] = NULL;
714 		ifd->curlen_[i] = 0;
715 		ifd->borrowed_[i] = NULL;
716 	}
717 
718 	/*
719 	 * Create the root class of the link-sharing structure.
720 	 */
721 	if ((ifd->root_ = rmc_newclass(0, ifd,
722 				       nsecPerByte,
723 				       rmc_root_overlimit, maxq, 0, 0,
724 				       maxidle, minidle, offtime,
725 				       0, 0)) == NULL) {
726 		printf("rmc_init: root class not allocated\n");
727 		return (ENOMEM);
728 	}
729 	ifd->root_->depth_ = 0;
730 
731 	return (0);
732 }
733 
734 /*
735  * void
736  * rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by
737  *	mbuf 'm' to queue for resource class 'cl'.  This routine is called
738  *	by a driver's if_output routine.  This routine must be called with
739  *	output packet completion interrupts locked out (to avoid racing with
740  *	rmc_dequeue_next).
741  *
742  *	Returns:	0 on successful queueing
743  *			-1 when packet drop occurs
744  */
745 int
746 rmc_queue_packet(struct rm_class *cl, mbuf_t *m)
747 {
748 	struct timeval	 now;
749 	struct rm_ifdat *ifd = cl->ifdat_;
750 	int		 cpri = cl->pri_;
751 	int		 is_empty = qempty(cl->q_);
752 
753 	RM_GETTIME(now);
754 	if (ifd->cutoff_ > 0) {
755 		if (TV_LT(&cl->undertime_, &now)) {
756 			if (ifd->cutoff_ > cl->depth_)
757 				ifd->cutoff_ = cl->depth_;
758 			CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
759 		}
760 #if 1 /* ALTQ */
761 		else {
762 			/*
763 			 * the class is overlimit. if the class has
764 			 * underlimit ancestors, set cutoff to the lowest
765 			 * depth among them.
766 			 */
767 			struct rm_class *borrow = cl->borrow_;
768 
769 			while (borrow != NULL &&
770 			       borrow->depth_ < ifd->cutoff_) {
771 				if (TV_LT(&borrow->undertime_, &now)) {
772 					ifd->cutoff_ = borrow->depth_;
773 					CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_);
774 					break;
775 				}
776 				borrow = borrow->borrow_;
777 			}
778 		}
779 #else /* !ALTQ */
780 		else if ((ifd->cutoff_ > 1) && cl->borrow_) {
781 			if (TV_LT(&cl->borrow_->undertime_, &now)) {
782 				ifd->cutoff_ = cl->borrow_->depth_;
783 				CBQTRACE(rmc_queue_packet, 'ffob',
784 					 cl->borrow_->depth_);
785 			}
786 		}
787 #endif /* !ALTQ */
788 	}
789 
790 	if (_rmc_addq(cl, m) < 0)
791 		/* failed */
792 		return (-1);
793 
794 	if (is_empty) {
795 		CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle);
796 		ifd->na_[cpri]++;
797 	}
798 
799 	if (qlen(cl->q_) > qlimit(cl->q_)) {
800 		/* note: qlimit can be set to 0 or 1 */
801 		rmc_drop_action(cl);
802 		return (-1);
803 	}
804 	return (0);
805 }
806 
807 /*
808  * void
809  * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all
 810  *	classes to see if they are satisfied.
811  */
812 
813 static void
814 rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now)
815 {
816 	int		 i;
817 	rm_class_t	*p, *bp;
818 
819 	for (i = RM_MAXPRIO - 1; i >= 0; i--) {
820 		if ((bp = ifd->active_[i]) != NULL) {
821 			p = bp;
822 			do {
823 				if (!rmc_satisfied(p, now)) {
824 					ifd->cutoff_ = p->depth_;
825 					return;
826 				}
827 				p = p->peer_;
828 			} while (p != bp);
829 		}
830 	}
831 
832 	reset_cutoff(ifd);
833 }
834 
835 /*
 836  * rmc_satisfied - Return 1 if the class is satisfied, 0 otherwise.
837  */
838 
839 static int
840 rmc_satisfied(struct rm_class *cl, struct timeval *now)
841 {
842 	rm_class_t	*p;
843 
844 	if (cl == NULL)
845 		return (1);
846 	if (TV_LT(now, &cl->undertime_))
847 		return (1);
848 	if (cl->depth_ == 0) {
849 		if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_))
850 			return (0);
851 		else
852 			return (1);
853 	}
854 	if (cl->children_ != NULL) {
855 		p = cl->children_;
856 		while (p != NULL) {
857 			if (!rmc_satisfied(p, now))
858 				return (0);
859 			p = p->next_;
860 		}
861 	}
862 
863 	return (1);
864 }
865 
866 /*
867  * Return 1 if class 'cl' is under limit or can borrow from a parent,
868  * 0 if overlimit.  As a side-effect, this routine will invoke the
 869  * class overlimit action if the class is overlimit.
870  */
871 
872 static int
873 rmc_under_limit(struct rm_class *cl, struct timeval *now)
874 {
875 	rm_class_t	*p = cl;
876 	rm_class_t	*top;
877 	struct rm_ifdat	*ifd = cl->ifdat_;
878 
879 	ifd->borrowed_[ifd->qi_] = NULL;
880 	/*
881 	 * If cl is the root class, then always return that it is
882 	 * underlimit.  Otherwise, check to see if the class is underlimit.
883 	 */
884 	if (cl->parent_ == NULL)
885 		return (1);
886 
887 	if (cl->sleeping_) {
888 		if (TV_LT(now, &cl->undertime_))
889 			return (0);
890 
891 		CALLOUT_STOP(&cl->callout_);
892 		cl->sleeping_ = 0;
893 		cl->undertime_.tv_sec = 0;
894 		return (1);
895 	}
896 
897 	top = NULL;
898 	while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) {
899 		if (((cl = cl->borrow_) == NULL) ||
900 		    (cl->depth_ > ifd->cutoff_)) {
901 #ifdef ADJUST_CUTOFF
902 			if (cl != NULL)
903 				/* cutoff is taking effect, just
904 				   return false without calling
905 				   the delay action. */
906 				return (0);
907 #endif
908 #ifdef BORROW_OFFTIME
909 			/*
910 			 * check if the class can borrow offtime too.
911 			 * borrow offtime from the top of the borrow
912 			 * chain if the top class is not overloaded.
913 			 */
914 			if (cl != NULL) {
915 				/* cutoff is taking effect, use this class as top. */
916 				top = cl;
917 				CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_);
918 			}
919 			if (top != NULL && top->avgidle_ == top->minidle_)
920 				top = NULL;
921 			p->overtime_ = *now;
922 			(p->overlimit)(p, top);
923 #else
924 			p->overtime_ = *now;
925 			(p->overlimit)(p, NULL);
926 #endif
927 			return (0);
928 		}
929 		top = cl;
930 	}
931 
932 	if (cl != p)
933 		ifd->borrowed_[ifd->qi_] = cl;
934 	return (1);
935 }
936 
937 /*
 938  * _rmc_wrr_dequeue_next() - This is the scheduler for WRR, as opposed
 939  *	to packet-by-packet round robin.
940  *
941  * The heart of the weighted round-robin scheduler, which decides which
942  * class next gets to send a packet.  Highest priority first, then
 943  * weighted round-robin within priorities.
944  *
945  * Each able-to-send class gets to send until its byte allocation is
946  * exhausted.  Thus, the active pointer is only changed after a class has
947  * exhausted its allocation.
948  *
949  * If the scheduler finds no class that is underlimit or able to borrow,
950  * then the first class found that had a nonzero queue and is allowed to
951  * borrow gets to send.
952  */
953 
954 static mbuf_t *
955 _rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op)
956 {
957 	struct rm_class	*cl = NULL, *first = NULL;
958 	u_int		 deficit;
959 	int		 cpri;
960 	mbuf_t		*m;
961 	struct timeval	 now;
962 
963 	RM_GETTIME(now);
964 
965 	/*
966 	 * if the driver polls the top of the queue and then removes
967 	 * the polled packet, we must return the same packet.
968 	 */
969 	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
970 		cl = ifd->pollcache_;
971 		cpri = cl->pri_;
972 		if (ifd->efficient_) {
973 			/* check if this class is overlimit */
974 			if (cl->undertime_.tv_sec != 0 &&
975 			    rmc_under_limit(cl, &now) == 0)
976 				first = cl;
977 		}
978 		ifd->pollcache_ = NULL;
979 		goto _wrr_out;
 980 	} else {
982 		/* mode == ALTDQ_POLL || pollcache == NULL */
983 		ifd->pollcache_ = NULL;
984 		ifd->borrowed_[ifd->qi_] = NULL;
985 	}
986 #ifdef ADJUST_CUTOFF
987  _again:
988 #endif
989 	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
990 		if (ifd->na_[cpri] == 0)
991 			continue;
992 		deficit = 0;
993 		/*
994 		 * Loop through twice for a priority level, if some class
995 		 * was unable to send a packet the first round because
996 		 * of the weighted round-robin mechanism.
997 		 * During the second loop at this level, deficit==2.
998 		 * (This second loop is not needed if for every class,
 999 		 * "M[cl->pri_]" times "cl->allotment_" is greater than
1000 		 * the byte size for the largest packet in the class.)
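		 *
		 * e.g., with w_allotment_ = 1000 and 1500-byte packets,
		 * bytes_alloc_ starts successive rounds at 1000, then
		 * 500, then 0 (each send leaves it 500 further behind);
		 * in the round where it starts at 0, the class sends
		 * only in the second (deficit == 2) pass.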
1001 		 */
1002  _wrr_loop:
1003 		cl = ifd->active_[cpri];
1004 		ASSERT(cl != NULL);
1005 		do {
1006 			if ((deficit < 2) && (cl->bytes_alloc_ <= 0))
1007 				cl->bytes_alloc_ += cl->w_allotment_;
1008 			if (!qempty(cl->q_)) {
1009 				if ((cl->undertime_.tv_sec == 0) ||
1010 				    rmc_under_limit(cl, &now)) {
1011 					if (cl->bytes_alloc_ > 0 || deficit > 1)
1012 						goto _wrr_out;
1013 
1014 					/* underlimit but no alloc */
1015 					deficit = 1;
1016 #if 1
1017 					ifd->borrowed_[ifd->qi_] = NULL;
1018 #endif
1019 				}
1020 				else if (first == NULL && cl->borrow_ != NULL)
1021 					first = cl; /* borrowing candidate */
1022 			}
1023 
1024 			cl->bytes_alloc_ = 0;
1025 			cl = cl->peer_;
1026 		} while (cl != ifd->active_[cpri]);
1027 
1028 		if (deficit == 1) {
1029 			/* first loop found an underlimit class with deficit */
1030 			/* Loop on same priority level, with new deficit.  */
1031 			deficit = 2;
1032 			goto _wrr_loop;
1033 		}
1034 	}
1035 
1036 #ifdef ADJUST_CUTOFF
1037 	/*
1038 	 * no underlimit class found.  if cutoff is taking effect,
1039 	 * increase cutoff and try again.
1040 	 */
1041 	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
1042 		ifd->cutoff_++;
1043 		CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_);
1044 		goto _again;
1045 	}
1046 #endif /* ADJUST_CUTOFF */
1047 	/*
1048 	 * If LINK_EFFICIENCY is turned on, then the first overlimit
1049 	 * class we encounter will send a packet if all the classes
1050 	 * of the link-sharing structure are overlimit.
1051 	 */
1052 	reset_cutoff(ifd);
1053 	CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_);
1054 
1055 	if (!ifd->efficient_ || first == NULL)
1056 		return (NULL);
1057 
1058 	cl = first;
1059 	cpri = cl->pri_;
1060 #if 0	/* too time-consuming for nothing */
1061 	if (cl->sleeping_)
1062 		CALLOUT_STOP(&cl->callout_);
1063 	cl->sleeping_ = 0;
1064 	cl->undertime_.tv_sec = 0;
1065 #endif
1066 	ifd->borrowed_[ifd->qi_] = cl->borrow_;
1067 	ifd->cutoff_ = cl->borrow_->depth_;
1068 
1069 	/*
 1070 	 * Dequeue the packet and do the bookkeeping...
1071 	 */
1072  _wrr_out:
1073 	if (op == ALTDQ_REMOVE) {
1074 		m = _rmc_getq(cl);
1075 		if (m == NULL)
1076 			panic("_rmc_wrr_dequeue_next");
1077 		if (qempty(cl->q_))
1078 			ifd->na_[cpri]--;
1079 
1080 		/*
1081 		 * Update class statistics and link data.
1082 		 */
1083 		if (cl->bytes_alloc_ > 0)
1084 			cl->bytes_alloc_ -= m_pktlen(m);
1085 
1086 		if ((cl->bytes_alloc_ <= 0) || first == cl)
1087 			ifd->active_[cl->pri_] = cl->peer_;
1088 		else
1089 			ifd->active_[cl->pri_] = cl;
1090 
1091 		ifd->class_[ifd->qi_] = cl;
1092 		ifd->curlen_[ifd->qi_] = m_pktlen(m);
1093 		ifd->now_[ifd->qi_] = now;
1094 		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
1095 		ifd->queued_++;
1096 	} else {
 1097 		/* mode == ALTDQ_POLL */
1098 		m = _rmc_pollq(cl);
1099 		ifd->pollcache_ = cl;
1100 	}
1101 	return (m);
1102 }
1103 
1104 /*
1105  * Dequeue & return next packet from the highest priority class that
1106  * has a packet to send & has enough allocation to send it.  This
1107  * routine is called by a driver whenever it needs a new packet to
1108  * output.
1109  */
1110 static mbuf_t *
1111 _rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op)
1112 {
1113 	mbuf_t		*m;
1114 	int		 cpri;
1115 	struct rm_class	*cl, *first = NULL;
1116 	struct timeval	 now;
1117 
1118 	RM_GETTIME(now);
1119 
1120 	/*
1121 	 * if the driver polls the top of the queue and then removes
1122 	 * the polled packet, we must return the same packet.
1123 	 */
1124 	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
1125 		cl = ifd->pollcache_;
1126 		cpri = cl->pri_;
1127 		ifd->pollcache_ = NULL;
1128 		goto _prr_out;
1129 	} else {
1130 		/* mode == ALTDQ_POLL || pollcache == NULL */
1131 		ifd->pollcache_ = NULL;
1132 		ifd->borrowed_[ifd->qi_] = NULL;
1133 	}
1134 #ifdef ADJUST_CUTOFF
1135  _again:
1136 #endif
1137 	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
1138 		if (ifd->na_[cpri] == 0)
1139 			continue;
1140 		cl = ifd->active_[cpri];
1141 		ASSERT(cl != NULL);
1142 		do {
1143 			if (!qempty(cl->q_)) {
1144 				if ((cl->undertime_.tv_sec == 0) ||
1145 				    rmc_under_limit(cl, &now))
1146 					goto _prr_out;
1147 				if (first == NULL && cl->borrow_ != NULL)
1148 					first = cl;
1149 			}
1150 			cl = cl->peer_;
1151 		} while (cl != ifd->active_[cpri]);
1152 	}
1153 
1154 #ifdef ADJUST_CUTOFF
1155 	/*
1156 	 * no underlimit class found.  if cutoff is taking effect, increase
1157 	 * cutoff and try again.
1158 	 */
1159 	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
1160 		ifd->cutoff_++;
1161 		goto _again;
1162 	}
1163 #endif /* ADJUST_CUTOFF */
1164 	/*
1165 	 * If LINK_EFFICIENCY is turned on, then the first overlimit
1166 	 * class we encounter will send a packet if all the classes
1167 	 * of the link-sharing structure are overlimit.
1168 	 */
1169 	reset_cutoff(ifd);
1170 	if (!ifd->efficient_ || first == NULL)
1171 		return (NULL);
1172 
1173 	cl = first;
1174 	cpri = cl->pri_;
1175 #if 0	/* too time-consuming for nothing */
1176 	if (cl->sleeping_)
1177 		CALLOUT_STOP(&cl->callout_);
1178 	cl->sleeping_ = 0;
1179 	cl->undertime_.tv_sec = 0;
1180 #endif
1181 	ifd->borrowed_[ifd->qi_] = cl->borrow_;
1182 	ifd->cutoff_ = cl->borrow_->depth_;
1183 
1184 	/*
 1185 	 * Dequeue the packet and do the bookkeeping...
1186 	 */
1187  _prr_out:
1188 	if (op == ALTDQ_REMOVE) {
1189 		m = _rmc_getq(cl);
1190 		if (m == NULL)
1191 			panic("_rmc_prr_dequeue_next");
1192 		if (qempty(cl->q_))
1193 			ifd->na_[cpri]--;
1194 
1195 		ifd->active_[cpri] = cl->peer_;
1196 
1197 		ifd->class_[ifd->qi_] = cl;
1198 		ifd->curlen_[ifd->qi_] = m_pktlen(m);
1199 		ifd->now_[ifd->qi_] = now;
1200 		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
1201 		ifd->queued_++;
1202 	} else {
1203 		/* mode == ALTDQ_POLL */
1204 		m = _rmc_pollq(cl);
1205 		ifd->pollcache_ = cl;
1206 	}
1207 	return (m);
1208 }
1209 
1210 /*
1211  * mbuf_t *
1212  * rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this function
1213  *	is invoked by the packet driver to get the next packet to be
1214  *	dequeued and output on the link.  If WRR is enabled, then the
 1215  *	WRR dequeue next routine will determine the next packet to be sent.
1216  *	Otherwise, packet-by-packet round robin is invoked.
1217  *
1218  *	Returns:	NULL, if a packet is not available or if all
1219  *			classes are overlimit.
1220  *
 1221  *			Otherwise, a pointer to the next packet.
1222  */
1223 
1224 mbuf_t *
1225 rmc_dequeue_next(struct rm_ifdat *ifd, int mode)
1226 {
1227 	if (ifd->queued_ >= ifd->maxqueued_)
1228 		return (NULL);
1229 	else if (ifd->wrr_)
1230 		return (_rmc_wrr_dequeue_next(ifd, mode));
1231 	else
1232 		return (_rmc_prr_dequeue_next(ifd, mode));
1233 }
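
/*
 * Illustrative only: under the estimated-completion model used by
 * rmc_update_class_util() below, a driver's transmit loop might look
 * like this sketch:
 *
 *	mbuf_t *m;
 *
 *	while ((m = rmc_dequeue_next(ifd, ALTDQ_REMOVE)) != NULL) {
 *		... hand m to the hardware ...
 *		rmc_update_class_util(ifd);
 *	}
 */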
1234 
1235 /*
1236  * Update the utilization estimate for the packet that just completed.
1237  * The packet's class & the parent(s) of that class all get their
1238  * estimators updated.  This routine is called by the driver's output-
1239  * packet-completion interrupt service routine.
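 *
 * (The update  avgidle += idle - (avgidle >> RM_FILTER_GAIN)  used below
 * is the fixed-point form of the EWMA  new = (1 - w) * old + w * idle
 * with w = 2^-RM_FILTER_GAIN, on an estimate kept scaled up by
 * 2^RM_FILTER_GAIN as described in the rmc_newclass() comment.)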
1240  */
1241 
1242 /*
1243  * a macro to approximate "divide by 1000" that gives 0.000999,
1244  * if a value has enough effective digits.
1245  * (on pentium, mul takes 9 cycles but div takes 46!)
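 * (the three shifts sum to 1/1024 + 1/65536 + 1/131072 = 0.00099945,
 * within about 0.06% of an exact divide by 1000.)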
1246  */
1247 #define	NSEC_TO_USEC(t)	(((t) >> 10) + ((t) >> 16) + ((t) >> 17))
1248 void
1249 rmc_update_class_util(struct rm_ifdat *ifd)
1250 {
1251 	int		 idle, avgidle, pktlen;
1252 	int		 pkt_time, tidle;
1253 	rm_class_t	*cl, *borrowed;
1254 	rm_class_t	*borrows;
1255 	struct timeval	*nowp;
1256 
1257 	/*
1258 	 * Get the most recent completed class.
1259 	 */
1260 	if ((cl = ifd->class_[ifd->qo_]) == NULL)
1261 		return;
1262 
1263 	pktlen = ifd->curlen_[ifd->qo_];
1264 	borrowed = ifd->borrowed_[ifd->qo_];
1265 	borrows = borrowed;
1266 
1267 	PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
1268 
1269 	/*
1270 	 * Run estimator on class and its ancestors.
1271 	 */
1272 	/*
1273 	 * rm_update_class_util is designed to be called when the
1274 	 * transfer is completed from a xmit complete interrupt,
1275 	 * but most drivers don't implement an upcall for that.
1276 	 * so, just use estimated completion time.
1277 	 * as a result, ifd->qi_ and ifd->qo_ are always synced.
1278 	 */
1279 	nowp = &ifd->now_[ifd->qo_];
1280 	/* get pkt_time (for link) in usec */
1281 #if 1  /* use approximation */
1282 	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_;
1283 	pkt_time = NSEC_TO_USEC(pkt_time);
1284 #else
1285 	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000;
1286 #endif
1287 #if 1 /* ALTQ4PPP */
1288 	if (TV_LT(nowp, &ifd->ifnow_)) {
1289 		int iftime;
1290 
1291 		/*
1292 		 * make sure the estimated completion time does not go
1293 		 * too far.  it can happen when the link layer supports
1294 		 * data compression or the interface speed is set to
1295 		 * a much lower value.
1296 		 */
1297 		TV_DELTA(&ifd->ifnow_, nowp, iftime);
1298 		if (iftime+pkt_time < ifd->maxiftime_) {
1299 			TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
1300 		} else {
1301 			TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_);
1302 		}
1303 	} else {
1304 		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
1305 	}
1306 #else
1307 	if (TV_LT(nowp, &ifd->ifnow_)) {
1308 		TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
1309 	} else {
1310 		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
1311 	}
1312 #endif
1313 
1314 	while (cl != NULL) {
1315 		TV_DELTA(&ifd->ifnow_, &cl->last_, idle);
1316 		if (idle >= 2000000)
1317 			/*
1318 			 * this class is idle enough, reset avgidle.
1319 			 * (TV_DELTA returns 2000000 us when delta is large.)
1320 			 */
1321 			cl->avgidle_ = cl->maxidle_;
1322 
1323 		/* get pkt_time (for class) in usec */
1324 #if 1  /* use approximation */
1325 		pkt_time = pktlen * cl->ns_per_byte_;
1326 		pkt_time = NSEC_TO_USEC(pkt_time);
1327 #else
1328 		pkt_time = pktlen * cl->ns_per_byte_ / 1000;
1329 #endif
1330 		idle -= pkt_time;
1331 
1332 		avgidle = cl->avgidle_;
1333 		avgidle += idle - (avgidle >> RM_FILTER_GAIN);
1334 		cl->avgidle_ = avgidle;
1335 
1336 		/* Are we overlimit ? */
1337 		if (avgidle <= 0) {
1338 			CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle);
1339 #if 1 /* ALTQ */
1340 			/*
1341 			 * need some lower bound for avgidle, otherwise
1342 			 * a borrowing class gets unbounded penalty.
1343 			 */
1344 			if (avgidle < cl->minidle_)
1345 				avgidle = cl->avgidle_ = cl->minidle_;
1346 #endif
1347 			/* set next idle to make avgidle 0 */
1348 			tidle = pkt_time +
1349 				(((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN);
1350 			TV_ADD_DELTA(nowp, tidle, &cl->undertime_);
1351 			++cl->stats_.over;
1352 		} else {
1353 			cl->avgidle_ =
1354 			    (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle;
1355 			cl->undertime_.tv_sec = 0;
1356 			if (cl->sleeping_) {
1357 				CALLOUT_STOP(&cl->callout_);
1358 				cl->sleeping_ = 0;
1359 			}
1360 		}
1361 
1362 		if (borrows != NULL) {
1363 			if (borrows != cl)
1364 				++cl->stats_.borrows;
1365 			else
1366 				borrows = NULL;
1367 		}
1368 		cl->last_ = ifd->ifnow_;
1369 		cl->last_pkttime_ = pkt_time;
1370 
1371 #if 1
1372 		if (cl->parent_ == NULL) {
1373 			/* take stats of root class */
1374 			PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
1375 		}
1376 #endif
1377 
1378 		cl = cl->parent_;
1379 	}
1380 
1381 	/*
 1382 	 * Check to see if cutoff needs to be set to a new level.
1383 	 */
1384 	cl = ifd->class_[ifd->qo_];
1385 	if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) {
1386 #if 1 /* ALTQ */
1387 		if ((qlen(cl->q_) <= 0) || TV_LT(nowp, &borrowed->undertime_)) {
1388 			rmc_tl_satisfied(ifd, nowp);
1389 			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
1390 		} else {
1391 			ifd->cutoff_ = borrowed->depth_;
1392 			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
1393 		}
1394 #else /* !ALTQ */
1395 		if ((qlen(cl->q_) <= 1) || TV_LT(&now, &borrowed->undertime_)) {
1396 			reset_cutoff(ifd);
1397 #ifdef notdef
1398 			rmc_tl_satisfied(ifd, &now);
1399 #endif
1400 			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
1401 		} else {
1402 			ifd->cutoff_ = borrowed->depth_;
1403 			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
1404 		}
1405 #endif /* !ALTQ */
1406 	}
1407 
1408 	/*
1409 	 * Release class slot
1410 	 */
1411 	ifd->borrowed_[ifd->qo_] = NULL;
1412 	ifd->class_[ifd->qo_] = NULL;
1413 	ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_;
1414 	ifd->queued_--;
1415 }
1416 
1417 /*
1418  * void
1419  * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific)
1420  *	over-limit action routines.  These get invoked by rmc_under_limit()
 1421  *	if a class with packets to send is over its bandwidth limit & can't
1422  *	borrow from a parent class.
1423  *
1424  *	Returns: NONE
1425  */
1426 
1427 static void
1428 rmc_drop_action(struct rm_class *cl)
1429 {
1430 	struct rm_ifdat	*ifd = cl->ifdat_;
1431 
1432 	ASSERT(qlen(cl->q_) > 0);
1433 	_rmc_dropq(cl);
1434 	if (qempty(cl->q_))
1435 		ifd->na_[cl->pri_]--;
1436 }
1437 
1438 void
1439 rmc_dropall(struct rm_class *cl)
1440 {
1441 	struct rm_ifdat	*ifd = cl->ifdat_;
1442 
1443 	if (!qempty(cl->q_)) {
1444 		_flushq(cl->q_);
1445 
1446 		ifd->na_[cl->pri_]--;
1447 	}
1448 }
1449 
1450 #if (__FreeBSD_version > 300000)
1451 static int tvhzto(struct timeval *);
1452 
1453 static int
1454 tvhzto(struct timeval *tv)
1455 {
1456 	struct timeval t2;
1457 
1458 	getmicrotime(&t2);
1459 	t2.tv_sec = tv->tv_sec - t2.tv_sec;
1460 	t2.tv_usec = tv->tv_usec - t2.tv_usec;
1461 	return (tvtohz(&t2));
1462 }
1463 #endif /* __FreeBSD_version > 300000 */
1464 
1465 /*
1466  * void
1467  * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ
1468  *	delay action routine.  It is invoked via rmc_under_limit when the
 1469  *	packet is discovered to be overlimit.
1470  *
 1471  *	If the delay action is the result of a borrowed class being
 1472  *	overlimit, then delay for the offtime of that borrowed class.
1473  *
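 *	For example (assuming hz = 100, so tick = 10000 usec), any
 *	computed delay of 20000 usec or less sleeps for the 2-tick
 *	minimum used below; longer delays are converted to clock ticks
 *	with tvhzto().
 *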
1474  *	Returns: NONE
1475  */
1476 
1477 void
1478 rmc_delay_action(struct rm_class *cl, struct rm_class *borrow)
1479 {
1480 	int	ndelay, t, extradelay;
1481 
1482 	cl->stats_.overactions++;
1483 	TV_DELTA(&cl->undertime_, &cl->overtime_, ndelay);
1484 #ifndef BORROW_OFFTIME
1485 	ndelay += cl->offtime_;
1486 #endif
1487 
1488 	if (!cl->sleeping_) {
1489 		CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle);
1490 #ifdef BORROW_OFFTIME
1491 		if (borrow != NULL)
1492 			extradelay = borrow->offtime_;
1493 		else
1494 #endif
1495 			extradelay = cl->offtime_;
1496 
1497 #ifdef ALTQ
1498 		/*
1499 		 * XXX recalculate suspend time:
1500 		 * current undertime is (tidle + pkt_time) calculated
1501 		 * from the last transmission.
1502 		 *	tidle: time required to bring avgidle back to 0
1503 		 *	pkt_time: target waiting time for this class
1504 		 * we need to replace pkt_time by offtime
1505 		 */
1506 		extradelay -= cl->last_pkttime_;
1507 #endif
1508 		if (extradelay > 0) {
1509 			TV_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_);
1510 			ndelay += extradelay;
1511 		}
1512 
1513 		cl->sleeping_ = 1;
1514 		cl->stats_.delays++;
1515 
1516 		/*
1517 		 * Since packets are phased randomly with respect to the
1518 		 * clock, 1 tick (the next clock tick) can be an arbitrarily
1519 		 * short time so we have to wait for at least two ticks.
1520 		 * NOTE:  If there's no other traffic, we need the timer as
1521 		 * a 'backstop' to restart this class.
1522 		 */
1523 		if (ndelay > tick * 2) {
1524 #ifdef __FreeBSD__
1525 			/* FreeBSD rounds up the tick */
1526 			t = tvhzto(&cl->undertime_);
1527 #else
1528 			/* other BSDs round down the tick */
1529 			t = tvhzto(&cl->undertime_) + 1;
1530 #endif
1531 		} else
1532 			t = 2;
1533 		CALLOUT_RESET(&cl->callout_, t,
1534 			      (timeout_t *)rmc_restart, (void *)cl);
1535 	}
1536 }
1537 
1538 /*
1539  * void
1540  * rmc_restart() - is just a helper routine for rmc_delay_action -- it is
 1541  *	called by the system timer code & is responsible for checking if the
1542  *	class is still sleeping (it might have been restarted as a side
1543  *	effect of the queue scan on a packet arrival) and, if so, restarting
1544  *	output for the class.  Inspecting the class state & restarting output
1545  *	require locking the class structure.  In general the driver is
1546  *	responsible for locking but this is the only routine that is not
 1547  *	called directly or indirectly from the interface driver so it has to
 1548  *	know about system locking conventions.  Under BSD, locking is done
1549  *	by raising IPL to splnet so that's what's implemented here.  On a
1550  *	different system this would probably need to be changed.
1551  *
1552  *	Returns:	NONE
1553  */
1554 
1555 static void
1556 rmc_restart(struct rm_class *cl)
1557 {
1558 	struct rm_ifdat	*ifd = cl->ifdat_;
1559 	int		 s;
1560 
1561 	s = splnet();
1562 	if (cl->sleeping_) {
1563 		cl->sleeping_ = 0;
1564 		cl->undertime_.tv_sec = 0;
1565 
1566 		if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) {
1567 			CBQTRACE(rmc_restart, 'trts', cl->stats_.handle);
1568 			(ifd->restart)(ifd->ifq_);
1569 		}
1570 	}
1571 	splx(s);
1572 }
1573 
1574 /*
1575  * void
 1576  * rmc_root_overlimit(struct rm_class *cl) - This is the generic overlimit
1577  *	handling routine for the root class of the link sharing structure.
1578  *
1579  *	Returns: NONE
1580  */
1581 
1582 static void
1583 rmc_root_overlimit(struct rm_class *cl,
1584     struct rm_class *borrow)
1585 {
1586 	panic("rmc_root_overlimit");
1587 }
1588 
1589 /*
1590  * Packet Queue handling routines.  Eventually, this is to localize the
1591  *	effects on the code whether queues are red queues or droptail
1592  *	queues.
1593  */
1594 
1595 static int
1596 _rmc_addq(rm_class_t *cl, mbuf_t *m)
1597 {
1598 #ifdef ALTQ_RIO
1599 	if (q_is_rio(cl->q_))
1600 		return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_);
1601 #endif
1602 #ifdef ALTQ_RED
1603 	if (q_is_red(cl->q_))
1604 		return red_addq(cl->red_, cl->q_, m, cl->pktattr_);
1605 #endif /* ALTQ_RED */
1606 
1607 	if (cl->flags_ & RMCF_CLEARDSCP)
1608 		write_dsfield(m, cl->pktattr_, 0);
1609 
1610 	_addq(cl->q_, m);
1611 	return (0);
1612 }
1613 
1614 /* note: _rmc_dropq is not called for red */
1615 static void
1616 _rmc_dropq(rm_class_t *cl)
1617 {
1618 	mbuf_t	*m;
1619 
1620 	if ((m = _getq(cl->q_)) != NULL)
1621 		m_freem(m);
1622 }
1623 
1624 static mbuf_t *
1625 _rmc_getq(rm_class_t *cl)
1626 {
1627 #ifdef ALTQ_RIO
1628 	if (q_is_rio(cl->q_))
1629 		return rio_getq((rio_t *)cl->red_, cl->q_);
1630 #endif
1631 #ifdef ALTQ_RED
1632 	if (q_is_red(cl->q_))
1633 		return red_getq(cl->red_, cl->q_);
1634 #endif
1635 	return _getq(cl->q_);
1636 }
1637 
1638 static mbuf_t *
1639 _rmc_pollq(rm_class_t *cl)
1640 {
1641 	return qhead(cl->q_);
1642 }
1643 
1644 #ifdef CBQ_TRACE
1645 
1646 struct cbqtrace		 cbqtrace_buffer[NCBQTRACE+1];
1647 struct cbqtrace		*cbqtrace_ptr = NULL;
1648 int			 cbqtrace_count;
1649 
1650 /*
1651  * DDB hook to trace cbq events:
1652  *  the last 1024 events are held in a circular buffer.
1653  *  use "call cbqtrace_dump(N)" to display 20 events from Nth event.
1654  */
1655 void cbqtrace_dump(int);
1656 static char *rmc_funcname(void *);
1657 
1658 static struct rmc_funcs {
1659 	void	*func;
1660 	char	*name;
1661 } rmc_funcs[] =
1662 {
1663 	rmc_init,		"rmc_init",
1664 	rmc_queue_packet,	"rmc_queue_packet",
1665 	rmc_under_limit,	"rmc_under_limit",
1666 	rmc_update_class_util,	"rmc_update_class_util",
1667 	rmc_delay_action,	"rmc_delay_action",
1668 	rmc_restart,		"rmc_restart",
1669 	_rmc_wrr_dequeue_next,	"_rmc_wrr_dequeue_next",
1670 	NULL,			NULL
1671 };
1672 
1673 static char *
1674 rmc_funcname(void *func)
1675 {
1676 	struct rmc_funcs *fp;
1677 
1678 	for (fp = rmc_funcs; fp->func != NULL; fp++)
1679 		if (fp->func == func)
1680 			return (fp->name);
1681 	return ("unknown");
1682 }
1683 
1684 void
1685 cbqtrace_dump(int counter)
1686 {
1687 	int	 i, *p;
1688 	char	*cp;
1689 
1690 	counter = counter % NCBQTRACE;
1691 	p = (int *)&cbqtrace_buffer[counter];
1692 
1693 	for (i=0; i<20; i++) {
1694 		printf("[0x%x] ", *p++);
1695 		printf("%s: ", rmc_funcname((void *)*p++));
1696 		cp = (char *)p++;
1697 		printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]);
1698 		printf("%d\n",*p++);
1699 
1700 		if (p >= (int *)&cbqtrace_buffer[NCBQTRACE])
1701 			p = (int *)cbqtrace_buffer;
1702 	}
1703 }
1704 #endif /* CBQ_TRACE */
1705 #endif /* ALTQ_CBQ */
1706 
1707 #if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || defined(ALTQ_HFSC) || defined(ALTQ_PRIQ)
1708 #if !defined(__GNUC__) || defined(ALTQ_DEBUG)
1709 
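/*
 * These class queues are circular singly-linked lists of mbufs chained
 * through m_nextpkt: qtail(q) points at the last packet, and that
 * packet's m_nextpkt points back at the head, so both enqueueing at the
 * tail and dequeueing at the head work from the single tail pointer.
 */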
1710 void
1711 _addq(class_queue_t *q, mbuf_t *m)
1712 {
 1713 	mbuf_t	*m0;
1714 
1715 	if ((m0 = qtail(q)) != NULL)
1716 		m->m_nextpkt = m0->m_nextpkt;
1717 	else
1718 		m0 = m;
1719 	m0->m_nextpkt = m;
1720 	qtail(q) = m;
1721 	qlen(q)++;
1722 }
1723 
1724 mbuf_t *
1725 _getq(class_queue_t *q)
1726 {
1727 	mbuf_t	*m, *m0;
1728 
1729 	if ((m = qtail(q)) == NULL)
1730 		return (NULL);
1731 	if ((m0 = m->m_nextpkt) != m)
1732 		m->m_nextpkt = m0->m_nextpkt;
1733 	else {
1734 		ASSERT(qlen(q) == 1);
1735 		qtail(q) = NULL;
1736 	}
1737 	qlen(q)--;
1738 	m0->m_nextpkt = NULL;
1739 	return (m0);
1740 }
1741 
1742 /* drop a packet at the tail of the queue */
1743 mbuf_t *
1744 _getq_tail(class_queue_t *q)
1745 {
1746 	mbuf_t	*m, *m0, *prev;
1747 
1748 	if ((m = m0 = qtail(q)) == NULL)
1749 		return NULL;
1750 	do {
1751 		prev = m0;
1752 		m0 = m0->m_nextpkt;
1753 	} while (m0 != m);
1754 	prev->m_nextpkt = m->m_nextpkt;
1755 	if (prev == m)  {
1756 		ASSERT(qlen(q) == 1);
1757 		qtail(q) = NULL;
1758 	} else
1759 		qtail(q) = prev;
1760 	qlen(q)--;
1761 	m->m_nextpkt = NULL;
1762 	return (m);
1763 }
1764 
1765 /* randomly select a packet in the queue */
1766 mbuf_t *
1767 _getq_random(class_queue_t *q)
1768 {
1769 	struct mbuf	*m;
1770 	int		 i, n;
1771 
1772 	if ((m = qtail(q)) == NULL)
1773 		return NULL;
1774 	if (m->m_nextpkt == m) {
1775 		ASSERT(qlen(q) == 1);
1776 		qtail(q) = NULL;
1777 	} else {
1778 		struct mbuf *prev = NULL;
1779 
1780 		n = arc4random() % qlen(q) + 1;
1781 		for (i = 0; i < n; i++) {
1782 			prev = m;
1783 			m = m->m_nextpkt;
1784 		}
1785 		prev->m_nextpkt = m->m_nextpkt;
1786 		if (m == qtail(q))
1787 			qtail(q) = prev;
1788 	}
1789 	qlen(q)--;
1790 	m->m_nextpkt = NULL;
1791 	return (m);
1792 }
1793 
1794 void
1795 _removeq(class_queue_t *q, mbuf_t *m)
1796 {
1797 	mbuf_t	*m0, *prev;
1798 
1799 	m0 = qtail(q);
1800 	do {
1801 		prev = m0;
1802 		m0 = m0->m_nextpkt;
1803 	} while (m0 != m);
1804 	prev->m_nextpkt = m->m_nextpkt;
1805 	if (prev == m)
1806 		qtail(q) = NULL;
1807 	else if (qtail(q) == m)
1808 		qtail(q) = prev;
1809 	qlen(q)--;
1810 }
1811 
1812 void
1813 _flushq(class_queue_t *q)
1814 {
1815 	mbuf_t *m;
1816 
1817 	while ((m = _getq(q)) != NULL)
1818 		m_freem(m);
1819 	ASSERT(qlen(q) == 0);
1820 }
1821 
1822 #endif /* !__GNUC__ || ALTQ_DEBUG */
1823 #endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ */
1824