xref: /netbsd-src/sys/altq/altq_red.c (revision 9fbd88883c38d0c0fbfcbe66d76fe6b0fab3f9de)
1 /*	$NetBSD: altq_red.c,v 1.5 2001/11/12 23:14:22 lukem Exp $	*/
2 /*	$KAME: altq_red.c,v 1.8 2000/12/14 08:12:46 thorpej Exp $	*/
3 
4 /*
5  * Copyright (C) 1997-2000
6  *	Sony Computer Science Laboratories Inc.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  */
30 /*
31  * Copyright (c) 1990-1994 Regents of the University of California.
32  * All rights reserved.
33  *
34  * Redistribution and use in source and binary forms, with or without
35  * modification, are permitted provided that the following conditions
36  * are met:
37  * 1. Redistributions of source code must retain the above copyright
38  *    notice, this list of conditions and the following disclaimer.
39  * 2. Redistributions in binary form must reproduce the above copyright
40  *    notice, this list of conditions and the following disclaimer in the
41  *    documentation and/or other materials provided with the distribution.
42  * 3. All advertising materials mentioning features or use of this software
43  *    must display the following acknowledgement:
44  *	This product includes software developed by the Computer Systems
45  *	Engineering Group at Lawrence Berkeley Laboratory.
46  * 4. Neither the name of the University nor of the Laboratory may be used
47  *    to endorse or promote products derived from this software without
48  *    specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  */
62 
63 #include <sys/cdefs.h>
64 __KERNEL_RCSID(0, "$NetBSD: altq_red.c,v 1.5 2001/11/12 23:14:22 lukem Exp $");
65 
66 #if defined(__FreeBSD__) || defined(__NetBSD__)
67 #include "opt_altq.h"
68 #if (__FreeBSD__ != 2)
69 #include "opt_inet.h"
70 #ifdef __FreeBSD__
71 #include "opt_inet6.h"
72 #endif
73 #endif
74 #endif /* __FreeBSD__ || __NetBSD__ */
75 #ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
76 
77 #include <sys/param.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/socket.h>
81 #include <sys/sockio.h>
82 #include <sys/systm.h>
83 #include <sys/proc.h>
84 #include <sys/errno.h>
85 #include <sys/kernel.h>
86 #ifdef ALTQ_FLOWVALVE
87 #include <sys/queue.h>
88 #include <sys/time.h>
89 #endif
90 
91 #include <net/if.h>
92 #include <net/if_types.h>
93 
94 #include <netinet/in.h>
95 #include <netinet/in_systm.h>
96 #include <netinet/ip.h>
97 #ifdef INET6
98 #include <netinet/ip6.h>
99 #endif
100 
101 #include <altq/altq.h>
102 #include <altq/altq_conf.h>
103 #include <altq/altq_red.h>
104 #ifdef ALTQ_FLOWVALVE
105 #include <altq/altq_flowvalve.h>
106 #endif
107 
108 /*
109  * ALTQ/RED (Random Early Detection) implementation using 32-bit
110  * fixed-point calculation.
111  *
112  * written by kjc using the ns code as a reference.
113  * you can learn more about red and ns from Sally's home page at
114  * http://www-nrg.ee.lbl.gov/floyd/
115  *
116  * most of the red parameter values are fixed in this implementation
117  * to prevent fixed-point overflow/underflow.
118  * if you change the parameters, watch out for overflow/underflow!
119  *
120  * the parameters used are recommended values by Sally.
121  * the corresponding ns config looks:
122  *	q_weight=0.00195
123  *	minthresh=5 maxthresh=15 queue-size=60
124  *	linterm=30
125  *	dropmech=drop-tail
126  *	bytes=false (can't be handled by 32-bit fixed-point)
127  *	doubleq=false dqthresh=false
128  *	wait=true
129  */
130 /*
131  * alternative red parameters for a slow link.
132  *
133  * assume the queue length becomes from zero to L and keeps L, it takes
134  * N packets for q_avg to reach 63% of L.
135  * when q_weight is 0.002, N is about 500 packets.
136  * for a slow link like dial-up, 500 packets takes more than 1 minute!
137  * when q_weight is 0.008, N is about 127 packets.
138  * when q_weight is 0.016, N is about 63 packets.
139  * bursts of 50 packets are allowd for 0.002, bursts of 25 packets
140  * are allowed for 0.016.
141  * see Sally's paper for more details.
142  */
143 /* normal red parameters */
144 #define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
145 				/* q_weight = 0.00195 */
146 
147 /* red parameters for a slow link */
148 #define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
149 				/* q_weight = 0.0078125 */
150 
151 /* red parameters for a very slow link (e.g., dialup) */
152 #define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
153 				/* q_weight = 0.015625 */
154 
155 /* fixed-point uses 12-bit decimal places */
156 #define	FP_SHIFT	12	/* fixed-point shift */
157 
158 /* red parameters for drop probability */
159 #define	INV_P_MAX	10	/* inverse of max drop probability */
160 #define	TH_MIN		5	/* min threshold */
161 #define	TH_MAX		15	/* max threshold */
162 
163 #define	RED_LIMIT	60	/* default max queue lenght */
164 #define	RED_STATS		/* collect statistics */
165 
166 /*
167  * our default policy for forced-drop is drop-tail.
168  * (in altq-1.1.2 or earlier, the default was random-drop.
169  * but it makes more sense to punish the cause of the surge.)
170  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
171  */
172 
173 #ifdef ALTQ_FLOWVALVE
174 /*
175  * flow-valve is an extension to protect red from unresponsive flows
176  * and to promote end-to-end congestion control.
177  * flow-valve observes the average drop rates of the flows that have
178  * experienced packet drops in the recent past.
179  * when the average drop rate exceeds the threshold, the flow is
180  * blocked by the flow-valve.  the trapped flow should back off
181  * exponentially to escape from the flow-valve.
182  */
183 #ifdef RED_RANDOM_DROP
184 #error "random-drop can't be used with flow-valve!"
185 #endif
186 #endif /* ALTQ_FLOWVALVE */
187 
188 /* red_list keeps all red_queue_t's allocated. */
189 static red_queue_t *red_list = NULL;
190 
191 /* default red parameter values */
192 static int default_th_min = TH_MIN;
193 static int default_th_max = TH_MAX;
194 static int default_inv_pmax = INV_P_MAX;
195 
196 /* internal function prototypes */
197 static int red_enqueue __P((struct ifaltq *, struct mbuf *,
198 			    struct altq_pktattr *));
199 static struct mbuf *red_dequeue __P((struct ifaltq *, int));
200 static int red_request __P((struct ifaltq *, int, void *));
201 static void red_purgeq __P((red_queue_t *));
202 static int red_detach __P((red_queue_t *));
203 #ifdef ALTQ_FLOWVALVE
204 static __inline struct fve *flowlist_lookup __P((struct flowvalve *,
205 			 struct altq_pktattr *, struct timeval *));
206 static __inline struct fve *flowlist_reclaim __P((struct flowvalve *,
207 						  struct altq_pktattr *));
208 static __inline void flowlist_move_to_head __P((struct flowvalve *,
209 						struct fve *));
210 static __inline int fv_p2f __P((struct flowvalve *, int));
211 static struct flowvalve *fv_alloc __P((struct red *));
212 static void fv_destroy __P((struct flowvalve *));
213 static int fv_checkflow __P((struct flowvalve *, struct altq_pktattr *,
214 			     struct fve **));
215 static void fv_dropbyred __P((struct flowvalve *fv, struct altq_pktattr *,
216 			      struct fve *));
217 #endif
218 
219 /*
220  * red device interface
221  */
222 altqdev_decl(red);
223 
224 int
225 redopen(dev, flag, fmt, p)
226 	dev_t dev;
227 	int flag, fmt;
228 	struct proc *p;
229 {
230 	/* everything will be done when the queueing scheme is attached. */
231 	return 0;
232 }
233 
234 int
235 redclose(dev, flag, fmt, p)
236 	dev_t dev;
237 	int flag, fmt;
238 	struct proc *p;
239 {
240 	red_queue_t *rqp;
241 	int err, error = 0;
242 
243 	while ((rqp = red_list) != NULL) {
244 		/* destroy all */
245 		err = red_detach(rqp);
246 		if (err != 0 && error == 0)
247 			error = err;
248 	}
249 
250 	return error;
251 }
252 
253 int
254 redioctl(dev, cmd, addr, flag, p)
255 	dev_t dev;
256 	ioctlcmd_t cmd;
257 	caddr_t addr;
258 	int flag;
259 	struct proc *p;
260 {
261 	red_queue_t *rqp;
262 	struct red_interface *ifacep;
263 	struct ifnet *ifp;
264 	int	error = 0;
265 
266 	/* check super-user privilege */
267 	switch (cmd) {
268 	case RED_GETSTATS:
269 		break;
270 	default:
271 #if (__FreeBSD_version > 400000)
272 		if ((error = suser(p)) != 0)
273 #else
274 		if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
275 #endif
276 			return (error);
277 		break;
278 	}
279 
280 	switch (cmd) {
281 
282 	case RED_ENABLE:
283 		ifacep = (struct red_interface *)addr;
284 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
285 			error = EBADF;
286 			break;
287 		}
288 		error = altq_enable(rqp->rq_ifq);
289 		break;
290 
291 	case RED_DISABLE:
292 		ifacep = (struct red_interface *)addr;
293 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
294 			error = EBADF;
295 			break;
296 		}
297 		error = altq_disable(rqp->rq_ifq);
298 		break;
299 
300 	case RED_IF_ATTACH:
301 		ifp = ifunit(((struct red_interface *)addr)->red_ifname);
302 		if (ifp == NULL) {
303 			error = ENXIO;
304 			break;
305 		}
306 
307 		/* allocate and initialize red_queue_t */
308 		MALLOC(rqp, red_queue_t *, sizeof(red_queue_t), M_DEVBUF, M_WAITOK);
309 		if (rqp == NULL) {
310 			error = ENOMEM;
311 			break;
312 		}
313 		bzero(rqp, sizeof(red_queue_t));
314 
315 		MALLOC(rqp->rq_q, class_queue_t *, sizeof(class_queue_t),
316 		       M_DEVBUF, M_WAITOK);
317 		if (rqp->rq_q == NULL) {
318 			FREE(rqp, M_DEVBUF);
319 			error = ENOMEM;
320 			break;
321 		}
322 		bzero(rqp->rq_q, sizeof(class_queue_t));
323 
324 		rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
325 		if (rqp->rq_red == NULL) {
326 			FREE(rqp->rq_q, M_DEVBUF);
327 			FREE(rqp, M_DEVBUF);
328 			error = ENOMEM;
329 			break;
330 		}
331 
332 		rqp->rq_ifq = &ifp->if_snd;
333 		qtail(rqp->rq_q) = NULL;
334 		qlen(rqp->rq_q) = 0;
335 		qlimit(rqp->rq_q) = RED_LIMIT;
336 		qtype(rqp->rq_q) = Q_RED;
337 
338 		/*
339 		 * set RED to this ifnet structure.
340 		 */
341 		error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
342 				    red_enqueue, red_dequeue, red_request,
343 				    NULL, NULL);
344 		if (error) {
345 			red_destroy(rqp->rq_red);
346 			FREE(rqp->rq_q, M_DEVBUF);
347 			FREE(rqp, M_DEVBUF);
348 			break;
349 		}
350 
351 		/* add this state to the red list */
352 		rqp->rq_next = red_list;
353 		red_list = rqp;
354 		break;
355 
356 	case RED_IF_DETACH:
357 		ifacep = (struct red_interface *)addr;
358 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
359 			error = EBADF;
360 			break;
361 		}
362 		error = red_detach(rqp);
363 		break;
364 
365 	case RED_GETSTATS:
366 		do {
367 			struct red_stats *q_stats;
368 			red_t *rp;
369 
370 			q_stats = (struct red_stats *)addr;
371 			if ((rqp = altq_lookup(q_stats->iface.red_ifname,
372 					     ALTQT_RED)) == NULL) {
373 				error = EBADF;
374 				break;
375 			}
376 
377 			q_stats->q_len 	   = qlen(rqp->rq_q);
378 			q_stats->q_limit   = qlimit(rqp->rq_q);
379 
380 			rp = rqp->rq_red;
381 			q_stats->q_avg 	   = rp->red_avg >> rp->red_wshift;
382 			q_stats->xmit_cnt  = rp->red_stats.xmit_cnt;
383 			q_stats->drop_cnt  = rp->red_stats.drop_cnt;
384 			q_stats->drop_forced   = rp->red_stats.drop_forced;
385 			q_stats->drop_unforced = rp->red_stats.drop_unforced;
386 			q_stats->marked_packets = rp->red_stats.marked_packets;
387 
388 			q_stats->weight		= rp->red_weight;
389 			q_stats->inv_pmax	= rp->red_inv_pmax;
390 			q_stats->th_min		= rp->red_thmin;
391 			q_stats->th_max		= rp->red_thmax;
392 
393 #ifdef ALTQ_FLOWVALVE
394 			if (rp->red_flowvalve != NULL) {
395 				struct flowvalve *fv = rp->red_flowvalve;
396 				q_stats->fv_flows    = fv->fv_flows;
397 				q_stats->fv_pass     = fv->fv_stats.pass;
398 				q_stats->fv_predrop  = fv->fv_stats.predrop;
399 				q_stats->fv_alloc    = fv->fv_stats.alloc;
400 				q_stats->fv_escape   = fv->fv_stats.escape;
401 			} else {
402 #endif /* ALTQ_FLOWVALVE */
403 				q_stats->fv_flows    = 0;
404 				q_stats->fv_pass     = 0;
405 				q_stats->fv_predrop  = 0;
406 				q_stats->fv_alloc    = 0;
407 				q_stats->fv_escape   = 0;
408 #ifdef ALTQ_FLOWVALVE
409 			}
410 #endif /* ALTQ_FLOWVALVE */
411 		} while (0);
412 		break;
413 
414 	case RED_CONFIG:
415 		do {
416 			struct red_conf *fc;
417 			red_t *new;
418 			int s, limit;
419 
420 			fc = (struct red_conf *)addr;
421 			if ((rqp = altq_lookup(fc->iface.red_ifname,
422 					       ALTQT_RED)) == NULL) {
423 				error = EBADF;
424 				break;
425 			}
426 			new = red_alloc(fc->red_weight,
427 					fc->red_inv_pmax,
428 					fc->red_thmin,
429 					fc->red_thmax,
430 					fc->red_flags,
431 					fc->red_pkttime);
432 			if (new == NULL) {
433 				error = ENOMEM;
434 				break;
435 			}
436 
437 			s = splnet();
438 			red_purgeq(rqp);
439 			limit = fc->red_limit;
440 			if (limit < fc->red_thmax)
441 				limit = fc->red_thmax;
442 			qlimit(rqp->rq_q) = limit;
443 			fc->red_limit = limit;	/* write back the new value */
444 
445 			red_destroy(rqp->rq_red);
446 			rqp->rq_red = new;
447 
448 			splx(s);
449 
450 			/* write back new values */
451 			fc->red_limit = limit;
452 			fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
453 			fc->red_thmin = rqp->rq_red->red_thmin;
454 			fc->red_thmax = rqp->rq_red->red_thmax;
455 
456 		} while (0);
457 		break;
458 
459 	case RED_SETDEFAULTS:
460 		do {
461 			struct redparams *rp;
462 
463 			rp = (struct redparams *)addr;
464 
465 			default_th_min = rp->th_min;
466 			default_th_max = rp->th_max;
467 			default_inv_pmax = rp->inv_pmax;
468 		} while (0);
469 		break;
470 
471 	default:
472 		error = EINVAL;
473 		break;
474 	}
475 	return error;
476 }
477 
478 static int
479 red_detach(rqp)
480 	red_queue_t *rqp;
481 {
482 	red_queue_t *tmp;
483 	int error = 0;
484 
485 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
486 		altq_disable(rqp->rq_ifq);
487 
488 	if ((error = altq_detach(rqp->rq_ifq)))
489 		return (error);
490 
491 	if (red_list == rqp)
492 		red_list = rqp->rq_next;
493 	else {
494 		for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
495 			if (tmp->rq_next == rqp) {
496 				tmp->rq_next = rqp->rq_next;
497 				break;
498 			}
499 		if (tmp == NULL)
500 			printf("red_detach: no state found in red_list!\n");
501 	}
502 
503 	red_destroy(rqp->rq_red);
504 	FREE(rqp->rq_q, M_DEVBUF);
505 	FREE(rqp, M_DEVBUF);
506 	return (error);
507 }
508 
509 /*
510  * red support routines
511  */
512 
513 red_t *
514 red_alloc(weight, inv_pmax, th_min, th_max, flags, pkttime)
515 	int	weight, inv_pmax, th_min, th_max;
516 	int	flags, pkttime;
517 {
518 	red_t 	*rp;
519 	int	w, i;
520 	int	npkts_per_sec;
521 
522 	MALLOC(rp, red_t *, sizeof(red_t), M_DEVBUF, M_WAITOK);
523 	if (rp == NULL)
524 		return (NULL);
525 	bzero(rp, sizeof(red_t));
526 
527 	rp->red_avg = 0;
528 	rp->red_idle = 1;
529 
530 	if (weight == 0)
531 		rp->red_weight = W_WEIGHT;
532 	else
533 		rp->red_weight = weight;
534 	if (inv_pmax == 0)
535 		rp->red_inv_pmax = default_inv_pmax;
536 	else
537 		rp->red_inv_pmax = inv_pmax;
538 	if (th_min == 0)
539 		rp->red_thmin = default_th_min;
540 	else
541 		rp->red_thmin = th_min;
542 	if (th_max == 0)
543 		rp->red_thmax = default_th_max;
544 	else
545 		rp->red_thmax = th_max;
546 
547 	rp->red_flags = flags;
548 
549 	if (pkttime == 0)
550 		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
551 		rp->red_pkttime = 800;
552 	else
553 		rp->red_pkttime = pkttime;
554 
555 	if (weight == 0) {
556 		/* when the link is very slow, adjust red parameters */
557 		npkts_per_sec = 1000000 / rp->red_pkttime;
558 		if (npkts_per_sec < 50) {
559 			/* up to about 400Kbps */
560 			rp->red_weight = W_WEIGHT_2;
561 		} else if (npkts_per_sec < 300) {
562 			/* up to about 2.4Mbps */
563 			rp->red_weight = W_WEIGHT_1;
564 		}
565 	}
566 
567 	/* calculate wshift.  weight must be power of 2 */
568 	w = rp->red_weight;
569 	for (i = 0; w > 1; i++)
570 		w = w >> 1;
571 	rp->red_wshift = i;
572 	w = 1 << rp->red_wshift;
573 	if (w != rp->red_weight) {
574 		printf("invalid weight value %d for red! use %d\n",
575 		       rp->red_weight, w);
576 		rp->red_weight = w;
577 	}
578 
579 	/*
580 	 * thmin_s and thmax_s are scaled versions of th_min and th_max
581 	 * to be compared with avg.
582 	 */
583 	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
584 	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
585 
586 	/*
587 	 * precompute probability denominator
588 	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
589 	 */
590 	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
591 			 * rp->red_inv_pmax) << FP_SHIFT;
592 
593 	/* allocate weight table */
594 	rp->red_wtab = wtab_alloc(rp->red_weight);
595 
596 	microtime(&rp->red_last);
597 #ifdef ALTQ_FLOWVALVE
598 	if (flags & REDF_FLOWVALVE)
599 		rp->red_flowvalve = fv_alloc(rp);
600 	/* if fv_alloc failes, flowvalve is just disabled */
601 #endif
602 	return (rp);
603 }
604 
605 void
606 red_destroy(rp)
607 	red_t *rp;
608 {
609 #ifdef ALTQ_FLOWVALVE
610 	if (rp->red_flowvalve != NULL)
611 		fv_destroy(rp->red_flowvalve);
612 #endif
613 	wtab_destroy(rp->red_wtab);
614 	FREE(rp, M_DEVBUF);
615 }
616 
617 void
618 red_getstats(rp, sp)
619 	red_t *rp;
620 	struct redstats *sp;
621 {
622 	sp->q_avg 		= rp->red_avg >> rp->red_wshift;
623 	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
624 	sp->drop_cnt		= rp->red_stats.drop_cnt;
625 	sp->drop_forced		= rp->red_stats.drop_forced;
626 	sp->drop_unforced	= rp->red_stats.drop_unforced;
627 	sp->marked_packets	= rp->red_stats.marked_packets;
628 }
629 
630 /*
631  * enqueue routine:
632  *
633  *	returns: 0 when successfully queued.
634  *		 ENOBUFS when drop occurs.
635  */
636 static int
637 red_enqueue(ifq, m, pktattr)
638 	struct ifaltq *ifq;
639 	struct mbuf *m;
640 	struct altq_pktattr *pktattr;
641 {
642 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
643 
644 	if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
645 		return ENOBUFS;
646 	ifq->ifq_len++;
647 	return 0;
648 }
649 
650 int
651 red_addq(rp, q, m, pktattr)
652 	red_t *rp;
653 	class_queue_t *q;
654 	struct mbuf *m;
655 	struct altq_pktattr *pktattr;
656 {
657 	int avg, droptype;
658 	int n;
659 #ifdef ALTQ_FLOWVALVE
660 	struct fve *fve = NULL;
661 
662 	if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
663 		if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
664 			m_freem(m);
665 			return (-1);
666 		}
667 #endif
668 
669 	avg = rp->red_avg;
670 
671 	/*
672 	 * if we were idle, we pretend that n packets arrived during
673 	 * the idle period.
674 	 */
675 	if (rp->red_idle) {
676 		struct timeval now;
677 		int t;
678 
679 		rp->red_idle = 0;
680 		microtime(&now);
681 		t = (now.tv_sec - rp->red_last.tv_sec);
682 		if (t > 60) {
683 			/*
684 			 * being idle for more than 1 minute, set avg to zero.
685 			 * this prevents t from overflow.
686 			 */
687 			avg = 0;
688 		} else {
689 			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
690 			n = t / rp->red_pkttime - 1;
691 
692 			/* the following line does (avg = (1 - Wq)^n * avg) */
693 			if (n > 0)
694 				avg = (avg >> FP_SHIFT) *
695 				    pow_w(rp->red_wtab, n);
696 		}
697 	}
698 
699 	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
700 	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
701 	rp->red_avg = avg;		/* save the new value */
702 
703 	/*
704 	 * red_count keeps a tally of arriving traffic that has not
705 	 * been dropped.
706 	 */
707 	rp->red_count++;
708 
709 	/* see if we drop early */
710 	droptype = DTYPE_NODROP;
711 	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
712 		if (avg >= rp->red_thmax_s) {
713 			/* avg >= th_max: forced drop */
714 			droptype = DTYPE_FORCED;
715 		} else if (rp->red_old == 0) {
716 			/* first exceeds th_min */
717 			rp->red_count = 1;
718 			rp->red_old = 1;
719 		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
720 				      rp->red_probd, rp->red_count)) {
721 			/* mark or drop by red */
722 			if ((rp->red_flags & REDF_ECN) &&
723 			    mark_ecn(m, pktattr, rp->red_flags)) {
724 				/* successfully marked.  do not drop. */
725 				rp->red_count = 0;
726 #ifdef RED_STATS
727 				rp->red_stats.marked_packets++;
728 #endif
729 			} else {
730 				/* unforced drop by red */
731 				droptype = DTYPE_EARLY;
732 			}
733 		}
734 	} else {
735 		/* avg < th_min */
736 		rp->red_old = 0;
737 	}
738 
739 	/*
740 	 * if the queue length hits the hard limit, it's a forced drop.
741 	 */
742 	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
743 		droptype = DTYPE_FORCED;
744 
745 #ifdef RED_RANDOM_DROP
746 	/* if successful or forced drop, enqueue this packet. */
747 	if (droptype != DTYPE_EARLY)
748 		_addq(q, m);
749 #else
750 	/* if successful, enqueue this packet. */
751 	if (droptype == DTYPE_NODROP)
752 		_addq(q, m);
753 #endif
754 	if (droptype != DTYPE_NODROP) {
755 		if (droptype == DTYPE_EARLY) {
756 			/* drop the incoming packet */
757 #ifdef RED_STATS
758 			rp->red_stats.drop_unforced++;
759 #endif
760 		} else {
761 			/* forced drop, select a victim packet in the queue. */
762 #ifdef RED_RANDOM_DROP
763 			m = _getq_random(q);
764 #endif
765 #ifdef RED_STATS
766 			rp->red_stats.drop_forced++;
767 #endif
768 		}
769 #ifdef RED_STATS
770 		PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
771 #endif
772 		rp->red_count = 0;
773 #ifdef ALTQ_FLOWVALVE
774 		if (rp->red_flowvalve != NULL)
775 			fv_dropbyred(rp->red_flowvalve, pktattr, fve);
776 #endif
777 		m_freem(m);
778 		return (-1);
779 	}
780 	/* successfully queued */
781 #ifdef RED_STATS
782 	PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
783 #endif
784 	return (0);
785 }
786 
787 /*
788  * early-drop probability is calculated as follows:
789  *   prob = p_max * (avg - th_min) / (th_max - th_min)
790  *   prob_a = prob / (2 - count*prob)
791  *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
792  * here prob_a increases as successive undrop count increases.
793  * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
794  * becomes 1 when (count >= (2 / prob))).
795  */
796 int
797 drop_early(fp_len, fp_probd, count)
798 	int fp_len;	/* (avg - TH_MIN) in fixed-point */
799 	int fp_probd;	/* (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point */
800 	int count;	/* how many successive undropped packets */
801 {
802 	int d;		/* denominator of drop-probability */
803 
804 	d = fp_probd - count * fp_len;
805 	if (d <= 0)
806 		/* count exceeds the hard limit: drop or mark */
807 		return (1);
808 
809 	/*
810 	 * now the range of d is [1..600] in fixed-point. (when
811 	 * th_max-th_min=10 and p_max=1/30)
812 	 * drop probability = (avg - TH_MIN) / d
813 	 */
814 
815 	if ((random() % d) < fp_len) {
816 		/* drop or mark */
817 		return (1);
818 	}
819 	/* no drop/mark */
820 	return (0);
821 }
822 
823 /*
824  * try to mark CE bit to the packet.
825  *    returns 1 if successfully marked, 0 otherwise.
826  */
827 int
828 mark_ecn(m, pktattr, flags)
829 	struct mbuf *m;
830 	struct altq_pktattr *pktattr;
831 	int flags;
832 {
833 	struct mbuf *m0;
834 
835 	if (pktattr == NULL ||
836 	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
837 		return (0);
838 
839 	/* verify that pattr_hdr is within the mbuf data */
840 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
841 		if ((pktattr->pattr_hdr >= m0->m_data) &&
842 		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
843 			break;
844 	if (m0 == NULL) {
845 		/* ick, pattr_hdr is stale */
846 		pktattr->pattr_af = AF_UNSPEC;
847 		return (0);
848 	}
849 
850 	switch (pktattr->pattr_af) {
851 	case AF_INET:
852 		if (flags & REDF_ECN4) {
853 			struct ip *ip = (struct ip *)pktattr->pattr_hdr;
854 
855 			if (ip->ip_v != 4)
856 				return (0);	/* version mismatch! */
857 			if (ip->ip_tos & IPTOS_ECT) {
858 				/* ECN-capable, mark ECN bit. */
859 				if ((ip->ip_tos & IPTOS_CE) == 0) {
860 #if (IPTOS_CE == 0x01)
861 					u_short sum;
862 
863 					ip->ip_tos |= IPTOS_CE;
864 					/*
865 					 * optimized version when IPTOS_CE
866 					 * is 0x01.
867 					 *   HC' = HC -1   when HC > 0
868 					 *       = 0xfffe  when HC = 0
869 					 */
870 					sum = ntohs(ip->ip_sum);
871 					if (sum == 0)
872 						sum = 0xfffe;
873 					else
874 						sum -= 1;
875 					ip->ip_sum = htons(sum);
876 #else /* IPTOS_CE != 0x01 */
877 					long sum;
878 
879 					ip->ip_tos |= IPTOS_CE;
880 					/*
881 					 * update checksum (from RFC1624)
882 					 *	   HC' = ~(~HC + ~m + m')
883 					 */
884 					sum = ~ntohs(ip->ip_sum) & 0xffff;
885 					sum += 0xffff + IPTOS_CE;
886 					sum = (sum >> 16) + (sum & 0xffff);
887 					sum += (sum >> 16);  /* add carry */
888 
889 					ip->ip_sum = htons(~sum & 0xffff);
890 #endif /* IPTOS_CE != 0x01 */
891 				}
892 				return (1);
893 			}
894 		}
895 		break;
896 #ifdef INET6
897 	case AF_INET6:
898 		if (flags & REDF_ECN6) {
899 			struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
900 			u_int32_t flowlabel;
901 
902 			flowlabel = ntohl(ip6->ip6_flow);
903 			if ((flowlabel >> 28) != 6)
904 				return (0);	/* version mismatch! */
905 			if (flowlabel & (IPTOS_ECT << 20)) {
906 				/* ECN-capable, mark ECN bit. */
907 				flowlabel |= (IPTOS_CE << 20);
908 				ip6->ip6_flow = htonl(flowlabel);
909 				return (1);
910 			}
911 		}
912 		break;
913 #endif  /* INET6 */
914 	}
915 
916 	/* not marked */
917 	return (0);
918 }
919 
920 /*
921  * dequeue routine:
922  *	must be called in splnet.
923  *
924  *	returns: mbuf dequeued.
925  *		 NULL when no packet is available in the queue.
926  */
927 
928 static struct mbuf *
929 red_dequeue(ifq, op)
930 	struct ifaltq *ifq;
931 	int op;
932 {
933 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
934 	struct mbuf *m;
935 
936 	if (op == ALTDQ_POLL)
937 		return qhead(rqp->rq_q);
938 
939 	/* op == ALTDQ_REMOVE */
940 	m =  red_getq(rqp->rq_red, rqp->rq_q);
941 	if (m != NULL)
942 		ifq->ifq_len--;
943 	return (m);
944 }
945 
946 struct mbuf *
947 red_getq(rp, q)
948 	red_t *rp;
949 	class_queue_t *q;
950 {
951 	struct mbuf *m;
952 
953 	if ((m = _getq(q)) == NULL) {
954 		if (rp->red_idle == 0) {
955 			rp->red_idle = 1;
956 			microtime(&rp->red_last);
957 		}
958 		return NULL;
959 	}
960 
961 	rp->red_idle = 0;
962 	return (m);
963 }
964 
965 static int
966 red_request(ifq, req, arg)
967 	struct ifaltq *ifq;
968 	int req;
969 	void *arg;
970 {
971 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
972 
973 	switch (req) {
974 	case ALTRQ_PURGE:
975 		red_purgeq(rqp);
976 		break;
977 	}
978 	return (0);
979 }
980 
981 static void
982 red_purgeq(rqp)
983 	red_queue_t *rqp;
984 {
985 	_flushq(rqp->rq_q);
986 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
987 		rqp->rq_ifq->ifq_len = 0;
988 }
989 
990 
991 /*
992  * helper routine to calibrate avg during idle.
993  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
994  * here Wq = 1/weight and the code assumes Wq is close to zero.
995  *
996  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
997  */
998 static struct wtab *wtab_list = NULL;	/* pointer to wtab list */
999 
1000 struct wtab *
1001 wtab_alloc(weight)
1002 	int weight;
1003 {
1004 	struct wtab *w;
1005 	int i;
1006 
1007 	for (w = wtab_list; w != NULL; w = w->w_next)
1008 		if (w->w_weight == weight) {
1009 			w->w_refcount++;
1010 			return (w);
1011 		}
1012 
1013 	MALLOC(w, struct wtab *, sizeof(struct wtab), M_DEVBUF, M_WAITOK);
1014 	if (w == NULL)
1015 		panic("wtab_alloc: malloc failed!");
1016 	bzero(w, sizeof(struct wtab));
1017 	w->w_weight = weight;
1018 	w->w_refcount = 1;
1019 	w->w_next = wtab_list;
1020 	wtab_list = w;
1021 
1022 	/* initialize the weight table */
1023 	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
1024 	for (i = 1; i < 32; i++) {
1025 		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
1026 		if (w->w_tab[i] == 0 && w->w_param_max == 0)
1027 			w->w_param_max = 1 << i;
1028 	}
1029 
1030 	return (w);
1031 }
1032 
1033 int
1034 wtab_destroy(w)
1035 	struct wtab *w;
1036 {
1037 	struct wtab *prev;
1038 
1039 	if (--w->w_refcount > 0)
1040 		return (0);
1041 
1042 	if (wtab_list == w)
1043 		wtab_list = w->w_next;
1044 	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
1045 		if (prev->w_next == w) {
1046 			prev->w_next = w->w_next;
1047 			break;
1048 		}
1049 
1050 	FREE(w, M_DEVBUF);
1051 	return (0);
1052 }
1053 
1054 int32_t
1055 pow_w(w, n)
1056 	struct wtab *w;
1057 	int n;
1058 {
1059 	int	i, bit;
1060 	int32_t	val;
1061 
1062 	if (n >= w->w_param_max)
1063 		return (0);
1064 
1065 	val = 1 << FP_SHIFT;
1066 	if (n <= 0)
1067 		return (val);
1068 
1069 	bit = 1;
1070 	i = 0;
1071 	while (n) {
1072 		if (n & bit) {
1073 			val = (val * w->w_tab[i]) >> FP_SHIFT;
1074 			n &= ~bit;
1075 		}
1076 		i++;
1077 		bit <<=  1;
1078 	}
1079 	return (val);
1080 }
1081 
1082 #ifdef ALTQ_FLOWVALVE
1083 
1084 #define	FV_PSHIFT	7	/* weight of average drop rate -- 1/128 */
1085 #define	FV_PSCALE(x)	((x) << FV_PSHIFT)
1086 #define	FV_PUNSCALE(x)	((x) >> FV_PSHIFT)
1087 #define	FV_FSHIFT	5	/* weight of average fraction -- 1/32 */
1088 #define	FV_FSCALE(x)	((x) << FV_FSHIFT)
1089 #define	FV_FUNSCALE(x)	((x) >> FV_FSHIFT)
1090 
1091 #define	FV_TIMER	(3 * hz)	/* timer value for garbage collector */
1092 #define	FV_FLOWLISTSIZE		64	/* how many flows in flowlist */
1093 
1094 #define	FV_N			10	/* update fve_f every FV_N packets */
1095 
1096 #define	FV_BACKOFFTHRESH	1  /* backoff threshold interval in second */
1097 #define	FV_TTHRESH		3  /* time threshold to delete fve */
1098 #define	FV_ALPHA		5  /* extra packet count */
1099 
1100 #define	FV_STATS
1101 
1102 #if (__FreeBSD_version > 300000)
1103 #define	FV_TIMESTAMP(tp)	getmicrotime(tp)
1104 #else
1105 #define	FV_TIMESTAMP(tp)	{ (*(tp)) = time; }
1106 #endif
1107 
1108 /*
1109  * Brtt table: 127 entry table to convert drop rate (p) to
1110  * the corresponding bandwidth fraction (f)
1111  * the following equation is implemented to use scaled values,
1112  * fve_p and fve_f, in the fixed point format.
1113  *
1114  *   Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
1115  *   f = Brtt(p) / (max_th + alpha)
1116  */
1117 #define	BRTT_SIZE	128
1118 #define	BRTT_SHIFT	12
1119 #define	BRTT_MASK	0x0007f000
1120 #define	BRTT_PMAX	(1 << (FV_PSHIFT + FP_SHIFT))
1121 
1122 const int brtt_tab[BRTT_SIZE] = {
1123 	0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
1124 	392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
1125 	225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
1126 	145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
1127 	98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
1128 	67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
1129 	47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
1130 	33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
1131 	24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
1132 	18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
1133 	14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
1134 	10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
1135 	8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
1136 	6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
1137 	5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
1138 	4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
1139 };
1140 
1141 static __inline struct fve *
1142 flowlist_lookup(fv, pktattr, now)
1143 	struct flowvalve *fv;
1144 	struct altq_pktattr *pktattr;
1145 	struct timeval *now;
1146 {
1147 	struct fve *fve;
1148 	int flows;
1149 	struct ip *ip;
1150 #ifdef INET6
1151 	struct ip6_hdr *ip6;
1152 #endif
1153 	struct timeval tthresh;
1154 
1155 	if (pktattr == NULL)
1156 		return (NULL);
1157 
1158 	tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
1159 	flows = 0;
1160 	/*
1161 	 * search the flow list
1162 	 */
1163 	switch (pktattr->pattr_af) {
1164 	case AF_INET:
1165 		ip = (struct ip *)pktattr->pattr_hdr;
1166 		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
1167 			if (fve->fve_lastdrop.tv_sec == 0)
1168 				break;
1169 			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
1170 				fve->fve_lastdrop.tv_sec = 0;
1171 				break;
1172 			}
1173 			if (fve->fve_flow.flow_af == AF_INET &&
1174 			    fve->fve_flow.flow_ip.ip_src.s_addr ==
1175 			    ip->ip_src.s_addr &&
1176 			    fve->fve_flow.flow_ip.ip_dst.s_addr ==
1177 			    ip->ip_dst.s_addr)
1178 				return (fve);
1179 			flows++;
1180 		}
1181 		break;
1182 #ifdef INET6
1183 	case AF_INET6:
1184 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
1185 		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
1186 			if (fve->fve_lastdrop.tv_sec == 0)
1187 				break;
1188 			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
1189 				fve->fve_lastdrop.tv_sec = 0;
1190 				break;
1191 			}
1192 			if (fve->fve_flow.flow_af == AF_INET6 &&
1193 			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
1194 					       &ip6->ip6_src) &&
1195 			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
1196 					       &ip6->ip6_dst))
1197 				return (fve);
1198 			flows++;
1199 		}
1200 		break;
1201 #endif /* INET6 */
1202 
1203 	default:
1204 		/* unknown protocol.  no drop. */
1205 		return (NULL);
1206 	}
1207 	fv->fv_flows = flows;	/* save the number of active fve's */
1208 	return (NULL);
1209 }
1210 
1211 static __inline struct fve *
1212 flowlist_reclaim(fv, pktattr)
1213 	struct flowvalve *fv;
1214 	struct altq_pktattr *pktattr;
1215 {
1216 	struct fve *fve;
1217 	struct ip *ip;
1218 #ifdef INET6
1219 	struct ip6_hdr *ip6;
1220 #endif
1221 
1222 	/*
1223 	 * get an entry from the tail of the LRU list.
1224 	 */
1225 	fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
1226 
1227 	switch (pktattr->pattr_af) {
1228 	case AF_INET:
1229 		ip = (struct ip *)pktattr->pattr_hdr;
1230 		fve->fve_flow.flow_af = AF_INET;
1231 		fve->fve_flow.flow_ip.ip_src = ip->ip_src;
1232 		fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
1233 		break;
1234 #ifdef INET6
1235 	case AF_INET6:
1236 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
1237 		fve->fve_flow.flow_af = AF_INET6;
1238 		fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
1239 		fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
1240 		break;
1241 #endif
1242 	}
1243 
1244 	fve->fve_state = Green;
1245 	fve->fve_p = 0.0;
1246 	fve->fve_f = 0.0;
1247 	fve->fve_ifseq = fv->fv_ifseq - 1;
1248 	fve->fve_count = 0;
1249 
1250 	fv->fv_flows++;
1251 #ifdef FV_STATS
1252 	fv->fv_stats.alloc++;
1253 #endif
1254 	return (fve);
1255 }
1256 
1257 static __inline void
1258 flowlist_move_to_head(fv, fve)
1259 	struct flowvalve *fv;
1260 	struct fve *fve;
1261 {
1262 	if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
1263 		TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
1264 		TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
1265 	}
1266 }
1267 
1268 /*
1269  * allocate flowvalve structure
1270  */
1271 static struct flowvalve *
1272 fv_alloc(rp)
1273 	struct red *rp;
1274 {
1275 	struct flowvalve *fv;
1276 	struct fve *fve;
1277 	int i, num;
1278 
1279 	num = FV_FLOWLISTSIZE;
1280 	MALLOC(fv, struct flowvalve *, sizeof(struct flowvalve),
1281 	       M_DEVBUF, M_WAITOK);
1282 	if (fv == NULL)
1283 		return (NULL);
1284 	bzero(fv, sizeof(struct flowvalve));
1285 
1286 	MALLOC(fv->fv_fves, struct fve *, sizeof(struct fve) * num,
1287 	       M_DEVBUF, M_WAITOK);
1288 	if (fv->fv_fves == NULL) {
1289 		FREE(fv, M_DEVBUF);
1290 		return (NULL);
1291 	}
1292 	bzero(fv->fv_fves, sizeof(struct fve) * num);
1293 
1294 	fv->fv_flows = 0;
1295 	TAILQ_INIT(&fv->fv_flowlist);
1296 	for (i = 0; i < num; i++) {
1297 		fve = &fv->fv_fves[i];
1298 		fve->fve_lastdrop.tv_sec = 0;
1299 		TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
1300 	}
1301 
1302 	/* initialize drop rate threshold in scaled fixed-point */
1303 	fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
1304 
1305 	/* initialize drop rate to fraction table */
1306 	MALLOC(fv->fv_p2ftab, int *, sizeof(int) * BRTT_SIZE,
1307 	       M_DEVBUF, M_WAITOK);
1308 	if (fv->fv_p2ftab == NULL) {
1309 		FREE(fv->fv_fves, M_DEVBUF);
1310 		FREE(fv, M_DEVBUF);
1311 		return (NULL);
1312 	}
1313 	/*
1314 	 * create the p2f table.
1315 	 * (shift is used to keep the precision)
1316 	 */
1317 	for (i = 1; i < BRTT_SIZE; i++) {
1318 		int f;
1319 
1320 		f = brtt_tab[i] << 8;
1321 		fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
1322 	}
1323 
1324 	return (fv);
1325 }
1326 
1327 static void fv_destroy(fv)
1328 	struct flowvalve *fv;
1329 {
1330 	FREE(fv->fv_p2ftab, M_DEVBUF);
1331 	FREE(fv->fv_fves, M_DEVBUF);
1332 	FREE(fv, M_DEVBUF);
1333 }
1334 
1335 static __inline int
1336 fv_p2f(fv, p)
1337 	struct flowvalve	*fv;
1338 	int	p;
1339 {
1340 	int val, f;
1341 
1342 	if (p >= BRTT_PMAX)
1343 		f = fv->fv_p2ftab[BRTT_SIZE-1];
1344 	else if ((val = (p & BRTT_MASK)))
1345 		f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
1346 	else
1347 		f = fv->fv_p2ftab[1];
1348 	return (f);
1349 }
1350 
1351 /*
1352  * check if an arriving packet should be pre-dropped.
1353  * called from red_addq() when a packet arrives.
1354  * returns 1 when the packet should be pre-dropped.
1355  * should be called in splnet.
1356  */
1357 static int
1358 fv_checkflow(fv, pktattr, fcache)
1359 	struct flowvalve *fv;
1360 	struct altq_pktattr *pktattr;
1361 	struct fve **fcache;
1362 {
1363 	struct fve *fve;
1364 	struct timeval now;
1365 
1366 	fv->fv_ifseq++;
1367 	FV_TIMESTAMP(&now);
1368 
1369 	if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
1370 		/* no matching entry in the flowlist */
1371 		return (0);
1372 
1373 	*fcache = fve;
1374 
1375 	/* update fraction f for every FV_N packets */
1376 	if (++fve->fve_count == FV_N) {
1377 		/*
1378 		 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
1379 		 */
1380 		fve->fve_f =
1381 			(FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
1382 			+ fve->fve_f - FV_FUNSCALE(fve->fve_f);
1383 		fve->fve_ifseq = fv->fv_ifseq;
1384 		fve->fve_count = 0;
1385 	}
1386 
1387 	/*
1388 	 * overpumping test
1389 	 */
1390 	if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
1391 		int fthresh;
1392 
1393 		/* calculate a threshold */
1394 		fthresh = fv_p2f(fv, fve->fve_p);
1395 		if (fve->fve_f > fthresh)
1396 			fve->fve_state = Red;
1397 	}
1398 
1399 	if (fve->fve_state == Red) {
1400 		/*
1401 		 * backoff test
1402 		 */
1403 		if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
1404 			/* no drop for at least FV_BACKOFFTHRESH sec */
1405 			fve->fve_p = 0;
1406 			fve->fve_state = Green;
1407 #ifdef FV_STATS
1408 			fv->fv_stats.escape++;
1409 #endif
1410 		} else {
1411 			/* block this flow */
1412 			flowlist_move_to_head(fv, fve);
1413 			fve->fve_lastdrop = now;
1414 #ifdef FV_STATS
1415 			fv->fv_stats.predrop++;
1416 #endif
1417 			return (1);
1418 		}
1419 	}
1420 
1421 	/*
1422 	 * p = (1 - Wp) * p
1423 	 */
1424 	fve->fve_p -= FV_PUNSCALE(fve->fve_p);
1425 	if (fve->fve_p < 0)
1426 		fve->fve_p = 0;
1427 #ifdef FV_STATS
1428 	fv->fv_stats.pass++;
1429 #endif
1430 	return (0);
1431 }
1432 
1433 /*
1434  * called from red_addq when a packet is dropped by red.
1435  * should be called in splnet.
1436  */
1437 static void fv_dropbyred(fv, pktattr, fcache)
1438 	struct flowvalve *fv;
1439 	struct altq_pktattr *pktattr;
1440 	struct fve *fcache;
1441 {
1442 	struct fve *fve;
1443 	struct timeval now;
1444 
1445 	if (pktattr == NULL)
1446 		return;
1447 	FV_TIMESTAMP(&now);
1448 
1449 	if (fcache != NULL)
1450 		/* the fve of this packet is already cached */
1451 		fve = fcache;
1452 	else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
1453 		fve = flowlist_reclaim(fv, pktattr);
1454 
1455 	flowlist_move_to_head(fv, fve);
1456 
1457 	/*
1458 	 * update p:  the following line cancels the update
1459 	 *	      in fv_checkflow() and calculate
1460 	 *	p = Wp + (1 - Wp) * p
1461 	 */
1462 	fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
1463 
1464 	fve->fve_lastdrop = now;
1465 }
1466 
1467 #endif /* ALTQ_FLOWVALVE */
1468 
1469 #ifdef KLD_MODULE
1470 
1471 static struct altqsw red_sw =
1472 	{"red", redopen, redclose, redioctl};
1473 
1474 ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
1475 
1476 #endif /* KLD_MODULE */
1477 
1478 #endif /* ALTQ_RED */
1479