xref: /netbsd-src/sys/altq/altq_red.c (revision 0633b543c2bc91e93b550ec10f8042775c1b1b4c)
1 /*	$NetBSD: altq_red.c,v 1.36 2025/01/08 13:00:04 joe Exp $	*/
2 /*	$KAME: altq_red.c,v 1.20 2005/04/13 03:44:25 suz Exp $	*/
3 
4 /*
5  * Copyright (C) 1997-2003
6  *	Sony Computer Science Laboratories Inc.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  */
30 /*
31  * Copyright (c) 1990-1994 Regents of the University of California.
32  * All rights reserved.
33  *
34  * Redistribution and use in source and binary forms, with or without
35  * modification, are permitted provided that the following conditions
36  * are met:
37  * 1. Redistributions of source code must retain the above copyright
38  *    notice, this list of conditions and the following disclaimer.
39  * 2. Redistributions in binary form must reproduce the above copyright
40  *    notice, this list of conditions and the following disclaimer in the
41  *    documentation and/or other materials provided with the distribution.
42  * 3. All advertising materials mentioning features or use of this software
43  *    must display the following acknowledgement:
44  *	This product includes software developed by the Computer Systems
45  *	Engineering Group at Lawrence Berkeley Laboratory.
46  * 4. Neither the name of the University nor of the Laboratory may be used
47  *    to endorse or promote products derived from this software without
48  *    specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  */
62 
63 #include <sys/cdefs.h>
64 __KERNEL_RCSID(0, "$NetBSD: altq_red.c,v 1.36 2025/01/08 13:00:04 joe Exp $");
65 
66 #ifdef _KERNEL_OPT
67 #include "opt_altq.h"
68 #include "opt_inet.h"
69 #include "pf.h"
70 #endif
71 
72 #ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
73 
74 #include <sys/param.h>
75 #include <sys/malloc.h>
76 #include <sys/mbuf.h>
77 #include <sys/socket.h>
78 #include <sys/systm.h>
79 #include <sys/errno.h>
80 #include <sys/kauth.h>
81 #if 1 /* ALTQ3_COMPAT */
82 #include <sys/sockio.h>
83 #include <sys/proc.h>
84 #include <sys/kernel.h>
85 #ifdef ALTQ_FLOWVALVE
86 #include <sys/queue.h>
87 #include <sys/time.h>
88 #endif
89 #endif /* ALTQ3_COMPAT */
90 #include <sys/cprng.h>
91 
92 #include <net/if.h>
93 
94 #include <netinet/in.h>
95 #include <netinet/in_systm.h>
96 #include <netinet/ip.h>
97 #ifdef INET6
98 #include <netinet/ip6.h>
99 #endif
100 
101 #if NPF > 0
102 #include <net/pfvar.h>
103 #endif
104 #include <altq/altq.h>
105 #include <altq/altq_red.h>
106 #ifdef ALTQ3_COMPAT
107 #include <altq/altq_conf.h>
108 #ifdef ALTQ_FLOWVALVE
109 #include <altq/altq_flowvalve.h>
110 #endif
111 #endif
112 
113 /*
114  * ALTQ/RED (Random Early Detection) implementation using 32-bit
115  * fixed-point calculation.
116  *
117  * written by kjc using the ns code as a reference.
118  * you can learn more about red and ns from Sally's home page at
119  * http://www-nrg.ee.lbl.gov/floyd/
120  *
121  * most of the red parameter values are fixed in this implementation
122  * to prevent fixed-point overflow/underflow.
123  * if you change the parameters, watch out for overflow/underflow!
124  *
125  * the parameters used are recommended values by Sally.
126  * the corresponding ns config looks:
127  *	q_weight=0.00195
128  *	minthresh=5 maxthresh=15 queue-size=60
129  *	linterm=30
130  *	dropmech=drop-tail
131  *	bytes=false (can't be handled by 32-bit fixed-point)
132  *	doubleq=false dqthresh=false
133  *	wait=true
134  */
135 /*
136  * alternative red parameters for a slow link.
137  *
138  * assume the queue length becomes from zero to L and keeps L, it takes
139  * N packets for q_avg to reach 63% of L.
140  * when q_weight is 0.002, N is about 500 packets.
141  * for a slow link like dial-up, 500 packets takes more than 1 minute!
142  * when q_weight is 0.008, N is about 127 packets.
143  * when q_weight is 0.016, N is about 63 packets.
144  * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
145  * are allowed for 0.016.
146  * see Sally's paper for more details.
147  */
148 /* normal red parameters */
149 #define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
150 				/* q_weight = 0.00195 */
151 
152 /* red parameters for a slow link */
153 #define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
154 				/* q_weight = 0.0078125 */
155 
156 /* red parameters for a very slow link (e.g., dialup) */
157 #define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
158 				/* q_weight = 0.015625 */
159 
160 /* fixed-point uses 12-bit decimal places */
161 #define	FP_SHIFT	12	/* fixed-point shift */
162 
163 /* red parameters for drop probability */
164 #define	INV_P_MAX	10	/* inverse of max drop probability */
165 #define	TH_MIN		5	/* min threshold */
166 #define	TH_MAX		15	/* max threshold */
167 
168 #define	RED_LIMIT	60	/* default max queue length */
169 #define	RED_STATS		/* collect statistics */
170 
171 /*
172  * our default policy for forced-drop is drop-tail.
173  * (in altq-1.1.2 or earlier, the default was random-drop.
174  * but it makes more sense to punish the cause of the surge.)
175  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
176  */
177 
178 #ifdef ALTQ3_COMPAT
179 #ifdef ALTQ_FLOWVALVE
180 /*
181  * flow-valve is an extension to protect red from unresponsive flows
182  * and to promote end-to-end congestion control.
183  * flow-valve observes the average drop rates of the flows that have
184  * experienced packet drops in the recent past.
185  * when the average drop rate exceeds the threshold, the flow is
186  * blocked by the flow-valve.  the trapped flow should back off
187  * exponentially to escape from the flow-valve.
188  */
189 #ifdef RED_RANDOM_DROP
190 #error "random-drop can't be used with flow-valve!"
191 #endif
192 #endif /* ALTQ_FLOWVALVE */
193 
194 /* red_list keeps all red_queue_t's allocated. */
195 static red_queue_t *red_list = NULL;
196 
197 #endif /* ALTQ3_COMPAT */
198 
199 /* default red parameter values */
200 static int default_th_min = TH_MIN;
201 static int default_th_max = TH_MAX;
202 static int default_inv_pmax = INV_P_MAX;
203 
204 #ifdef ALTQ3_COMPAT
205 /* internal function prototypes */
206 static int red_enqueue(struct ifaltq *, struct mbuf *);
207 static struct mbuf *red_dequeue(struct ifaltq *, int);
208 static int red_request(struct ifaltq *, int, void *);
209 static void red_purgeq(red_queue_t *);
210 static int red_detach(red_queue_t *);
211 #ifdef ALTQ_FLOWVALVE
212 static inline struct fve *flowlist_lookup(struct flowvalve *,
213 			 struct altq_pktattr *, struct timeval *);
214 static inline struct fve *flowlist_reclaim(struct flowvalve *,
215 					     struct altq_pktattr *);
216 static inline void flowlist_move_to_head(struct flowvalve *, struct fve *);
217 static inline int fv_p2f(struct flowvalve *, int);
218 static struct flowvalve *fv_alloc(struct red *);
219 static void fv_destroy(struct flowvalve *);
220 static int fv_checkflow(struct flowvalve *, struct altq_pktattr *,
221 			struct fve **);
222 static void fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *,
223 			 struct fve *);
224 #endif
225 #endif /* ALTQ3_COMPAT */
226 
227 /*
228  * red support routines
229  */
/*
 * Allocate and initialize a red state structure.
 *
 * weight:   inverse of the EWMA weight; must be a power of 2 (0 = default)
 * inv_pmax: inverse of the maximum drop probability (0 = default)
 * th_min:   minimum average-queue threshold, in packets (0 = default)
 * th_max:   maximum average-queue threshold, in packets (0 = default)
 * flags:    REDF_* flags (ECN marking, flow-valve, ...)
 * pkttime:  average packet transmission time in usec (0 = default)
 *
 * Returns the new red_t.  May sleep (M_WAITOK allocation).
 */
red_t *
red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
   int pkttime)
{
	red_t	*rp;
	int	 w, i;
	int	 npkts_per_sec;

	/* M_WAITOK should not fail; the NULL check is kept for safety */
	rp = malloc(sizeof(red_t), M_DEVBUF, M_WAITOK|M_ZERO);
	if (rp == NULL)
		return NULL;

	/* start in the idle state so the first arrival ages avg correctly */
	rp->red_avg = 0;
	rp->red_idle = 1;

	if (weight == 0)
		rp->red_weight = W_WEIGHT;
	else
		rp->red_weight = weight;
	if (inv_pmax == 0)
		rp->red_inv_pmax = default_inv_pmax;
	else
		rp->red_inv_pmax = inv_pmax;
	if (th_min == 0)
		rp->red_thmin = default_th_min;
	else
		rp->red_thmin = th_min;
	if (th_max == 0)
		rp->red_thmax = default_th_max;
	else
		rp->red_thmax = th_max;

	rp->red_flags = flags;

	if (pkttime == 0)
		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
		rp->red_pkttime = 800;
	else
		rp->red_pkttime = pkttime;

	if (weight == 0) {
		/* when the link is very slow, adjust red parameters */
		npkts_per_sec = 1000000 / rp->red_pkttime;
		if (npkts_per_sec < 50) {
			/* up to about 400Kbps */
			rp->red_weight = W_WEIGHT_2;
		} else if (npkts_per_sec < 300) {
			/* up to about 2.4Mbps */
			rp->red_weight = W_WEIGHT_1;
		}
	}

	/* calculate wshift.  weight must be power of 2 */
	w = rp->red_weight;
	for (i = 0; w > 1; i++)
		w = w >> 1;
	rp->red_wshift = i;
	w = 1 << rp->red_wshift;
	if (w != rp->red_weight) {
		/* weight was not a power of 2: round down and complain */
		printf("invalid weight value %d for red! use %d\n",
		       rp->red_weight, w);
		rp->red_weight = w;
	}

	/*
	 * thmin_s and thmax_s are scaled versions of th_min and th_max
	 * to be compared with avg.
	 */
	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);

	/*
	 * precompute probability denominator
	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
	 */
	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
			 * rp->red_inv_pmax) << FP_SHIFT;

	/* allocate weight table */
	rp->red_wtab = wtab_alloc(rp->red_weight);

	microtime(&rp->red_last);
#ifdef ALTQ3_COMPAT
#ifdef ALTQ_FLOWVALVE
	if (flags & REDF_FLOWVALVE)
		rp->red_flowvalve = fv_alloc(rp);
	/* if fv_alloc fails, flowvalve is just disabled */
#endif
#endif /* ALTQ3_COMPAT */
	return rp;
}
321 
/*
 * Release a red state structure allocated by red_alloc().
 * The flow-valve state (if any) and the reference on the shared weight
 * table are released before the structure itself is freed.
 */
void
red_destroy(red_t *rp)
{
#ifdef ALTQ3_COMPAT
#ifdef ALTQ_FLOWVALVE
	if (rp->red_flowvalve != NULL)
		fv_destroy(rp->red_flowvalve);
#endif
#endif /* ALTQ3_COMPAT */
	wtab_destroy(rp->red_wtab);
	free(rp, M_DEVBUF);
}
334 
335 void
336 red_getstats(red_t *rp, struct redstats *sp)
337 {
338 	sp->q_avg		= rp->red_avg >> rp->red_wshift;
339 	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
340 	sp->drop_cnt		= rp->red_stats.drop_cnt;
341 	sp->drop_forced		= rp->red_stats.drop_forced;
342 	sp->drop_unforced	= rp->red_stats.drop_unforced;
343 	sp->marked_packets	= rp->red_stats.marked_packets;
344 }
345 
/*
 * Run the RED algorithm on an arriving packet and enqueue or drop it.
 *
 * Updates the EWMA of the queue length, then chooses among: no drop,
 * an unforced (early, probabilistic) drop/ECN-mark, and a forced drop
 * (average above th_max, or the queue at its hard limit).  A dropped
 * mbuf is freed here.
 *
 * Returns 0 if the packet was enqueued, -1 if it was dropped.
 */
int
red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
    struct altq_pktattr *pktattr)
{
	int avg, droptype;
	int n;
#ifdef ALTQ3_COMPAT
#ifdef ALTQ_FLOWVALVE
	struct fve *fve = NULL;

	/* a flow trapped by the flow-valve is dropped up front */
	if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
		if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
			m_freem(m);
			return -1;
		}
#endif
#endif /* ALTQ3_COMPAT */

	avg = rp->red_avg;

	/*
	 * if we were idle, we pretend that n packets arrived during
	 * the idle period.
	 */
	if (rp->red_idle) {
		struct timeval now;
		int t;

		rp->red_idle = 0;
		microtime(&now);
		t = (now.tv_sec - rp->red_last.tv_sec);
		if (t > 60) {
			/*
			 * being idle for more than 1 minute, set avg to zero.
			 * this prevents t from overflow.
			 */
			avg = 0;
		} else {
			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
			n = t / rp->red_pkttime - 1;

			/* the following line does (avg = (1 - Wq)^n * avg) */
			if (n > 0)
				avg = (avg >> FP_SHIFT) *
				    pow_w(rp->red_wtab, n);
		}
	}

	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
	rp->red_avg = avg;		/* save the new value */

	/*
	 * red_count keeps a tally of arriving traffic that has not
	 * been dropped.
	 */
	rp->red_count++;

	/* see if we drop early */
	droptype = DTYPE_NODROP;
	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
		if (avg >= rp->red_thmax_s) {
			/* avg >= th_max: forced drop */
			droptype = DTYPE_FORCED;
		} else if (rp->red_old == 0) {
			/* first exceeds th_min */
			rp->red_count = 1;
			rp->red_old = 1;
		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
				      rp->red_probd, rp->red_count)) {
			/* mark or drop by red */
			if ((rp->red_flags & REDF_ECN) &&
			    mark_ecn(m, pktattr, rp->red_flags)) {
				/* successfully marked.  do not drop. */
				rp->red_count = 0;
#ifdef RED_STATS
				rp->red_stats.marked_packets++;
#endif
			} else {
				/* unforced drop by red */
				droptype = DTYPE_EARLY;
			}
		}
	} else {
		/* avg < th_min */
		rp->red_old = 0;
	}

	/*
	 * if the queue length hits the hard limit, it's a forced drop.
	 */
	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
		droptype = DTYPE_FORCED;

#ifdef RED_RANDOM_DROP
	/* if successful or forced drop, enqueue this packet. */
	if (droptype != DTYPE_EARLY)
		_addq(q, m);
#else
	/* if successful, enqueue this packet. */
	if (droptype == DTYPE_NODROP)
		_addq(q, m);
#endif
	if (droptype != DTYPE_NODROP) {
		if (droptype == DTYPE_EARLY) {
			/* drop the incoming packet */
#ifdef RED_STATS
			rp->red_stats.drop_unforced++;
#endif
		} else {
			/* forced drop, select a victim packet in the queue. */
#ifdef RED_RANDOM_DROP
			m = _getq_random(q);
#endif
#ifdef RED_STATS
			rp->red_stats.drop_forced++;
#endif
		}
#ifdef RED_STATS
		PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
#endif
		rp->red_count = 0;
#ifdef ALTQ3_COMPAT
#ifdef ALTQ_FLOWVALVE
		if (rp->red_flowvalve != NULL)
			fv_dropbyred(rp->red_flowvalve, pktattr, fve);
#endif
#endif /* ALTQ3_COMPAT */
		m_freem(m);
		return -1;
	}
	/* successfully queued */
#ifdef RED_STATS
	PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
#endif
	return 0;
}
483 
484 /*
485  * early-drop probability is calculated as follows:
486  *   prob = p_max * (avg - th_min) / (th_max - th_min)
487  *   prob_a = prob / (2 - count*prob)
488  *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
489  * here prob_a increases as successive undrop count increases.
490  * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
491  * becomes 1 when (count >= (2 / prob))).
492  */
/*
 * Decide whether to drop (or mark) a packet before the queue is full.
 *
 * The effective drop probability grows with the scaled excess of the
 * average queue length over th_min and with the count of packets
 * accepted since the last drop:
 *   prob_a = (avg-th_min) /
 *	      (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
 * prob_a starts from prob/2 and reaches 1 when count >= 2/prob.
 *
 * fp_len:   (avg - th_min), descaled from the EWMA representation
 * fp_probd: precomputed probability denominator (red_probd)
 * count:    packets accepted since the last drop/mark
 *
 * Returns 1 to drop or mark, 0 to accept.
 */
int
drop_early(int fp_len, int fp_probd, int count)
{
	int denom;

	denom = fp_probd - count * fp_len;
	if (denom <= 0) {
		/* count pushed the probability past 1: always drop/mark */
		return 1;
	}

	/*
	 * drop with probability fp_len / denom using a uniform
	 * random draw.
	 */
	return (cprng_fast32() % denom) < fp_len ? 1 : 0;
}
516 
517 /*
518  * try to mark CE bit to the packet.
519  *    returns 1 if successfully marked, 0 otherwise.
520  */
521 int
522 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
523 {
524 	struct mbuf	*m0;
525 	struct m_tag	*t;
526 	struct altq_tag	*at;
527 	void		*hdr;
528 	int		 af;
529 
530 	t = m_tag_find(m, PACKET_TAG_ALTQ_QID);
531 	if (t != NULL) {
532 		at = (struct altq_tag *)(t + 1);
533 		if (at == NULL)
534 			return 0;
535 		af = at->af;
536 		hdr = at->hdr;
537 #ifdef ALTQ3_COMPAT
538 	} else if (pktattr != NULL) {
539 		af = pktattr->pattr_af;
540 		hdr = pktattr->pattr_hdr;
541 #endif /* ALTQ3_COMPAT */
542 	} else
543 		return 0;
544 
545 	if (af != AF_INET && af != AF_INET6)
546 		return 0;
547 
548 	/* verify that pattr_hdr is within the mbuf data */
549 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
550 		if (((char *)hdr >= m0->m_data) &&
551 		    ((char *)hdr < m0->m_data + m0->m_len))
552 			break;
553 	if (m0 == NULL) {
554 		/* ick, tag info is stale */
555 		return 0;
556 	}
557 
558 	switch (af) {
559 	case AF_INET:
560 		if (flags & REDF_ECN4) {
561 			struct ip *ip = hdr;
562 			u_int8_t otos;
563 			int sum;
564 
565 			if (ip->ip_v != 4)
566 				return 0;	/* version mismatch! */
567 
568 			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
569 				return 0;	/* not-ECT */
570 			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
571 				return 1;	/* already marked */
572 
573 			/*
574 			 * ecn-capable but not marked,
575 			 * mark CE and update checksum
576 			 */
577 			otos = ip->ip_tos;
578 			ip->ip_tos |= IPTOS_ECN_CE;
579 			/*
580 			 * update checksum (from RFC1624)
581 			 *	   HC' = ~(~HC + ~m + m')
582 			 */
583 			sum = ~ntohs(ip->ip_sum) & 0xffff;
584 			sum += (~otos & 0xffff) + ip->ip_tos;
585 			sum = (sum >> 16) + (sum & 0xffff);
586 			sum += (sum >> 16);  /* add carry */
587 			ip->ip_sum = htons(~sum & 0xffff);
588 			return 1;
589 		}
590 		break;
591 #ifdef INET6
592 	case AF_INET6:
593 		if (flags & REDF_ECN6) {
594 			struct ip6_hdr *ip6 = hdr;
595 			u_int32_t flowlabel;
596 
597 			flowlabel = ntohl(ip6->ip6_flow);
598 			if ((flowlabel >> 28) != 6)
599 				return 0;	/* version mismatch! */
600 			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
601 			    (IPTOS_ECN_NOTECT << 20))
602 				return 0;	/* not-ECT */
603 			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
604 			    (IPTOS_ECN_CE << 20))
605 				return 1;	/* already marked */
606 			/*
607 			 * ecn-capable but not marked,  mark CE
608 			 */
609 			flowlabel |= (IPTOS_ECN_CE << 20);
610 			ip6->ip6_flow = htonl(flowlabel);
611 			return 1;
612 		}
613 		break;
614 #endif  /* INET6 */
615 	}
616 
617 	/* not marked */
618 	return 0;
619 }
620 
621 struct mbuf *
622 red_getq(red_t *rp, class_queue_t *q)
623 {
624 	struct mbuf *m;
625 
626 	if ((m = _getq(q)) == NULL) {
627 		if (rp->red_idle == 0) {
628 			rp->red_idle = 1;
629 			microtime(&rp->red_last);
630 		}
631 		return NULL;
632 	}
633 
634 	rp->red_idle = 0;
635 	return m;
636 }
637 
638 /*
639  * helper routine to calibrate avg during idle.
640  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
641  * here Wq = 1/weight and the code assumes Wq is close to zero.
642  *
643  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
644  */
645 static struct wtab *wtab_list = NULL;	/* pointer to wtab list */
646 
/*
 * Find or create the shared weight table for the given weight.
 * Tables are reference-counted and chained on the global wtab_list.
 *
 * w_tab[i] holds ((weight-1)/weight)^(2^i) in fixed-point so that
 * pow_w() can evaluate (1-Wq)^n by binary decomposition of n.
 * w_param_max is set to the first power of two whose table entry
 * underflows to zero; pow_w() short-circuits for larger n.
 *
 * May sleep; the panic on a NULL return is belt-and-braces since a
 * M_WAITOK allocation should not fail.
 */
struct wtab *
wtab_alloc(int weight)
{
	struct wtab	*w;
	int		 i;

	/* reuse an existing table for this weight if there is one */
	for (w = wtab_list; w != NULL; w = w->w_next)
		if (w->w_weight == weight) {
			w->w_refcount++;
			return w;
		}

	w = malloc(sizeof(struct wtab), M_DEVBUF, M_WAITOK|M_ZERO);
	if (w == NULL)
		panic("wtab_alloc: malloc failed!");
	w->w_weight = weight;
	w->w_refcount = 1;
	w->w_next = wtab_list;
	wtab_list = w;

	/* initialize the weight table */
	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
	for (i = 1; i < 32; i++) {
		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
		if (w->w_tab[i] == 0 && w->w_param_max == 0)
			w->w_param_max = 1 << i;
	}

	return w;
}
677 
678 int
679 wtab_destroy(struct wtab *w)
680 {
681 	struct wtab	*prev;
682 
683 	if (--w->w_refcount > 0)
684 		return 0;
685 
686 	if (wtab_list == w)
687 		wtab_list = w->w_next;
688 	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
689 		if (prev->w_next == w) {
690 			prev->w_next = w->w_next;
691 			break;
692 		}
693 
694 	free(w, M_DEVBUF);
695 	return 0;
696 }
697 
698 int32_t
699 pow_w(struct wtab *w, int n)
700 {
701 	int	i, bit;
702 	int32_t	val;
703 
704 	if (n >= w->w_param_max)
705 		return 0;
706 
707 	val = 1 << FP_SHIFT;
708 	if (n <= 0)
709 		return val;
710 
711 	bit = 1;
712 	i = 0;
713 	while (n) {
714 		if (n & bit) {
715 			val = (val * w->w_tab[i]) >> FP_SHIFT;
716 			n &= ~bit;
717 		}
718 		i++;
719 		bit <<=  1;
720 	}
721 	return val;
722 }
723 
724 #ifdef ALTQ3_COMPAT
725 /*
726  * red device interface
727  */
728 altqdev_decl(red);
729 
730 int
731 redopen(dev_t dev, int flag, int fmt,
732     struct lwp *l)
733 {
734 	/* everything will be done when the queueing scheme is attached. */
735 	return 0;
736 }
737 
738 int
739 redclose(dev_t dev, int flag, int fmt,
740     struct lwp *l)
741 {
742 	red_queue_t *rqp;
743 	int err, error = 0;
744 
745 	while ((rqp = red_list) != NULL) {
746 		/* destroy all */
747 		err = red_detach(rqp);
748 		if (err != 0 && error == 0)
749 			error = err;
750 	}
751 
752 	return error;
753 }
754 
/*
 * ioctl handler for the red control device (ALTQ3 compatibility
 * interface).
 *
 * RED_GETSTATS is read-only and unprivileged; every other command
 * requires KAUTH_NETWORK_ALTQ / KAUTH_REQ_NETWORK_ALTQ_RED privilege.
 * Commands attach/detach red on an interface, enable/disable it,
 * fetch statistics, reconfigure parameters, or set global defaults.
 *
 * Returns 0 on success or an errno value.
 */
int
redioctl(dev_t dev, ioctlcmd_t cmd, void *addr, int flag,
    struct lwp *l)
{
	red_queue_t *rqp;
	struct red_interface *ifacep;
	struct ifnet *ifp;
	int	error = 0;

	/* check super-user privilege */
	switch (cmd) {
	case RED_GETSTATS:
		break;
	default:
		if ((error = kauth_authorize_network(l->l_cred,
		    KAUTH_NETWORK_ALTQ, KAUTH_REQ_NETWORK_ALTQ_RED, NULL,
		    NULL, NULL)) != 0)
			return error;
		break;
	}

	switch (cmd) {

	case RED_ENABLE:
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = altq_enable(rqp->rq_ifq);
		break;

	case RED_DISABLE:
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = altq_disable(rqp->rq_ifq);
		break;

	case RED_IF_ATTACH:
		ifp = ifunit(((struct red_interface *)addr)->red_ifname);
		if (ifp == NULL) {
			error = ENXIO;
			break;
		}

		/* allocate and initialize red_queue_t */
		rqp = malloc(sizeof(red_queue_t), M_DEVBUF, M_WAITOK|M_ZERO);
		if (rqp == NULL) {
			error = ENOMEM;
			break;
		}

		rqp->rq_q = malloc(sizeof(class_queue_t), M_DEVBUF,
		    M_WAITOK|M_ZERO);
		if (rqp->rq_q == NULL) {
			free(rqp, M_DEVBUF);
			error = ENOMEM;
			break;
		}

		/* all-zero arguments select the built-in defaults */
		rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
		if (rqp->rq_red == NULL) {
			free(rqp->rq_q, M_DEVBUF);
			free(rqp, M_DEVBUF);
			error = ENOMEM;
			break;
		}

		rqp->rq_ifq = &ifp->if_snd;
		qtail(rqp->rq_q) = NULL;
		qlen(rqp->rq_q) = 0;
		qlimit(rqp->rq_q) = RED_LIMIT;
		qtype(rqp->rq_q) = Q_RED;

		/*
		 * set RED to this ifnet structure.
		 */
		error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
				    red_enqueue, red_dequeue, red_request,
				    NULL, NULL);
		if (error) {
			red_destroy(rqp->rq_red);
			free(rqp->rq_q, M_DEVBUF);
			free(rqp, M_DEVBUF);
			break;
		}

		/* add this state to the red list */
		rqp->rq_next = red_list;
		red_list = rqp;
		break;

	case RED_IF_DETACH:
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = red_detach(rqp);
		break;

	case RED_GETSTATS:
		do {
			struct red_stats *q_stats;
			red_t *rp;

			q_stats = (struct red_stats *)addr;
			if ((rqp = altq_lookup(q_stats->iface.red_ifname,
					     ALTQT_RED)) == NULL) {
				error = EBADF;
				break;
			}

			q_stats->q_len 	   = qlen(rqp->rq_q);
			q_stats->q_limit   = qlimit(rqp->rq_q);

			rp = rqp->rq_red;
			q_stats->q_avg 	   = rp->red_avg >> rp->red_wshift;
			q_stats->xmit_cnt  = rp->red_stats.xmit_cnt;
			q_stats->drop_cnt  = rp->red_stats.drop_cnt;
			q_stats->drop_forced   = rp->red_stats.drop_forced;
			q_stats->drop_unforced = rp->red_stats.drop_unforced;
			q_stats->marked_packets = rp->red_stats.marked_packets;

			q_stats->weight		= rp->red_weight;
			q_stats->inv_pmax	= rp->red_inv_pmax;
			q_stats->th_min		= rp->red_thmin;
			q_stats->th_max		= rp->red_thmax;

#ifdef ALTQ_FLOWVALVE
			if (rp->red_flowvalve != NULL) {
				struct flowvalve *fv = rp->red_flowvalve;
				q_stats->fv_flows    = fv->fv_flows;
				q_stats->fv_pass     = fv->fv_stats.pass;
				q_stats->fv_predrop  = fv->fv_stats.predrop;
				q_stats->fv_alloc    = fv->fv_stats.alloc;
				q_stats->fv_escape   = fv->fv_stats.escape;
			} else {
#endif /* ALTQ_FLOWVALVE */
				q_stats->fv_flows    = 0;
				q_stats->fv_pass     = 0;
				q_stats->fv_predrop  = 0;
				q_stats->fv_alloc    = 0;
				q_stats->fv_escape   = 0;
#ifdef ALTQ_FLOWVALVE
			}
#endif /* ALTQ_FLOWVALVE */
		} while (/*CONSTCOND*/ 0);
		break;

	case RED_CONFIG:
		do {
			struct red_conf *fc;
			red_t *new;
			int s, limit;

			fc = (struct red_conf *)addr;
			if ((rqp = altq_lookup(fc->iface.red_ifname,
					       ALTQT_RED)) == NULL) {
				error = EBADF;
				break;
			}
			new = red_alloc(fc->red_weight,
					fc->red_inv_pmax,
					fc->red_thmin,
					fc->red_thmax,
					fc->red_flags,
					fc->red_pkttime);
			if (new == NULL) {
				error = ENOMEM;
				break;
			}

			/* swap in the new state with the queue drained */
			s = splnet();
			red_purgeq(rqp);
			limit = fc->red_limit;
			if (limit < fc->red_thmax)
				limit = fc->red_thmax;
			qlimit(rqp->rq_q) = limit;
			fc->red_limit = limit;	/* write back the new value */

			red_destroy(rqp->rq_red);
			rqp->rq_red = new;

			splx(s);

			/* write back new values */
			/*
			 * NOTE(review): red_limit was already written back
			 * above; this second store is redundant but harmless.
			 */
			fc->red_limit = limit;
			fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
			fc->red_thmin = rqp->rq_red->red_thmin;
			fc->red_thmax = rqp->rq_red->red_thmax;

		} while (/*CONSTCOND*/ 0);
		break;

	case RED_SETDEFAULTS:
		do {
			struct redparams *rp;

			rp = (struct redparams *)addr;

			/* affects red_alloc() calls made after this point */
			default_th_min = rp->th_min;
			default_th_max = rp->th_max;
			default_inv_pmax = rp->inv_pmax;
		} while (/*CONSTCOND*/ 0);
		break;

	default:
		error = EINVAL;
		break;
	}
	return error;
}
971 
/*
 * Detach a red queue from its interface, unlink it from red_list and
 * release all associated resources.
 *
 * Returns 0 on success, or the error from altq_detach() — in that
 * case the state is left intact and still on the list.
 */
static int
red_detach(red_queue_t *rqp)
{
	red_queue_t *tmp;
	int error = 0;

	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
		altq_disable(rqp->rq_ifq);

	if ((error = altq_detach(rqp->rq_ifq)))
		return error;

	/* unlink rqp from the global list */
	if (red_list == rqp)
		red_list = rqp->rq_next;
	else {
		for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
			if (tmp->rq_next == rqp) {
				tmp->rq_next = rqp->rq_next;
				break;
			}
		if (tmp == NULL)
			printf("red_detach: no state found in red_list!\n");
	}

	red_destroy(rqp->rq_red);
	free(rqp->rq_q, M_DEVBUF);
	free(rqp, M_DEVBUF);
	return error;
}
1001 
1002 /*
1003  * enqueue routine:
1004  *
1005  *	returns: 0 when successfully queued.
1006  *		 ENOBUFS when drop occurs.
1007  */
1008 static int
1009 red_enqueue(struct ifaltq *ifq, struct mbuf *m)
1010 {
1011 	struct altq_pktattr pktattr;
1012 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1013 
1014 	pktattr.pattr_class = m->m_pkthdr.pattr_class;
1015 	pktattr.pattr_af = m->m_pkthdr.pattr_af;
1016 	pktattr.pattr_hdr = m->m_pkthdr.pattr_hdr;
1017 
1018 	if (red_addq(rqp->rq_red, rqp->rq_q, m, &pktattr) < 0)
1019 		return ENOBUFS;
1020 	ifq->ifq_len++;
1021 	return 0;
1022 }
1023 
1024 /*
1025  * dequeue routine:
1026  *	must be called in splnet.
1027  *
1028  *	returns: mbuf dequeued.
1029  *		 NULL when no packet is available in the queue.
1030  */
1031 
1032 static struct mbuf *
1033 red_dequeue(struct ifaltq *ifq, int op)
1034 {
1035 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1036 	struct mbuf *m;
1037 
1038 	if (op == ALTDQ_POLL)
1039 		return qhead(rqp->rq_q);
1040 
1041 	/* op == ALTDQ_REMOVE */
1042 	m =  red_getq(rqp->rq_red, rqp->rq_q);
1043 	if (m != NULL)
1044 		ifq->ifq_len--;
1045 	return m;
1046 }
1047 
1048 static int
1049 red_request(struct ifaltq *ifq, int req, void *arg)
1050 {
1051 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1052 
1053 	switch (req) {
1054 	case ALTRQ_PURGE:
1055 		red_purgeq(rqp);
1056 		break;
1057 	}
1058 	return 0;
1059 }
1060 
/*
 * Discard every packet in the queue.  If the discipline is currently
 * enabled, the interface queue length counter is reset as well.
 */
static void
red_purgeq(red_queue_t *rqp)
{
	_flushq(rqp->rq_q);
	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
		rqp->rq_ifq->ifq_len = 0;
}
1068 
1069 #ifdef ALTQ_FLOWVALVE
1070 
1071 #define	FV_PSHIFT	7	/* weight of average drop rate -- 1/128 */
1072 #define	FV_PSCALE(x)	((x) << FV_PSHIFT)
1073 #define	FV_PUNSCALE(x)	((x) >> FV_PSHIFT)
1074 #define	FV_FSHIFT	5	/* weight of average fraction -- 1/32 */
1075 #define	FV_FSCALE(x)	((x) << FV_FSHIFT)
1076 #define	FV_FUNSCALE(x)	((x) >> FV_FSHIFT)
1077 
1078 #define	FV_TIMER	(3 * hz)	/* timer value for garbage collector */
1079 #define	FV_FLOWLISTSIZE		64	/* how many flows in flowlist */
1080 
1081 #define	FV_N			10	/* update fve_f every FV_N packets */
1082 
1083 #define	FV_BACKOFFTHRESH	1  /* backoff threshold interval in second */
1084 #define	FV_TTHRESH		3  /* time threshold to delete fve */
1085 #define	FV_ALPHA		5  /* extra packet count */
1086 
1087 #define	FV_STATS
1088 
1089 #define	FV_TIMESTAMP(tp)	getmicrotime(tp)
1090 
1091 /*
1092  * Brtt table: 127 entry table to convert drop rate (p) to
1093  * the corresponding bandwidth fraction (f)
1094  * the following equation is implemented to use scaled values,
1095  * fve_p and fve_f, in the fixed point format.
1096  *
1097  *   Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
1098  *   f = Brtt(p) / (max_th + alpha)
1099  */
1100 #define	BRTT_SIZE	128
1101 #define	BRTT_SHIFT	12
1102 #define	BRTT_MASK	0x0007f000
1103 #define	BRTT_PMAX	(1 << (FV_PSHIFT + FP_SHIFT))
1104 
/*
 * Precomputed Brtt(p) values (see the equation above), indexed by the
 * BRTT_MASK bits of the scaled drop rate p.  Entry 0 is a placeholder
 * for p == 0; fv_p2f() never selects it (it falls back to entry 1).
 */
const int brtt_tab[BRTT_SIZE] = {
	0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
	392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
	225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
	145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
	98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
	67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
	47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
	33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
	24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
	18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
	14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
	10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
	8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
	6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
	5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
	4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
};
1123 
/*
 * Look up the flow entry matching the arriving packet's address pair.
 * While walking the LRU-ordered list, an entry whose last drop is older
 * than FV_TTHRESH is invalidated (lazy garbage collection) and the scan
 * stops there, since every entry further down the list is older still.
 * Returns the matching entry, or NULL when none exists (including the
 * non-IP case); on a full scan the count of active flows is recorded
 * in fv->fv_flows.
 */
static inline struct fve *
flowlist_lookup(struct flowvalve *fv, struct altq_pktattr *pktattr,
    struct timeval *now)
{
	struct fve *fve;
	int flows;
	struct ip *ip;
#ifdef INET6
	struct ip6_hdr *ip6;
#endif
	struct timeval tthresh;

	if (pktattr == NULL)
		return NULL;

	/* entries with no drop since this time are considered expired */
	tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
	flows = 0;
	/*
	 * search the flow list
	 */
	switch (pktattr->pattr_af) {
	case AF_INET:
		ip = (struct ip *)pktattr->pattr_hdr;
		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
			if (fve->fve_lastdrop.tv_sec == 0)
				/* unused entry; so is everything after it */
				break;
			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
				/* expired; invalidate and stop the scan */
				fve->fve_lastdrop.tv_sec = 0;
				break;
			}
			if (fve->fve_flow.flow_af == AF_INET &&
			    fve->fve_flow.flow_ip.ip_src.s_addr ==
			    ip->ip_src.s_addr &&
			    fve->fve_flow.flow_ip.ip_dst.s_addr ==
			    ip->ip_dst.s_addr)
				return fve;
			flows++;
		}
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
			if (fve->fve_lastdrop.tv_sec == 0)
				/* unused entry; so is everything after it */
				break;
			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
				/* expired; invalidate and stop the scan */
				fve->fve_lastdrop.tv_sec = 0;
				break;
			}
			if (fve->fve_flow.flow_af == AF_INET6 &&
			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
					       &ip6->ip6_src) &&
			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
					       &ip6->ip6_dst))
				return fve;
			flows++;
		}
		break;
#endif /* INET6 */

	default:
		/* unknown protocol.  no drop. */
		return NULL;
	}
	fv->fv_flows = flows;	/* save the number of active fve's */
	return NULL;
}
1191 
1192 static inline struct fve *
1193 flowlist_reclaim(struct flowvalve *fv, struct altq_pktattr *pktattr)
1194 {
1195 	struct fve *fve;
1196 	struct ip *ip;
1197 #ifdef INET6
1198 	struct ip6_hdr *ip6;
1199 #endif
1200 
1201 	/*
1202 	 * get an entry from the tail of the LRU list.
1203 	 */
1204 	fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
1205 
1206 	switch (pktattr->pattr_af) {
1207 	case AF_INET:
1208 		ip = (struct ip *)pktattr->pattr_hdr;
1209 		fve->fve_flow.flow_af = AF_INET;
1210 		fve->fve_flow.flow_ip.ip_src = ip->ip_src;
1211 		fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
1212 		break;
1213 #ifdef INET6
1214 	case AF_INET6:
1215 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
1216 		fve->fve_flow.flow_af = AF_INET6;
1217 		fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
1218 		fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
1219 		break;
1220 #endif
1221 	}
1222 
1223 	fve->fve_state = Green;
1224 	fve->fve_p = 0.0;
1225 	fve->fve_f = 0.0;
1226 	fve->fve_ifseq = fv->fv_ifseq - 1;
1227 	fve->fve_count = 0;
1228 
1229 	fv->fv_flows++;
1230 #ifdef FV_STATS
1231 	fv->fv_stats.alloc++;
1232 #endif
1233 	return fve;
1234 }
1235 
1236 static inline void
1237 flowlist_move_to_head(struct flowvalve *fv, struct fve *fve)
1238 {
1239 	if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
1240 		TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
1241 		TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
1242 	}
1243 }
1244 
1245 /*
1246  * allocate flowvalve structure
1247  */
1248 static struct flowvalve *
1249 fv_alloc(struct red *rp)
1250 {
1251 	struct flowvalve *fv;
1252 	struct fve *fve;
1253 	int i, num;
1254 
1255 	num = FV_FLOWLISTSIZE;
1256 	fv = malloc(sizeof(struct flowvalve), M_DEVBUF, M_WAITOK|M_ZERO);
1257 	if (fv == NULL)
1258 		return NULL;
1259 
1260 	fv->fv_fves = malloc(sizeof(struct fve) * num, M_DEVBUF,
1261 	    M_WAITOK|M_ZERO);
1262 	if (fv->fv_fves == NULL) {
1263 		free(fv, M_DEVBUF);
1264 		return NULL;
1265 	}
1266 
1267 	fv->fv_flows = 0;
1268 	TAILQ_INIT(&fv->fv_flowlist);
1269 	for (i = 0; i < num; i++) {
1270 		fve = &fv->fv_fves[i];
1271 		fve->fve_lastdrop.tv_sec = 0;
1272 		TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
1273 	}
1274 
1275 	/* initialize drop rate threshold in scaled fixed-point */
1276 	fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
1277 
1278 	/* initialize drop rate to fraction table */
1279 	fv->fv_p2ftab = malloc(sizeof(int) * BRTT_SIZE, M_DEVBUF, M_WAITOK);
1280 	if (fv->fv_p2ftab == NULL) {
1281 		free(fv->fv_fves, M_DEVBUF);
1282 		free(fv, M_DEVBUF);
1283 		return NULL;
1284 	}
1285 	/*
1286 	 * create the p2f table.
1287 	 * (shift is used to keep the precision)
1288 	 */
1289 	for (i = 1; i < BRTT_SIZE; i++) {
1290 		int f;
1291 
1292 		f = brtt_tab[i] << 8;
1293 		fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
1294 	}
1295 
1296 	return fv;
1297 }
1298 
1299 static void
1300 fv_destroy(struct flowvalve *fv)
1301 {
1302 	free(fv->fv_p2ftab, M_DEVBUF);
1303 	free(fv->fv_fves, M_DEVBUF);
1304 	free(fv, M_DEVBUF);
1305 }
1306 
1307 static inline int
1308 fv_p2f(struct flowvalve *fv, int p)
1309 {
1310 	int val, f;
1311 
1312 	if (p >= BRTT_PMAX)
1313 		f = fv->fv_p2ftab[BRTT_SIZE-1];
1314 	else if ((val = (p & BRTT_MASK)))
1315 		f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
1316 	else
1317 		f = fv->fv_p2ftab[1];
1318 	return f;
1319 }
1320 
1321 /*
1322  * check if an arriving packet should be pre-dropped.
1323  * called from red_addq() when a packet arrives.
1324  * returns 1 when the packet should be pre-dropped.
1325  * should be called in splnet.
1326  */
1327 static int
1328 fv_checkflow(struct flowvalve *fv, struct altq_pktattr *pktattr,
1329     struct fve **fcache)
1330 {
1331 	struct fve *fve;
1332 	struct timeval now;
1333 
1334 	fv->fv_ifseq++;
1335 	FV_TIMESTAMP(&now);
1336 
1337 	if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
1338 		/* no matching entry in the flowlist */
1339 		return 0;
1340 
1341 	*fcache = fve;
1342 
1343 	/* update fraction f for every FV_N packets */
1344 	if (++fve->fve_count == FV_N) {
1345 		/*
1346 		 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
1347 		 */
1348 		fve->fve_f =
1349 			(FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
1350 			+ fve->fve_f - FV_FUNSCALE(fve->fve_f);
1351 		fve->fve_ifseq = fv->fv_ifseq;
1352 		fve->fve_count = 0;
1353 	}
1354 
1355 	/*
1356 	 * overpumping test
1357 	 */
1358 	if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
1359 		int fthresh;
1360 
1361 		/* calculate a threshold */
1362 		fthresh = fv_p2f(fv, fve->fve_p);
1363 		if (fve->fve_f > fthresh)
1364 			fve->fve_state = Red;
1365 	}
1366 
1367 	if (fve->fve_state == Red) {
1368 		/*
1369 		 * backoff test
1370 		 */
1371 		if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
1372 			/* no drop for at least FV_BACKOFFTHRESH sec */
1373 			fve->fve_p = 0;
1374 			fve->fve_state = Green;
1375 #ifdef FV_STATS
1376 			fv->fv_stats.escape++;
1377 #endif
1378 		} else {
1379 			/* block this flow */
1380 			flowlist_move_to_head(fv, fve);
1381 			fve->fve_lastdrop = now;
1382 #ifdef FV_STATS
1383 			fv->fv_stats.predrop++;
1384 #endif
1385 			return 1;
1386 		}
1387 	}
1388 
1389 	/*
1390 	 * p = (1 - Wp) * p
1391 	 */
1392 	fve->fve_p -= FV_PUNSCALE(fve->fve_p);
1393 	if (fve->fve_p < 0)
1394 		fve->fve_p = 0;
1395 #ifdef FV_STATS
1396 	fv->fv_stats.pass++;
1397 #endif
1398 	return 0;
1399 }
1400 
1401 /*
1402  * called from red_addq when a packet is dropped by red.
1403  * should be called in splnet.
1404  */
1405 static void
1406 fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *pktattr,
1407     struct fve *fcache)
1408 {
1409 	struct fve *fve;
1410 	struct timeval now;
1411 
1412 	if (pktattr == NULL)
1413 		return;
1414 	FV_TIMESTAMP(&now);
1415 
1416 	if (fcache != NULL)
1417 		/* the fve of this packet is already cached */
1418 		fve = fcache;
1419 	else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
1420 		fve = flowlist_reclaim(fv, pktattr);
1421 
1422 	flowlist_move_to_head(fv, fve);
1423 
1424 	/*
1425 	 * update p:  the following line cancels the update
1426 	 *	      in fv_checkflow() and calculate
1427 	 *	p = Wp + (1 - Wp) * p
1428 	 */
1429 	fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
1430 
1431 	fve->fve_lastdrop = now;
1432 }
1433 
1434 #endif /* ALTQ_FLOWVALVE */
1435 
1436 #ifdef KLD_MODULE
1437 
/* glue for loading the RED discipline as a kernel module (ALTQ3 compat) */
static struct altqsw red_sw =
	{"red", redopen, redclose, redioctl};

ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
MODULE_VERSION(altq_red, 1);
1443 
1444 #endif /* KLD_MODULE */
1445 #endif /* ALTQ3_COMPAT */
1446 
1447 #endif /* ALTQ_RED */
1448