/*	$NetBSD: altq_rmclass.c,v 1.22 2011/11/19 22:51:18 tls Exp $	*/
/*	$KAME: altq_rmclass.c,v 1.19 2005/04/13 03:44:25 suz Exp $	*/

/*
 * Copyright (c) 1991-1997 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the Network Research
 *	Group at Lawrence Berkeley Laboratory.
 * 4. Neither the name of the University nor of the Laboratory may be used
 *    to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * LBL code modified by speer@eng.sun.com, May 1997.
 * For questions and/or comments, please send mail to cbq@ee.lbl.gov
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: altq_rmclass.c,v 1.22 2011/11/19 22:51:18 tls Exp $");

/* #ident "@(#)rm_class.c  1.48     97/12/05 SMI" */

#ifdef _KERNEL_OPT
#include "opt_altq.h"
#include "opt_inet.h"
#endif

#ifdef ALTQ_CBQ	/* cbq is enabled by ALTQ_CBQ option in opt_altq.h */

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/time.h>
#ifdef ALTQ3_COMPAT
#include <sys/kernel.h>
#endif
#include <sys/cprng.h>

#include <net/if.h>
#ifdef ALTQ3_COMPAT
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#endif

#include <altq/altq.h>
#include <altq/altq_rmclass.h>
#include <altq/altq_rmclass_debug.h>
#include <altq/altq_red.h>
#include <altq/altq_rio.h>

/*
 * Local Macros
 */

#define	reset_cutoff(ifd)	{ ifd->cutoff_ = RM_MAXDEPTH; }

/*
 * Local routines.
 */
static int	rmc_satisfied(struct rm_class *, struct timeval *);
static void	rmc_wrr_set_weights(struct rm_ifdat *);
static void	rmc_depth_compute(struct rm_class *);
static void	rmc_depth_recompute(rm_class_t *);

static mbuf_t	*_rmc_wrr_dequeue_next(struct rm_ifdat *, int);
static mbuf_t	*_rmc_prr_dequeue_next(struct rm_ifdat *, int);

static int	_rmc_addq(rm_class_t *, mbuf_t *);
static void	_rmc_dropq(rm_class_t *);
static mbuf_t	*_rmc_getq(rm_class_t *);
static mbuf_t	*_rmc_pollq(rm_class_t *);

static int	rmc_under_limit(struct rm_class *, struct timeval *);
static void	rmc_tl_satisfied(struct rm_ifdat *, struct timeval *);
static void	rmc_drop_action(struct rm_class *);
static void	rmc_restart(struct rm_class *);
static void	rmc_root_overlimit(struct rm_class *, struct rm_class *);

#define	BORROW_OFFTIME
/*
 * BORROW_OFFTIME (experimental):
 * borrow the offtime of the class being borrowed from.
 * the reason is that when its own offtime is set, the class is unable
 * to borrow much, especially when cutoff is taking effect.
 * but when the borrowed class is overloaded (avgidle is close to minidle),
 * use the borrowing class's offtime to avoid overload.
 */
#define	ADJUST_CUTOFF
/*
 * ADJUST_CUTOFF (experimental):
 * if no underlimit class is found due to cutoff, increase cutoff and
 * retry the scheduling loop.
 * also, don't invoke delay_actions while cutoff is taking effect,
 * since a sleeping class won't have a chance to be scheduled in the
 * next loop.
 *
 * the heuristics for setting the top-level variable (cutoff_) now
 * become:
 *	1. if a packet arrives for a not-overlimit class, set cutoff
 *	   to the depth of the class.
 *	2. if cutoff is i, and a packet arrives for an overlimit class
 *	   with an underlimit ancestor at a lower level than i (say j),
 *	   then set cutoff to j.
 *	3. at scheduling a packet, if there is no underlimit class
 *	   due to the current cutoff level, increase cutoff by 1 and
 *	   then try to schedule again.
 */

/*
 * rm_class_t *
 * rmc_newclass(...) - Create a new resource management class at priority
 * 'pri' on the interface given by 'ifd'.
 *
 * nsecPerByte  is the data rate of the interface in nanoseconds/byte.
 *              E.g., 800 for a 10Mb/s ethernet.  If the class gets less
 *              than 100% of the bandwidth, this number should be the
 *              'effective' rate for the class.  Let f be the
 *              bandwidth fraction allocated to this class, and let
 *              nsPerByte be the data rate of the output link in
 *              nanoseconds/byte.  Then nsecPerByte is set to
 *              nsPerByte / f.  E.g., 1600 (= 800 / .5)
 *              for a class that gets 50% of an ethernet's bandwidth.
 *
 * action       the routine to call when the class is over limit.
 *
 * maxq         max allowable queue size for class (in packets).
 *
 * parent       parent class pointer.
 *
 * borrow       class to borrow from (should be either 'parent' or null).
 *
 * maxidle      max value allowed for class 'idle' time estimate (this
 *              parameter determines how large an initial burst of packets
 *              can be before overlimit action is invoked).
 *
 * offtime      how long 'delay' action will delay when class goes over
 *              limit (this parameter determines the steady-state burst
 *              size when a class is running over its limit).
 *
 * Maxidle and offtime have to be computed from the following:  If the
 * average packet size is s, the bandwidth fraction allocated to this
 * class is f, we want to allow b packet bursts, and the gain of the
 * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
 *
 *	ptime = s * nsPerByte * (1 - f) / f
 *	maxidle = ptime * (1 - g^b) / g^b
 *	minidle = -ptime * (1 / (f - 1))
 *	offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1))
 *
 * Operationally, it's convenient to specify maxidle & offtime in units
 * independent of the link bandwidth so the maxidle & offtime passed to
 * this routine are the above values multiplied by 8*f/(1000*nsPerByte).
 * (The constant factor is a scale factor needed to make the parameters
 * integers.  This scaling also means that the 'unscaled' values of
 * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
 * not nanoseconds.)  Also note that the 'idle' filter computation keeps
 * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
 * maxidle also must be scaled upward by this value.  Thus, the passed
 * values for maxidle and offtime can be computed as follows:
 *
 * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
 * offtime = offtime * 8 / (1000 * nsecPerByte)
 *
 * When USE_HRTIME is employed, then maxidle and offtime become:
 *	maxidle = maxidle * (8.0 / nsecPerByte);
 *	offtime = offtime * (8.0 / nsecPerByte);
 */
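/*
 * A worked example of the above scaling (illustrative only; the
 * concrete numbers below are assumptions, not values from this file):
 * for a class given f = 50% of a 10Mb/s ethernet (nsPerByte = 800, so
 * nsecPerByte = 800 / 0.5 = 1600), with average packet size s = 1000
 * bytes, burst allowance b = 2 packets, and RM_FILTER_GAIN = 5
 * (g = 31/32 = 0.96875):
 *
 *	ptime   = 1000 * 800 * (1 - 0.5) / 0.5   = 800000 ns
 *	maxidle = 800000 * (1 - g^2) / g^2      ~=  52444 ns
 *	offtime = 800000 * (1 + 32 * (1 - g)/g) ~= 1625806 ns
 *
 * and the scaled values actually passed to rmc_newclass() would be
 * roughly:
 *
 *	maxidle = 52444 * 2^5 * 8 / (1000 * 1600) ~= 8
 *	offtime = 1625806 * 8 / (1000 * 1600)     ~= 8
 */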
struct rm_class *
rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte,
    void (*action)(rm_class_t *, rm_class_t *), int maxq,
    struct rm_class *parent, struct rm_class *borrow, u_int maxidle,
    int minidle, u_int offtime, int pktsize, int flags)
{
	struct rm_class *cl;
	struct rm_class *peer;
	int s;

	if (pri >= RM_MAXPRIO)
		return (NULL);
#ifndef ALTQ_RED
	if (flags & RMCF_RED) {
#ifdef ALTQ_DEBUG
		printf("rmc_newclass: RED not configured for CBQ!\n");
#endif
		return (NULL);
	}
#endif
#ifndef ALTQ_RIO
	if (flags & RMCF_RIO) {
#ifdef ALTQ_DEBUG
		printf("rmc_newclass: RIO not configured for CBQ!\n");
#endif
		return (NULL);
	}
#endif

	cl = malloc(sizeof(struct rm_class), M_DEVBUF, M_WAITOK|M_ZERO);
	if (cl == NULL)
		return (NULL);
	CALLOUT_INIT(&cl->callout_);

	cl->q_ = malloc(sizeof(class_queue_t), M_DEVBUF, M_WAITOK|M_ZERO);
	if (cl->q_ == NULL) {
		free(cl, M_DEVBUF);
		return (NULL);
	}

	/*
	 * Class initialization.
	 */
	cl->children_ = NULL;
	cl->parent_ = parent;
	cl->borrow_ = borrow;
	cl->leaf_ = 1;
	cl->ifdat_ = ifd;
	cl->pri_ = pri;
	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
	cl->depth_ = 0;
	cl->qthresh_ = 0;
	cl->ns_per_byte_ = nsecPerByte;

	qlimit(cl->q_) = maxq;
	qtype(cl->q_) = Q_DROPHEAD;
	qlen(cl->q_) = 0;
	cl->flags_ = flags;

#if 1 /* minidle is also scaled in ALTQ */
	cl->minidle_ = (minidle * (int)nsecPerByte) / 8;
	if (cl->minidle_ > 0)
		cl->minidle_ = 0;
#else
	cl->minidle_ = minidle;
#endif
	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
	if (cl->maxidle_ == 0)
		cl->maxidle_ = 1;
#if 1 /* offtime is also scaled in ALTQ */
	cl->avgidle_ = cl->maxidle_;
	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
	if (cl->offtime_ == 0)
		cl->offtime_ = 1;
#else
	cl->avgidle_ = 0;
	cl->offtime_ = (offtime * nsecPerByte) / 8;
#endif
	cl->overlimit = action;

#ifdef ALTQ_RED
	if (flags & (RMCF_RED|RMCF_RIO)) {
		int red_flags, red_pkttime;

		red_flags = 0;
		if (flags & RMCF_ECN)
			red_flags |= REDF_ECN;
		if (flags & RMCF_FLOWVALVE)
			red_flags |= REDF_FLOWVALVE;
#ifdef ALTQ_RIO
		if (flags & RMCF_CLEARDSCP)
			red_flags |= RIOF_CLEARDSCP;
#endif
		red_pkttime = nsecPerByte * pktsize / 1000;

		if (flags & RMCF_RED) {
			cl->red_ = red_alloc(0, 0,
			    qlimit(cl->q_) * 10/100,
			    qlimit(cl->q_) * 30/100,
			    red_flags, red_pkttime);
			if (cl->red_ != NULL)
				qtype(cl->q_) = Q_RED;
		}
#ifdef ALTQ_RIO
		else {
			cl->red_ = (red_t *)rio_alloc(0, NULL,
			    red_flags, red_pkttime);
			if (cl->red_ != NULL)
				qtype(cl->q_) = Q_RIO;
		}
#endif
	}
#endif /* ALTQ_RED */

	/*
	 * put the class into the class tree
	 */
	s = splnet();
	if ((peer = ifd->active_[pri]) != NULL) {
		/* find the last class at this pri */
		cl->peer_ = peer;
		while (peer->peer_ != ifd->active_[pri])
			peer = peer->peer_;
		peer->peer_ = cl;
	} else {
		ifd->active_[pri] = cl;
		cl->peer_ = cl;
	}

	if (cl->parent_) {
		cl->next_ = parent->children_;
		parent->children_ = cl;
		parent->leaf_ = 0;
	}

	/*
	 * Compute the depth of this class and its ancestors in the class
	 * hierarchy.
	 */
	rmc_depth_compute(cl);

	/*
	 * If CBQ's WRR is enabled, then initialize the class WRR state.
	 */
	if (ifd->wrr_) {
		ifd->num_[pri]++;
		ifd->alloc_[pri] += cl->allotment_;
		rmc_wrr_set_weights(ifd);
	}
	splx(s);
	return (cl);
}
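/*
 * A usage sketch (hypothetical parameters, matching the worked example
 * above; 'parent' is assumed to be an existing interior class):
 *
 *	cl = rmc_newclass(3, ifd, 1600, rmc_delay_action, 64,
 *	    parent, parent, 8, -1, 8, 1000, 0);
 *
 * This creates a leaf class at priority 3 that is allotted 50% of a
 * 10Mb/s link, may borrow from 'parent', holds at most 64 packets,
 * and suspends itself (rather than dropping) when it runs overlimit.
 */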
int
rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle,
    int minidle, u_int offtime, int pktsize)
{
	struct rm_ifdat *ifd;
	u_int old_allotment;
	int s;

	ifd = cl->ifdat_;
	old_allotment = cl->allotment_;

	s = splnet();
	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
	cl->qthresh_ = 0;
	cl->ns_per_byte_ = nsecPerByte;

	qlimit(cl->q_) = maxq;

#if 1 /* minidle is also scaled in ALTQ */
	cl->minidle_ = (minidle * nsecPerByte) / 8;
	if (cl->minidle_ > 0)
		cl->minidle_ = 0;
#else
	cl->minidle_ = minidle;
#endif
	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
	if (cl->maxidle_ == 0)
		cl->maxidle_ = 1;
#if 1 /* offtime is also scaled in ALTQ */
	cl->avgidle_ = cl->maxidle_;
	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
	if (cl->offtime_ == 0)
		cl->offtime_ = 1;
#else
	cl->avgidle_ = 0;
	cl->offtime_ = (offtime * nsecPerByte) / 8;
#endif

	/*
	 * If CBQ's WRR is enabled, then update the class WRR state.
	 */
	if (ifd->wrr_) {
		ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment;
		rmc_wrr_set_weights(ifd);
	}
	splx(s);
	return (0);
}

/*
 * static void
 * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes
 *	the appropriate round robin weights for the CBQ weighted round
 *	robin algorithm.
 *
 *	Returns: NONE
 */

static void
rmc_wrr_set_weights(struct rm_ifdat *ifd)
{
	int i;
	struct rm_class *cl, *clh;

	for (i = 0; i < RM_MAXPRIO; i++) {
		/*
		 * This is inverted from that of the simulator to
		 * maintain precision.
		 */
		if (ifd->num_[i] == 0)
			ifd->M_[i] = 0;
		else
			ifd->M_[i] = ifd->alloc_[i] /
			    (ifd->num_[i] * ifd->maxpkt_);
		/*
		 * Compute the weighted allotment for each class.
		 * This takes the expensive div instruction out
		 * of the main loop for the wrr scheduling path.
		 * These only get recomputed when a class comes or
		 * goes.
		 */
		if (ifd->active_[i] != NULL) {
			clh = cl = ifd->active_[i];
			do {
				/* safe-guard for slow link or alloc_ == 0 */
				if (ifd->M_[i] == 0)
					cl->w_allotment_ = 0;
				else
					cl->w_allotment_ = cl->allotment_ /
					    ifd->M_[i];
				cl = cl->peer_;
			} while ((cl != NULL) && (cl != clh));
		}
	}
}

int
rmc_get_weight(struct rm_ifdat *ifd, int pri)
{
	if ((pri >= 0) && (pri < RM_MAXPRIO))
		return (ifd->M_[pri]);
	else
		return (0);
}
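/*
 * A worked example of the weight computation (assumed numbers, not
 * taken from the code): with maxpkt_ = 1500 bytes and two classes at
 * one priority whose allotments are 937500 and 312500 bytes/sec (a
 * 7.5Mb/s and a 2.5Mb/s class), alloc_[i] = 1250000 and
 *
 *	M_[i] = 1250000 / (2 * 1500) = 416
 *	w_allotment_ = 937500 / 416 = 2253 bytes
 *	w_allotment_ = 312500 / 416 =  751 bytes
 *
 * so in each round the two classes may send roughly 2253 and 751
 * bytes respectively, preserving the 3:1 allotment ratio.
 */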
/*
 * static void
 * rmc_depth_compute(struct rm_class *cl) - This function computes the
 *	appropriate depth of class 'cl' and its ancestors.
 *
 *	Returns:	NONE
 */

static void
rmc_depth_compute(struct rm_class *cl)
{
	rm_class_t *t = cl, *p;

	/*
	 * Recompute the depth for the branch of the tree.
	 */
	while (t != NULL) {
		p = t->parent_;
		if (p && (t->depth_ >= p->depth_)) {
			p->depth_ = t->depth_ + 1;
			t = p;
		} else
			t = NULL;
	}
}

/*
 * static void
 * rmc_depth_recompute(struct rm_class *cl) - This function re-computes
 *	the depth of the tree after a class has been deleted.
 *
 *	Returns:	NONE
 */

static void
rmc_depth_recompute(rm_class_t *cl)
{
#if 1 /* ALTQ */
	rm_class_t *p, *t;

	p = cl;
	while (p != NULL) {
		if ((t = p->children_) == NULL) {
			p->depth_ = 0;
		} else {
			int cdepth = 0;

			while (t != NULL) {
				if (t->depth_ > cdepth)
					cdepth = t->depth_;
				t = t->next_;
			}

			if (p->depth_ == cdepth + 1)
				/* no change to this parent */
				return;

			p->depth_ = cdepth + 1;
		}

		p = p->parent_;
	}
#else
	rm_class_t *t;

	if (cl->depth_ >= 1) {
		if (cl->children_ == NULL) {
			cl->depth_ = 0;
		} else if ((t = cl->children_) != NULL) {
			while (t != NULL) {
				if (t->children_ != NULL)
					rmc_depth_recompute(t);
				t = t->next_;
			}
		} else
			rmc_depth_compute(cl);
	}
#endif
}
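/*
 * An illustrative hierarchy for the depth computation (assumed, not
 * from this file): a root, one interior "agency" class, and two leaves:
 *
 *	root (depth 2) -> agency (depth 1) -> { leaf A, leaf B } (depth 0)
 *
 * Leaves have depth 0 and each parent sits one level above its deepest
 * child.  The cutoff_ level is compared against these depths: with
 * cutoff_ == 1, a leaf may borrow from 'agency' but not from 'root'.
 */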
/*
 * void
 * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This
 *	function deletes a class from the link-sharing structure and frees
 *	all resources associated with the class.
 *
 *	Returns: NONE
 */

void
rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl)
{
	struct rm_class *p, *head, *previous;
	int s;

	ASSERT(cl->children_ == NULL);

	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);

	s = splnet();
	/*
	 * Free packets in the packet queue.
	 * XXX - this may not be a desired behavior.  Packets should be
	 *	 re-queued.
	 */
	rmc_dropall(cl);

	/*
	 * If the class has a parent, then remove the class from the
	 * parent's children chain.
	 */
	if (cl->parent_ != NULL) {
		head = cl->parent_->children_;
		p = previous = head;
		if (head->next_ == NULL) {
			ASSERT(head == cl);
			cl->parent_->children_ = NULL;
			cl->parent_->leaf_ = 1;
		} else while (p != NULL) {
			if (p == cl) {
				if (cl == head)
					cl->parent_->children_ = cl->next_;
				else
					previous->next_ = cl->next_;
				cl->next_ = NULL;
				p = NULL;
			} else {
				previous = p;
				p = p->next_;
			}
		}
	}

	/*
	 * Delete class from class priority peer list.
	 */
	if ((p = ifd->active_[cl->pri_]) != NULL) {
		/*
		 * If there is more than one member of this priority
		 * level, then look for class(cl) in the priority level.
		 */
		if (p != p->peer_) {
			while (p->peer_ != cl)
				p = p->peer_;
			p->peer_ = cl->peer_;

			if (ifd->active_[cl->pri_] == cl)
				ifd->active_[cl->pri_] = cl->peer_;
		} else {
			ASSERT(p == cl);
			ifd->active_[cl->pri_] = NULL;
		}
	}

	/*
	 * Recompute the WRR weights.
	 */
	if (ifd->wrr_) {
		ifd->alloc_[cl->pri_] -= cl->allotment_;
		ifd->num_[cl->pri_]--;
		rmc_wrr_set_weights(ifd);
	}

	/*
	 * Re-compute the depth of the tree.
	 */
#if 1 /* ALTQ */
	rmc_depth_recompute(cl->parent_);
#else
	rmc_depth_recompute(ifd->root_);
#endif

	splx(s);

	/*
	 * Free the class structure.
	 */
	if (cl->red_ != NULL) {
#ifdef ALTQ_RIO
		if (q_is_rio(cl->q_))
			rio_destroy((rio_t *)cl->red_);
#endif
#ifdef ALTQ_RED
		if (q_is_red(cl->q_))
			red_destroy(cl->red_);
#endif
	}
	free(cl->q_, M_DEVBUF);
	free(cl, M_DEVBUF);
}
/*
 * int
 * rmc_init(...) - Initialize the resource management data structures
 *	associated with the output portion of interface 'ifp'.  'ifd' is
 *	where the structures will be built (for backwards compatibility,
 *	the structures aren't kept in the ifnet struct).  'nsecPerByte'
 *	gives the link speed (inverse of bandwidth) in nanoseconds/byte.
 *	'restart' is the driver-specific routine that the generic 'delay
 *	until under limit' action will call to restart output.  `maxq'
 *	is the queue size of the 'link' & 'default' classes.  'maxqueued'
 *	is the maximum number of packets that the resource management
 *	code will allow to be queued 'downstream' (this is typically 1).
 *
 *	Returns:	0 on success
 */

int
rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte,
    void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle,
    int minidle, u_int offtime, int flags)
{
	int i, mtu;

	/*
	 * Initialize the CBQ tracing/debug facility.
	 */
	CBQTRACEINIT();

	mtu = ifq->altq_ifp->if_mtu;
	if (mtu < 1) {
		printf("altq: %s: invalid MTU (interface not initialized?)\n",
		    ifq->altq_ifp->if_xname);
		return (EINVAL);
	}

	(void)memset((char *)ifd, 0, sizeof (*ifd));
	ifd->ifq_ = ifq;
	ifd->restart = restart;
	ifd->maxqueued_ = maxqueued;
	ifd->ns_per_byte_ = nsecPerByte;
	ifd->maxpkt_ = mtu;
	ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0;
	ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0;
#if 1
	ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16;
	if (mtu * nsecPerByte > 10 * 1000000)
		ifd->maxiftime_ /= 4;
#endif

	reset_cutoff(ifd);
	CBQTRACE(rmc_init, 'INIT', ifd->cutoff_);

	/*
	 * Initialize the CBQ's WRR state.
	 */
	for (i = 0; i < RM_MAXPRIO; i++) {
		ifd->alloc_[i] = 0;
		ifd->M_[i] = 0;
		ifd->num_[i] = 0;
		ifd->na_[i] = 0;
		ifd->active_[i] = NULL;
	}

	/*
	 * Initialize current packet state.
	 */
	ifd->qi_ = 0;
	ifd->qo_ = 0;
	for (i = 0; i < RM_MAXQUEUED; i++) {
		ifd->class_[i] = NULL;
		ifd->curlen_[i] = 0;
		ifd->borrowed_[i] = NULL;
	}

	/*
	 * Create the root class of the link-sharing structure.
	 */
	if ((ifd->root_ = rmc_newclass(0, ifd,
	    nsecPerByte,
	    rmc_root_overlimit, maxq, 0, 0,
	    maxidle, minidle, offtime,
	    0, 0)) == NULL) {
		printf("rmc_init: root class not allocated\n");
		return (ENOMEM);
	}
	ifd->root_->depth_ = 0;

	return (0);
}
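/*
 * An attach-time sketch (hypothetical driver code; 'sc' and
 * 'xx_restart_output' are assumptions, not part of this file):
 *
 *	error = rmc_init(&ifp->if_snd, &sc->ifd, 800,
 *	    xx_restart_output, 64, 1, maxidle, minidle, offtime,
 *	    RMCF_WRR);
 *
 * i.e., a 10Mb/s link (800 ns/byte), a 64-packet root/default queue,
 * at most one packet queued downstream, and WRR scheduling enabled.
 */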
/*
 * void
 * rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by
 *	mbuf 'm' to queue for resource class 'cl'.  This routine is called
 *	by a driver's if_output routine.  This routine must be called with
 *	output packet completion interrupts locked out (to avoid racing
 *	with rmc_dequeue_next).
 *
 *	Returns:	0 on successful queueing
 *			-1 when packet drop occurs
 */
int
rmc_queue_packet(struct rm_class *cl, mbuf_t *m)
{
	struct timeval now;
	struct rm_ifdat *ifd = cl->ifdat_;
	int cpri = cl->pri_;
	int is_empty = qempty(cl->q_);

	RM_GETTIME(now);
	if (ifd->cutoff_ > 0) {
		if (TV_LT(&cl->undertime_, &now)) {
			if (ifd->cutoff_ > cl->depth_)
				ifd->cutoff_ = cl->depth_;
			CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
		}
#if 1 /* ALTQ */
		else {
			/*
			 * the class is overlimit. if the class has
			 * underlimit ancestors, set cutoff to the lowest
			 * depth among them.
			 */
			struct rm_class *borrow = cl->borrow_;

			while (borrow != NULL &&
			    borrow->depth_ < ifd->cutoff_) {
				if (TV_LT(&borrow->undertime_, &now)) {
					ifd->cutoff_ = borrow->depth_;
					CBQTRACE(rmc_queue_packet, 'ffob',
					    ifd->cutoff_);
					break;
				}
				borrow = borrow->borrow_;
			}
		}
#else /* !ALTQ */
		else if ((ifd->cutoff_ > 1) && cl->borrow_) {
			if (TV_LT(&cl->borrow_->undertime_, &now)) {
				ifd->cutoff_ = cl->borrow_->depth_;
				CBQTRACE(rmc_queue_packet, 'ffob',
				    cl->borrow_->depth_);
			}
		}
#endif /* !ALTQ */
	}

	if (_rmc_addq(cl, m) < 0)
		/* failed */
		return (-1);

	if (is_empty) {
		CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle);
		ifd->na_[cpri]++;
	}

	if (qlen(cl->q_) > qlimit(cl->q_)) {
		/* note: qlimit can be set to 0 or 1 */
		rmc_drop_action(cl);
		return (-1);
	}
	return (0);
}
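/*
 * Example of the cutoff heuristics above (hypothetical state): suppose
 * cutoff_ == 2 and a packet arrives for an overlimit leaf (depth 0)
 * whose parent (depth 1) is underlimit.  The borrow-chain walk finds
 * the parent and lowers cutoff_ to 1, so the scheduler will let the
 * leaf borrow from the parent but not from the root on the next pass.
 * If instead the leaf itself were underlimit, rule 1 would have set
 * cutoff_ to the leaf's own depth, 0.
 */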
/*
 * void
 * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all
 *	classes to see if they are satisfied.
 */

static void
rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now)
{
	int i;
	rm_class_t *p, *bp;

	for (i = RM_MAXPRIO - 1; i >= 0; i--) {
		if ((bp = ifd->active_[i]) != NULL) {
			p = bp;
			do {
				if (!rmc_satisfied(p, now)) {
					ifd->cutoff_ = p->depth_;
					return;
				}
				p = p->peer_;
			} while (p != bp);
		}
	}

	reset_cutoff(ifd);
}

/*
 * rmc_satisfied - Return 1 if the class is satisfied.  0, otherwise.
 */

static int
rmc_satisfied(struct rm_class *cl, struct timeval *now)
{
	rm_class_t *p;

	if (cl == NULL)
		return (1);
	if (TV_LT(now, &cl->undertime_))
		return (1);
	if (cl->depth_ == 0) {
		if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_))
			return (0);
		else
			return (1);
	}
	if (cl->children_ != NULL) {
		p = cl->children_;
		while (p != NULL) {
			if (!rmc_satisfied(p, now))
				return (0);
			p = p->next_;
		}
	}

	return (1);
}

/*
 * Return 1 if class 'cl' is under limit or can borrow from a parent,
 * 0 if overlimit.  As a side-effect, this routine will invoke the
 * class overlimit action if the class is overlimit.
 */

static int
rmc_under_limit(struct rm_class *cl, struct timeval *now)
{
	rm_class_t *p = cl;
	rm_class_t *top;
	struct rm_ifdat *ifd = cl->ifdat_;

	ifd->borrowed_[ifd->qi_] = NULL;
	/*
	 * If cl is the root class, then always return that it is
	 * underlimit.  Otherwise, check to see if the class is underlimit.
	 */
	if (cl->parent_ == NULL)
		return (1);

	if (cl->sleeping_) {
		if (TV_LT(now, &cl->undertime_))
			return (0);

		CALLOUT_STOP(&cl->callout_);
		cl->sleeping_ = 0;
		cl->undertime_.tv_sec = 0;
		return (1);
	}

	top = NULL;
	while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) {
		if (((cl = cl->borrow_) == NULL) ||
		    (cl->depth_ > ifd->cutoff_)) {
#ifdef ADJUST_CUTOFF
			if (cl != NULL)
				/* cutoff is taking effect, just
				   return false without calling
				   the delay action. */
				return (0);
#endif
#ifdef BORROW_OFFTIME
			/*
			 * check if the class can borrow offtime too.
			 * borrow offtime from the top of the borrow
			 * chain if the top class is not overloaded.
			 */
			if (cl != NULL) {
				/* cutoff is taking effect, use this
				   class as top. */
				top = cl;
				CBQTRACE(rmc_under_limit, 'ffou',
				    ifd->cutoff_);
			}
			if (top != NULL && top->avgidle_ == top->minidle_)
				top = NULL;
			p->overtime_ = *now;
			(p->overlimit)(p, top);
#else
			p->overtime_ = *now;
			(p->overlimit)(p, NULL);
#endif
			return (0);
		}
		top = cl;
	}

	if (cl != p)
		ifd->borrowed_[ifd->qi_] = cl;
	return (1);
}
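/*
 * A borrow-chain walk example for rmc_under_limit() (assumed state):
 * a leaf at depth 0 is overlimit, its parent at depth 1 is also
 * overlimit, and the root is underlimit.  With cutoff_ == RM_MAXDEPTH
 * the loop climbs leaf -> parent -> root, stops at the root (whose
 * undertime_ is not set), records the root in borrowed_[qi_], and
 * returns 1.  With cutoff_ == 0, the parent's depth already exceeds
 * cutoff_, so the walk fails and 0 is returned without invoking the
 * delay action (the ADJUST_CUTOFF case).
 */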
/*
 * _rmc_wrr_dequeue_next() - This is the scheduler for WRR as opposed to
 *	packet-by-packet round robin.
 *
 * The heart of the weighted round-robin scheduler, which decides which
 * class next gets to send a packet.  Highest priority first, then
 * weighted round-robin within priorities.
 *
 * Each able-to-send class gets to send until its byte allocation is
 * exhausted.  Thus, the active pointer is only changed after a class has
 * exhausted its allocation.
 *
 * If the scheduler finds no class that is underlimit or able to borrow,
 * then the first class found that had a nonzero queue and is allowed to
 * borrow gets to send.
 */

static mbuf_t *
_rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op)
{
	struct rm_class *cl = NULL, *first = NULL;
	u_int deficit;
	int cpri;
	mbuf_t *m;
	struct timeval now;

	RM_GETTIME(now);

	/*
	 * if the driver polls the top of the queue and then removes
	 * the polled packet, we must return the same packet.
	 */
	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
		cl = ifd->pollcache_;
		cpri = cl->pri_;
		if (ifd->efficient_) {
			/* check if this class is overlimit */
			if (cl->undertime_.tv_sec != 0 &&
			    rmc_under_limit(cl, &now) == 0)
				first = cl;
		}
		ifd->pollcache_ = NULL;
		goto _wrr_out;
	}
	else {
		/* mode == ALTDQ_POLL || pollcache == NULL */
		ifd->pollcache_ = NULL;
		ifd->borrowed_[ifd->qi_] = NULL;
	}
#ifdef ADJUST_CUTOFF
 _again:
#endif
	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
		if (ifd->na_[cpri] == 0)
			continue;
		deficit = 0;
		/*
		 * Loop through twice for a priority level, if some class
		 * was unable to send a packet the first round because
		 * of the weighted round-robin mechanism.
		 * During the second loop at this level, deficit==2.
		 * (This second loop is not needed if for every class,
		 * "M[cl->pri_]" times "cl->allotment" is greater than
		 * the byte size for the largest packet in the class.)
		 */
 _wrr_loop:
		cl = ifd->active_[cpri];
		ASSERT(cl != NULL);
		do {
			if ((deficit < 2) && (cl->bytes_alloc_ <= 0))
				cl->bytes_alloc_ += cl->w_allotment_;
			if (!qempty(cl->q_)) {
				if ((cl->undertime_.tv_sec == 0) ||
				    rmc_under_limit(cl, &now)) {
					if (cl->bytes_alloc_ > 0 ||
					    deficit > 1)
						goto _wrr_out;

					/* underlimit but no alloc */
					deficit = 1;
#if 1
					ifd->borrowed_[ifd->qi_] = NULL;
#endif
				}
				else if (first == NULL && cl->borrow_ != NULL)
					first = cl; /* borrowing candidate */
			}

			cl->bytes_alloc_ = 0;
			cl = cl->peer_;
		} while (cl != ifd->active_[cpri]);

		if (deficit == 1) {
			/* first loop found an underlimit class with deficit */
			/* Loop on same priority level, with new deficit.  */
			deficit = 2;
			goto _wrr_loop;
		}
	}

#ifdef ADJUST_CUTOFF
	/*
	 * no underlimit class found.  if cutoff is taking effect,
	 * increase cutoff and try again.
	 */
	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
		ifd->cutoff_++;
		CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_);
		goto _again;
	}
#endif /* ADJUST_CUTOFF */
	/*
	 * If LINK_EFFICIENCY is turned on, then the first overlimit
	 * class we encounter will send a packet if all the classes
	 * of the link-sharing structure are overlimit.
	 */
	reset_cutoff(ifd);
	CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_);

	if (!ifd->efficient_ || first == NULL)
		return (NULL);

	cl = first;
	cpri = cl->pri_;
#if 0	/* too time-consuming for nothing */
	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);
	cl->sleeping_ = 0;
	cl->undertime_.tv_sec = 0;
#endif
	ifd->borrowed_[ifd->qi_] = cl->borrow_;
	ifd->cutoff_ = cl->borrow_->depth_;

	/*
	 * Dequeue the packet and do the bookkeeping...
	 */
 _wrr_out:
	if (op == ALTDQ_REMOVE) {
		m = _rmc_getq(cl);
		if (m == NULL)
			panic("_rmc_wrr_dequeue_next");
		if (qempty(cl->q_))
			ifd->na_[cpri]--;

		/*
		 * Update class statistics and link data.
		 */
		if (cl->bytes_alloc_ > 0)
			cl->bytes_alloc_ -= m_pktlen(m);

		if ((cl->bytes_alloc_ <= 0) || first == cl)
			ifd->active_[cl->pri_] = cl->peer_;
		else
			ifd->active_[cl->pri_] = cl;

		ifd->class_[ifd->qi_] = cl;
		ifd->curlen_[ifd->qi_] = m_pktlen(m);
		ifd->now_[ifd->qi_] = now;
		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
		ifd->queued_++;
	} else {
		/* mode == ALTDQ_POLL */
		m = _rmc_pollq(cl);
		ifd->pollcache_ = cl;
	}
	return (m);
}
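/*
 * A deficit example for the WRR loop above (assumed numbers): let a
 * class have w_allotment_ = 300 bytes but a 1500-byte packet at the
 * head of its queue.  After it sends once, bytes_alloc_ is -1200; on
 * the next scheduling pass one top-up (+300) still leaves it at -900,
 * so the class is passed over and deficit is set to 1.  The second
 * pass over the same priority (deficit == 2) lets the underlimit
 * class send anyway, so a low-weight class advances slowly instead of
 * stalling forever.
 */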
/*
 * Dequeue & return next packet from the highest priority class that
 * has a packet to send & has enough allocation to send it.  This
 * routine is called by a driver whenever it needs a new packet to
 * output.
 */
static mbuf_t *
_rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op)
{
	mbuf_t *m;
	int cpri;
	struct rm_class *cl, *first = NULL;
	struct timeval now;

	RM_GETTIME(now);

	/*
	 * if the driver polls the top of the queue and then removes
	 * the polled packet, we must return the same packet.
	 */
	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
		cl = ifd->pollcache_;
		cpri = cl->pri_;
		ifd->pollcache_ = NULL;
		goto _prr_out;
	} else {
		/* mode == ALTDQ_POLL || pollcache == NULL */
		ifd->pollcache_ = NULL;
		ifd->borrowed_[ifd->qi_] = NULL;
	}
#ifdef ADJUST_CUTOFF
 _again:
#endif
	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
		if (ifd->na_[cpri] == 0)
			continue;
		cl = ifd->active_[cpri];
		ASSERT(cl != NULL);
		do {
			if (!qempty(cl->q_)) {
				if ((cl->undertime_.tv_sec == 0) ||
				    rmc_under_limit(cl, &now))
					goto _prr_out;
				if (first == NULL && cl->borrow_ != NULL)
					first = cl;
			}
			cl = cl->peer_;
		} while (cl != ifd->active_[cpri]);
	}

#ifdef ADJUST_CUTOFF
	/*
	 * no underlimit class found.  if cutoff is taking effect, increase
	 * cutoff and try again.
	 */
	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
		ifd->cutoff_++;
		goto _again;
	}
#endif /* ADJUST_CUTOFF */
	/*
	 * If LINK_EFFICIENCY is turned on, then the first overlimit
	 * class we encounter will send a packet if all the classes
	 * of the link-sharing structure are overlimit.
	 */
	reset_cutoff(ifd);
	if (!ifd->efficient_ || first == NULL)
		return (NULL);

	cl = first;
	cpri = cl->pri_;
#if 0	/* too time-consuming for nothing */
	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);
	cl->sleeping_ = 0;
	cl->undertime_.tv_sec = 0;
#endif
	ifd->borrowed_[ifd->qi_] = cl->borrow_;
	ifd->cutoff_ = cl->borrow_->depth_;

	/*
	 * Dequeue the packet and do the bookkeeping...
	 */
 _prr_out:
	if (op == ALTDQ_REMOVE) {
		m = _rmc_getq(cl);
		if (m == NULL)
			panic("_rmc_prr_dequeue_next");
		if (qempty(cl->q_))
			ifd->na_[cpri]--;

		ifd->active_[cpri] = cl->peer_;

		ifd->class_[ifd->qi_] = cl;
		ifd->curlen_[ifd->qi_] = m_pktlen(m);
		ifd->now_[ifd->qi_] = now;
		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
		ifd->queued_++;
	} else {
		/* mode == ALTDQ_POLL */
		m = _rmc_pollq(cl);
		ifd->pollcache_ = cl;
	}
	return (m);
}

/*
 * mbuf_t *
 * rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this
 *	function is invoked by the packet driver to get the next packet to
 *	be dequeued and output on the link.  If WRR is enabled, then the
 *	WRR dequeue next routine will determine the next packet to be sent.
 *	Otherwise, packet-by-packet round robin is invoked.
 *
 *	Returns:	NULL, if a packet is not available or if all
 *			classes are overlimit.
 *
 *			Otherwise, Pointer to the next packet.
 */

mbuf_t *
rmc_dequeue_next(struct rm_ifdat *ifd, int mode)
{
	if (ifd->queued_ >= ifd->maxqueued_)
		return (NULL);
	else if (ifd->wrr_)
		return (_rmc_wrr_dequeue_next(ifd, mode));
	else
		return (_rmc_prr_dequeue_next(ifd, mode));
}
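/*
 * A driver-side sketch of the poll/remove contract (hypothetical
 * code): a driver may first peek at the next packet and later commit
 * to it; pollcache_ guarantees both calls return the same mbuf.
 *
 *	m = rmc_dequeue_next(&sc->ifd, ALTDQ_POLL);
 *	if (m != NULL && xx_tx_ready(sc)) {
 *		m = rmc_dequeue_next(&sc->ifd, ALTDQ_REMOVE);
 *		xx_start_output(sc, m);
 *	}
 *
 * where xx_tx_ready()/xx_start_output() stand in for driver-specific
 * logic.  rmc_update_class_util() is then called once the transmission
 * completes.
 */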
/*
 * Update the utilization estimate for the packet that just completed.
 * The packet's class & the parent(s) of that class all get their
 * estimators updated.  This routine is called by the driver's output-
 * packet-completion interrupt service routine.
 */

/*
 * a macro to approximate "divide by 1000" that gives 0.000999,
 * if a value has enough effective digits.
 * (on pentium, mul takes 9 cycles but div takes 46!)
 */
#define	NSEC_TO_USEC(t)	(((t) >> 10) + ((t) >> 16) + ((t) >> 17))
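/*
 * Why this works: 2^-10 + 2^-16 + 2^-17 = 0.00099945..., which is
 * within 0.06% of 1/1000.  For example (with exact integer shifts),
 * NSEC_TO_USEC(1000000) = 976 + 15 + 7 = 998, vs. the exact 1000;
 * the shifts also truncate, so the result reads slightly low.
 */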
void
rmc_update_class_util(struct rm_ifdat *ifd)
{
	int idle, avgidle, pktlen;
	int pkt_time, tidle;
	rm_class_t *cl, *borrowed;
	rm_class_t *borrows;
	struct timeval *nowp;

	/*
	 * Get the most recent completed class.
	 */
	if ((cl = ifd->class_[ifd->qo_]) == NULL)
		return;

	pktlen = ifd->curlen_[ifd->qo_];
	borrowed = ifd->borrowed_[ifd->qo_];
	borrows = borrowed;

	PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);

	/*
	 * Run estimator on class and its ancestors.
	 */
	/*
	 * rm_update_class_util is designed to be called when the
	 * transfer is completed from a xmit complete interrupt,
	 * but most drivers don't implement an upcall for that.
	 * so, just use estimated completion time.
	 * as a result, ifd->qi_ and ifd->qo_ are always synced.
	 */
	nowp = &ifd->now_[ifd->qo_];
	/* get pkt_time (for link) in usec */
#if 1  /* use approximation */
	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_;
	pkt_time = NSEC_TO_USEC(pkt_time);
#else
	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000;
#endif
#if 1 /* ALTQ4PPP */
	if (TV_LT(nowp, &ifd->ifnow_)) {
		int iftime;

		/*
		 * make sure the estimated completion time does not go
		 * too far.  it can happen when the link layer supports
		 * data compression or the interface speed is set to
		 * a much lower value.
		 */
		TV_DELTA(&ifd->ifnow_, nowp, iftime);
		if (iftime+pkt_time < ifd->maxiftime_) {
			TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
		} else {
			TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_);
		}
	} else {
		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
	}
#else
	if (TV_LT(nowp, &ifd->ifnow_)) {
		TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
	} else {
		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
	}
#endif

	while (cl != NULL) {
		TV_DELTA(&ifd->ifnow_, &cl->last_, idle);
		if (idle >= 2000000)
			/*
			 * this class is idle enough, reset avgidle.
			 * (TV_DELTA returns 2000000 us when delta is large.)
			 */
			cl->avgidle_ = cl->maxidle_;

		/* get pkt_time (for class) in usec */
#if 1  /* use approximation */
		pkt_time = pktlen * cl->ns_per_byte_;
		pkt_time = NSEC_TO_USEC(pkt_time);
#else
		pkt_time = pktlen * cl->ns_per_byte_ / 1000;
#endif
		idle -= pkt_time;

		avgidle = cl->avgidle_;
		avgidle += idle - (avgidle >> RM_FILTER_GAIN);
		cl->avgidle_ = avgidle;

		/* Are we overlimit ? */
		if (avgidle <= 0) {
			CBQTRACE(rmc_update_class_util, 'milo',
			    cl->stats_.handle);
#if 1 /* ALTQ */
			/*
			 * need some lower bound for avgidle, otherwise
			 * a borrowing class gets unbounded penalty.
			 */
			if (avgidle < cl->minidle_)
				avgidle = cl->avgidle_ = cl->minidle_;
#endif
			/* set next idle to make avgidle 0 */
			tidle = pkt_time +
			    (((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN);
			TV_ADD_DELTA(nowp, tidle, &cl->undertime_);
			++cl->stats_.over;
		} else {
			cl->avgidle_ =
			    (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle;
			cl->undertime_.tv_sec = 0;
			if (cl->sleeping_) {
				CALLOUT_STOP(&cl->callout_);
				cl->sleeping_ = 0;
			}
		}

		if (borrows != NULL) {
			if (borrows != cl)
				++cl->stats_.borrows;
			else
				borrows = NULL;
		}
		cl->last_ = ifd->ifnow_;
		cl->last_pkttime_ = pkt_time;

#if 1
		if (cl->parent_ == NULL) {
			/* take stats of root class */
			PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
		}
#endif

		cl = cl->parent_;
	}

	/*
	 * Check to see if cutoff needs to be set to a new level.
	 */
	cl = ifd->class_[ifd->qo_];
	if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) {
#if 1 /* ALTQ */
		if ((qlen(cl->q_) <= 0) ||
		    TV_LT(nowp, &borrowed->undertime_)) {
			rmc_tl_satisfied(ifd, nowp);
			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
		} else {
			ifd->cutoff_ = borrowed->depth_;
			CBQTRACE(rmc_update_class_util, 'ffob',
			    borrowed->depth_);
		}
#else /* !ALTQ */
		if ((qlen(cl->q_) <= 1) ||
		    TV_LT(&now, &borrowed->undertime_)) {
			reset_cutoff(ifd);
#ifdef notdef
			rmc_tl_satisfied(ifd, &now);
#endif
			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
		} else {
			ifd->cutoff_ = borrowed->depth_;
			CBQTRACE(rmc_update_class_util, 'ffob',
			    borrowed->depth_);
		}
#endif /* !ALTQ */
	}

	/*
	 * Release class slot
	 */
	ifd->borrowed_[ifd->qo_] = NULL;
	ifd->class_[ifd->qo_] = NULL;
	ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_;
	ifd->queued_--;
}
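/*
 * A numeric trace of the estimator above (assumed values, with
 * RM_FILTER_GAIN = 5, i.e. a decay of avgidle/32 per packet): take
 * avgidle_ = 3200 and a class sending slightly faster than its share,
 * so each completion measures idle = -100 us:
 *
 *	step 1: 3200 - 100 - (3200 >> 5) = 3000
 *	step 2: 3000 - 100 - (3000 >> 5) = 2807
 *
 * avgidle_ drifts downward and, once it reaches zero or below, the
 * class is marked overlimit and undertime_ is set so that avgidle_
 * would return to 0.
 */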
/*
 * void
 * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific)
 *	over-limit action routines.  These get invoked by rmc_under_limit()
 *	if a class with packets to send is over its bandwidth limit & can't
 *	borrow from a parent class.
 *
 *	Returns: NONE
 */

static void
rmc_drop_action(struct rm_class *cl)
{
	struct rm_ifdat *ifd = cl->ifdat_;

	ASSERT(qlen(cl->q_) > 0);
	_rmc_dropq(cl);
	if (qempty(cl->q_))
		ifd->na_[cl->pri_]--;
}

void
rmc_dropall(struct rm_class *cl)
{
	struct rm_ifdat *ifd = cl->ifdat_;

	if (!qempty(cl->q_)) {
		_flushq(cl->q_);

		ifd->na_[cl->pri_]--;
	}
}

#if (__FreeBSD_version > 300000)
static int tvhzto(struct timeval *);

static int
tvhzto(struct timeval *tv)
{
	struct timeval t2;

	getmicrotime(&t2);
	t2.tv_sec = tv->tv_sec - t2.tv_sec;
	t2.tv_usec = tv->tv_usec - t2.tv_usec;
	return (tvtohz(&t2));
}
#endif /* __FreeBSD_version > 300000 */

/*
 * void
 * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ
 *	delay action routine.  It is invoked via rmc_under_limit when the
 *	packet is discovered to be overlimit.
 *
 *	If the delay action is a result of the borrow class being overlimit,
 *	then delay for the offtime of the borrowing class that is overlimit.
 *
 *	Returns: NONE
 */

void
rmc_delay_action(struct rm_class *cl, struct rm_class *borrow)
{
	int	ndelay, t, extradelay;

	cl->stats_.overactions++;
	TV_DELTA(&cl->undertime_, &cl->overtime_, ndelay);
#ifndef BORROW_OFFTIME
	ndelay += cl->offtime_;
#endif

	if (!cl->sleeping_) {
		CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle);
#ifdef BORROW_OFFTIME
		if (borrow != NULL)
			extradelay = borrow->offtime_;
		else
#endif
			extradelay = cl->offtime_;

#ifdef ALTQ
		/*
		 * XXX recalculate suspend time:
		 * current undertime is (tidle + pkt_time) calculated
		 * from the last transmission.
		 *	tidle: time required to bring avgidle back to 0
		 *	pkt_time: target waiting time for this class
		 * we need to replace pkt_time by offtime
		 */
		extradelay -= cl->last_pkttime_;
#endif
		if (extradelay > 0) {
			TV_ADD_DELTA(&cl->undertime_, extradelay,
			    &cl->undertime_);
			ndelay += extradelay;
		}

		cl->sleeping_ = 1;
		cl->stats_.delays++;

		/*
		 * Since packets are phased randomly with respect to the
		 * clock, 1 tick (the next clock tick) can be an arbitrarily
		 * short time so we have to wait for at least two ticks.
		 * NOTE:  If there's no other traffic, we need the timer as
		 * a 'backstop' to restart this class.
		 */
		if (ndelay > tick * 2) {
#ifdef __FreeBSD__
			/* FreeBSD rounds up the tick */
			t = tvhzto(&cl->undertime_);
#else
			/* other BSDs round down the tick */
			t = tvhzto(&cl->undertime_) + 1;
#endif
		} else
			t = 2;
		CALLOUT_RESET(&cl->callout_, t,
		    (timeout_t *)rmc_restart, (void *)cl);
	}
}
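/*
 * A timing example for the callout above (assumed hz = 100, so
 * tick = 10000 us): an ndelay of 25000 us exceeds two ticks, so the
 * callout is scheduled from undertime_ (roughly 3 ticks here); an
 * ndelay of, say, 4000 us is shorter than two ticks and is rounded
 * up to the two-tick minimum.
 */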
/*
 * void
 * rmc_restart() - is just a helper routine for rmc_delay_action -- it is
 *	called by the system timer code & is responsible for checking if
 *	the class is still sleeping (it might have been restarted as a side
 *	effect of the queue scan on a packet arrival) and, if so, restarting
 *	output for the class.  Inspecting the class state & restarting
 *	output require locking the class structure.  In general the driver
 *	is responsible for locking but this is the only routine that is not
 *	called directly or indirectly from the interface driver so it has
 *	to know about system locking conventions.  Under BSD, locking is
 *	done by raising IPL to splnet so that's what's implemented here.
 *	On a different system this would probably need to be changed.
 *
 *	Returns:	NONE
 */

static void
rmc_restart(struct rm_class *cl)
{
	struct rm_ifdat *ifd = cl->ifdat_;
	int s;

	s = splnet();
	if (cl->sleeping_) {
		cl->sleeping_ = 0;
		cl->undertime_.tv_sec = 0;

		if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) {
			CBQTRACE(rmc_restart, 'trts', cl->stats_.handle);
			(ifd->restart)(ifd->ifq_);
		}
	}
	splx(s);
}

/*
 * void
 * rmc_root_overlimit(struct rm_class *cl) - This is the generic overlimit
 *	handling routine for the root class of the link sharing structure.
 *
 *	Returns: NONE
 */

static void
rmc_root_overlimit(struct rm_class *cl,
    struct rm_class *borrow)
{
	panic("rmc_root_overlimit");
}

/*
 * Packet Queue handling routines.  These localize the effects on the
 * rest of the code of whether queues are RED queues or drop-tail
 * queues.
 */
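/*
 * The underlying class_queue_t (see _addq()/_getq() at the bottom of
 * this file) is a circular, singly-linked list of mbufs where qtail(q)
 * points at the last packet and the last packet links back to the
 * head.  For example (illustrative): with packets a, b, c enqueued in
 * that order,
 *
 *	qtail(q) -> c,  c->m_nextpkt -> a,  a->m_nextpkt -> b,
 *	b->m_nextpkt -> c
 *
 * so _addq() can append and _getq() can pop the head, each in
 * constant time, while keeping only a single tail pointer.
 */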
static int
_rmc_addq(rm_class_t *cl, mbuf_t *m)
{
#ifdef ALTQ_RIO
	if (q_is_rio(cl->q_))
		return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_);
#endif
#ifdef ALTQ_RED
	if (q_is_red(cl->q_))
		return red_addq(cl->red_, cl->q_, m, cl->pktattr_);
#endif /* ALTQ_RED */

	if (cl->flags_ & RMCF_CLEARDSCP)
		write_dsfield(m, cl->pktattr_, 0);

	_addq(cl->q_, m);
	return (0);
}

/* note: _rmc_dropq is not called for red */
static void
_rmc_dropq(rm_class_t *cl)
{
	mbuf_t *m;

	if ((m = _getq(cl->q_)) != NULL)
		m_freem(m);
}

static mbuf_t *
_rmc_getq(rm_class_t *cl)
{
#ifdef ALTQ_RIO
	if (q_is_rio(cl->q_))
		return rio_getq((rio_t *)cl->red_, cl->q_);
#endif
#ifdef ALTQ_RED
	if (q_is_red(cl->q_))
		return red_getq(cl->red_, cl->q_);
#endif
	return _getq(cl->q_);
}

static mbuf_t *
_rmc_pollq(rm_class_t *cl)
{
	return qhead(cl->q_);
}

#ifdef CBQ_TRACE

struct cbqtrace cbqtrace_buffer[NCBQTRACE+1];
struct cbqtrace *cbqtrace_ptr = NULL;
int cbqtrace_count;

/*
 * DDB hook to trace cbq events:
 *  the last 1024 events are held in a circular buffer.
 *  use "call cbqtrace_dump(N)" to display 20 events from Nth event.
 */
void cbqtrace_dump(int);
static char *rmc_funcname(void *);

static struct rmc_funcs {
	void	*func;
	char	*name;
} rmc_funcs[] =
{
	rmc_init,		"rmc_init",
	rmc_queue_packet,	"rmc_queue_packet",
	rmc_under_limit,	"rmc_under_limit",
	rmc_update_class_util,	"rmc_update_class_util",
	rmc_delay_action,	"rmc_delay_action",
	rmc_restart,		"rmc_restart",
	_rmc_wrr_dequeue_next,	"_rmc_wrr_dequeue_next",
	NULL,			NULL
};

static char *
rmc_funcname(void *func)
{
	struct rmc_funcs *fp;

	for (fp = rmc_funcs; fp->func != NULL; fp++)
		if (fp->func == func)
			return (fp->name);
	return ("unknown");
}

void
cbqtrace_dump(int counter)
{
	int	 i, *p;
	char	*cp;

	counter = counter % NCBQTRACE;
	p = (int *)&cbqtrace_buffer[counter];

	for (i=0; i<20; i++) {
		printf("[0x%x] ", *p++);
		printf("%s: ", rmc_funcname((void *)*p++));
		cp = (char *)p++;
		printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]);
		printf("%d\n",*p++);

		if (p >= (int *)&cbqtrace_buffer[NCBQTRACE])
			p = (int *)cbqtrace_buffer;
	}
}
#endif /* CBQ_TRACE */
#endif /* ALTQ_CBQ */

#if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || defined(ALTQ_HFSC) || defined(ALTQ_PRIQ)
#if !defined(__GNUC__) || defined(ALTQ_DEBUG)

void
_addq(class_queue_t *q, mbuf_t *m)
{
	mbuf_t	*m0;

	if ((m0 = qtail(q)) != NULL)
		m->m_nextpkt = m0->m_nextpkt;
	else
		m0 = m;
	m0->m_nextpkt = m;
	qtail(q) = m;
	qlen(q)++;
}

mbuf_t *
_getq(class_queue_t *q)
{
	mbuf_t	*m, *m0;

	if ((m = qtail(q)) == NULL)
		return (NULL);
	if ((m0 = m->m_nextpkt) != m)
		m->m_nextpkt = m0->m_nextpkt;
	else {
		ASSERT(qlen(q) == 1);
		qtail(q) = NULL;
	}
	qlen(q)--;
	m0->m_nextpkt = NULL;
	return (m0);
}

/* drop a packet at the tail of the queue */
mbuf_t *
_getq_tail(class_queue_t *q)
{
	mbuf_t	*m, *m0, *prev;

	if ((m = m0 = qtail(q)) == NULL)
		return NULL;
	do {
		prev = m0;
		m0 = m0->m_nextpkt;
	} while (m0 != m);
	prev->m_nextpkt = m->m_nextpkt;
	if (prev == m) {
		ASSERT(qlen(q) == 1);
		qtail(q) = NULL;
	} else
		qtail(q) = prev;
	qlen(q)--;
	m->m_nextpkt = NULL;
	return (m);
}

/* randomly select a packet in the queue */
mbuf_t *
_getq_random(class_queue_t *q)
{
	struct mbuf	*m;
	int		 i, n;

	if ((m = qtail(q)) == NULL)
		return NULL;
	if (m->m_nextpkt == m) {
		ASSERT(qlen(q) == 1);
		qtail(q) = NULL;
	} else {
		struct mbuf *prev = NULL;

		n = cprng_fast32() % qlen(q) + 1;
		for (i = 0; i < n; i++) {
			prev = m;
			m = m->m_nextpkt;
		}
		prev->m_nextpkt = m->m_nextpkt;
		if (m == qtail(q))
			qtail(q) = prev;
	}
	qlen(q)--;
	m->m_nextpkt = NULL;
	return (m);
}

void
_removeq(class_queue_t *q, mbuf_t *m)
{
	mbuf_t	*m0, *prev;

	m0 = qtail(q);
	do {
		prev = m0;
		m0 = m0->m_nextpkt;
	} while (m0 != m);
	prev->m_nextpkt = m->m_nextpkt;
	if (prev == m)
		qtail(q) = NULL;
	else if (qtail(q) == m)
		qtail(q) = prev;
	qlen(q)--;
}

void
_flushq(class_queue_t *q)
{
	mbuf_t *m;

	while ((m = _getq(q)) != NULL)
		m_freem(m);
	ASSERT(qlen(q) == 0);
}

#endif /* !__GNUC__ || ALTQ_DEBUG */
#endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ */