/*	$NetBSD: altq_rmclass.c,v 1.32 2025/01/08 13:00:04 joe Exp $	*/
/*	$KAME: altq_rmclass.c,v 1.19 2005/04/13 03:44:25 suz Exp $	*/

/*
 * Copyright (c) 1991-1997 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the Network Research
 *	Group at Lawrence Berkeley Laboratory.
 * 4. Neither the name of the University nor of the Laboratory may be used
 *    to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * LBL code modified by speer@eng.sun.com, May 1997.
 * For questions and/or comments, please send mail to cbq@ee.lbl.gov
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: altq_rmclass.c,v 1.32 2025/01/08 13:00:04 joe Exp $");

/* #ident "@(#)rm_class.c  1.48     97/12/05 SMI" */

#ifdef _KERNEL_OPT
#include "opt_altq.h"
#include "opt_inet.h"
#endif

#ifdef ALTQ_CBQ	/* cbq is enabled by ALTQ_CBQ option in opt_altq.h */

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/time.h>
#ifdef ALTQ3_COMPAT
#include <sys/kernel.h>
#endif
#include <sys/cprng.h>

#include <net/if.h>
#include <net/if_types.h>
#ifdef ALTQ3_COMPAT
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#endif

#include <altq/altq.h>
#include <altq/altq_rmclass.h>
#include <altq/altq_rmclass_debug.h>
#include <altq/altq_red.h>
#include <altq/altq_rio.h>

/*
 * Local Macros
 */

#define	reset_cutoff(ifd)	{ ifd->cutoff_ = RM_MAXDEPTH; }

#define	PSEC_TO_NSEC(t)	((t) / 1000)

/*
 * Local routines.
 */

static int	rmc_satisfied(struct rm_class *, struct timespec *);
static void	rmc_wrr_set_weights(struct rm_ifdat *);
static void	rmc_depth_compute(struct rm_class *);
static void	rmc_depth_recompute(rm_class_t *);

static mbuf_t	*_rmc_wrr_dequeue_next(struct rm_ifdat *, int);
static mbuf_t	*_rmc_prr_dequeue_next(struct rm_ifdat *, int);

static int	_rmc_addq(rm_class_t *, mbuf_t *);
static void	_rmc_dropq(rm_class_t *);
static mbuf_t	*_rmc_getq(rm_class_t *);
static mbuf_t	*_rmc_pollq(rm_class_t *);

static int	rmc_under_limit(struct rm_class *, struct timespec *);
static void	rmc_tl_satisfied(struct rm_ifdat *, struct timespec *);
static void	rmc_drop_action(struct rm_class *);
static void	rmc_restart(struct rm_class *);
static void	rmc_root_overlimit(struct rm_class *, struct rm_class *);

#define	BORROW_OFFTIME
/*
 * BORROW_OFFTIME (experimental):
 * borrow the offtime of the class borrowing from.
 * the reason is that when its own offtime is set, the class is unable
 * to borrow much, especially when cutoff is taking effect.
 * but when the borrowed class is overloaded (avgidle is close to minidle),
 * use the borrowing class's offtime to avoid overload.
 */
#define	ADJUST_CUTOFF
/*
 * ADJUST_CUTOFF (experimental):
 * if no underlimit class is found due to cutoff, increase cutoff and
 * retry the scheduling loop.
 * also, don't invoke delay_actions while cutoff is taking effect,
 * since a sleeping class won't have a chance to be scheduled in the
 * next loop.
 *
 * now the heuristics for setting the top-level variable (cutoff_) become:
 *	1. if a packet arrives for a not-overlimit class, set cutoff
 *	   to the depth of the class.
 *	2. if cutoff is i, and a packet arrives for an overlimit class
 *	   with an underlimit ancestor at a lower level than i (say j),
 *	   then set cutoff to j.
 *	3. at scheduling a packet, if there is no underlimit class
 *	   due to the current cutoff level, increase cutoff by 1 and
 *	   then try to schedule again.
 */

/*
 * rm_class_t *
 * rmc_newclass(...) - Create a new resource management class at priority
 * 'pri' on the interface given by 'ifd'.
 *
 * psecPerByte	is the data rate of the interface in picoseconds/byte.
 *		E.g., 800000 for a 10Mb/s ethernet.  If the class gets less
 *		than 100% of the bandwidth, this number should be the
 *		'effective' rate for the class.  Let f be the
 *		bandwidth fraction allocated to this class, and let
 *		psPerByte be the data rate of the output link in
 *		picoseconds/byte.  Then psecPerByte is set to
 *		psPerByte / f.  E.g., 1600000 (= 800000 / .5)
 *		for a class that gets 50% of an ethernet's bandwidth.
 *
 * action	the routine to call when the class is over limit.
 *
 * maxq		max allowable queue size for class (in packets).
 *
 * parent	parent class pointer.
 *
 * borrow	class to borrow from (should be either 'parent' or null).
 *
 * maxidle	max value allowed for class 'idle' time estimate (this
 *		parameter determines how large an initial burst of packets
 *		can be before overlimit action is invoked).
 *
 * offtime	how long 'delay' action will delay when class goes over
 *		limit (this parameter determines the steady-state burst
 *		size when a class is running over its limit).
 *
 * Maxidle and offtime have to be computed from the following:  If the
 * average packet size is s, the bandwidth fraction allocated to this
 * class is f, we want to allow b packet bursts, and the gain of the
 * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
 *
 *	ptime = s * nsPerByte * (1 - f) / f
 *	maxidle = ptime * (1 - g^b) / g^b
 *	minidle = -ptime * (1 / (f - 1))
 *	offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1))
 *
 * Operationally, it's convenient to specify maxidle & offtime in units
 * independent of the link bandwidth so the maxidle & offtime passed to
 * this routine are the above values multiplied by 8*f/(1000*nsPerByte).
 * (The constant factor is a scale factor needed to make the parameters
 * integers.  This scaling also means that the 'unscaled' values of
 * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
 * not nanoseconds.)  Also note that the 'idle' filter computation keeps
 * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
 * maxidle also must be scaled upward by this value.  Thus, the passed
 * values for maxidle and offtime can be computed as follows:
 *
 * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
 * offtime = offtime * 8 / (1000 * nsecPerByte)
 *
 * When USE_HRTIME is employed, then maxidle and offtime become:
 *	maxidle = maxidle * (8.0 / nsecPerByte);
 *	offtime = offtime * (8.0 / nsecPerByte);
 */
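/*
 * Worked example (illustrative numbers only, not normative): consider a
 * class given f = 50% of a 10Mb/s ethernet (nsPerByte = 800, so the
 * class rate is 1600 ns/byte), an average packet size s = 1000 bytes,
 * a burst allowance b = 2 packets, and RM_FILTER_GAIN = 5, i.e.,
 * g = 1 - 2^-5 = 0.96875.  Then:
 *
 *	ptime   = 1000 * 800 * (1 - 0.5) / 0.5 = 800000 ns
 *	g^b     = 0.96875^2 ~= 0.938
 *	maxidle = 800000 * (1 - 0.938) / 0.938 ~= 52400 ns
 *
 * i.e., the class may accumulate roughly 52 usec of idle credit before
 * its overlimit action is invoked; the value actually passed in is then
 * scaled as described above.
 */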
struct rm_class *
rmc_newclass(int pri, struct rm_ifdat *ifd, uint64_t psecPerByte,
    void (*action)(rm_class_t *, rm_class_t *), int maxq,
    struct rm_class *parent, struct rm_class *borrow, u_int maxidle,
    int minidle, u_int offtime, int pktsize, int flags)
{
	struct rm_class *cl;
	struct rm_class *peer;
	int s;

	if (pri >= RM_MAXPRIO)
		return NULL;
#ifndef ALTQ_RED
	if (flags & RMCF_RED) {
#ifdef ALTQ_DEBUG
		printf("rmc_newclass: RED not configured for CBQ!\n");
#endif
		return NULL;
	}
#endif
#ifndef ALTQ_RIO
	if (flags & RMCF_RIO) {
#ifdef ALTQ_DEBUG
		printf("rmc_newclass: RIO not configured for CBQ!\n");
#endif
		return NULL;
	}
#endif

	cl = malloc(sizeof(struct rm_class), M_DEVBUF, M_WAITOK|M_ZERO);
	if (cl == NULL)
		return NULL;
	CALLOUT_INIT(&cl->callout_);

	cl->q_ = malloc(sizeof(class_queue_t), M_DEVBUF, M_WAITOK|M_ZERO);
	if (cl->q_ == NULL) {
		free(cl, M_DEVBUF);
		return NULL;
	}

	/*
	 * Class initialization.
	 */
	cl->children_ = NULL;
	cl->parent_ = parent;
	cl->borrow_ = borrow;
	cl->leaf_ = 1;
	cl->ifdat_ = ifd;
	cl->pri_ = pri;
	cl->allotment_ = (u_int)(RM_PS_PER_SEC / psecPerByte); /* Bytes per sec */
	cl->depth_ = 0;
	cl->qthresh_ = 0;
	cl->ps_per_byte_ = psecPerByte;

	qlimit(cl->q_) = maxq;
	qtype(cl->q_) = Q_DROPHEAD;
	qlen(cl->q_) = 0;
	cl->flags_ = flags;

#if 1 /* minidle is also scaled in ALTQ */
	cl->minidle_ = ((int64_t)minidle * (int64_t)psecPerByte) / 8;
	if (cl->minidle_ > 0)
		cl->minidle_ = 0;
#else
	cl->minidle_ = minidle;
#endif
	cl->maxidle_ = ((int64_t)maxidle * (int64_t)psecPerByte) / 8;
	if (cl->maxidle_ == 0)
		cl->maxidle_ = 1;
#if 1 /* offtime is also scaled in ALTQ */
	cl->avgidle_ = cl->maxidle_;
	cl->offtime_ = (((int64_t)offtime * (int64_t)psecPerByte) / 8) >> RM_FILTER_GAIN;
	if (cl->offtime_ == 0)
		cl->offtime_ = 1;
#else
	cl->avgidle_ = 0;
	cl->offtime_ = (offtime * nsecPerByte) / 8;
#endif
	cl->overlimit = action;

#ifdef ALTQ_RED
	if (flags & (RMCF_RED|RMCF_RIO)) {
		int red_flags, red_pkttime;

		red_flags = 0;
		if (flags & RMCF_ECN)
			red_flags |= REDF_ECN;
		if (flags & RMCF_FLOWVALVE)
			red_flags |= REDF_FLOWVALVE;
#ifdef ALTQ_RIO
		if (flags & RMCF_CLEARDSCP)
			red_flags |= RIOF_CLEARDSCP;
#endif
		red_pkttime = PSEC_TO_NSEC(psecPerByte) * pktsize / 1000;

		if (flags & RMCF_RED) {
			cl->red_ = red_alloc(0, 0,
			    qlimit(cl->q_) * 10/100,
			    qlimit(cl->q_) * 30/100,
			    red_flags, red_pkttime);
			if (cl->red_ != NULL)
				qtype(cl->q_) = Q_RED;
		}
#ifdef ALTQ_RIO
		else {
			cl->red_ = (red_t *)rio_alloc(0, NULL,
			    red_flags, red_pkttime);
			if (cl->red_ != NULL)
				qtype(cl->q_) = Q_RIO;
		}
#endif
	}
#endif /* ALTQ_RED */

	/*
	 * put the class into the class tree
	 */
	s = splnet();
	if ((peer = ifd->active_[pri]) != NULL) {
		/* find the last class at this pri */
		cl->peer_ = peer;
		while (peer->peer_ != ifd->active_[pri])
			peer = peer->peer_;
		peer->peer_ = cl;
	} else {
		ifd->active_[pri] = cl;
		cl->peer_ = cl;
	}

	if (cl->parent_) {
		cl->next_ = parent->children_;
		parent->children_ = cl;
		parent->leaf_ = 0;
	}

	/*
	 * Compute the depth of this class and its ancestors in the class
	 * hierarchy.
	 */
	rmc_depth_compute(cl);

	/*
	 * If CBQ's WRR is enabled, then initialize the class WRR state.
	 */
	if (ifd->wrr_) {
		ifd->num_[pri]++;
		ifd->alloc_[pri] += cl->allotment_;
		rmc_wrr_set_weights(ifd);
	}
	splx(s);
	return cl;
}
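/*
 * Example (a sketch only; 'link_pspb' and 'class_pspb' are hypothetical
 * locals): deriving the psecPerByte argument for rmc_newclass() from a
 * link bandwidth.  A 10Mb/s link moves 1.25e6 bytes/sec, so:
 *
 *	uint64_t link_pspb  = RM_PS_PER_SEC / 1250000;	// 800000 ps/byte
 *	uint64_t class_pspb = link_pspb * 2;		// 50% share => 1600000
 *
 * and a leaf class borrowing from its parent could then be created with
 *
 *	cl = rmc_newclass(pri, ifd, class_pspb, rmc_delay_action, 64,
 *	    parent, parent, maxidle, minidle, offtime, 0, 0);
 */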
int
rmc_modclass(struct rm_class *cl, uint64_t psecPerByte, int maxq, u_int maxidle,
    int minidle, u_int offtime, int pktsize)
{
	struct rm_ifdat *ifd;
	u_int old_allotment;
	int s;

	ifd = cl->ifdat_;
	old_allotment = cl->allotment_;

	s = splnet();
	cl->allotment_ = (u_int)(RM_PS_PER_SEC / psecPerByte); /* Bytes per sec */
	cl->qthresh_ = 0;
	cl->ps_per_byte_ = psecPerByte;

	qlimit(cl->q_) = maxq;

#if 1 /* minidle is also scaled in ALTQ */
	cl->minidle_ = ((int64_t)minidle * (int64_t)psecPerByte) / 8;
	if (cl->minidle_ > 0)
		cl->minidle_ = 0;
#else
	cl->minidle_ = minidle;
#endif
	cl->maxidle_ = ((int64_t)maxidle * (int64_t)psecPerByte) / 8;
	if (cl->maxidle_ == 0)
		cl->maxidle_ = 1;
#if 1 /* offtime is also scaled in ALTQ */
	cl->avgidle_ = cl->maxidle_;
	cl->offtime_ = (((int64_t)offtime * (int64_t)psecPerByte) / 8) >> RM_FILTER_GAIN;
	if (cl->offtime_ == 0)
		cl->offtime_ = 1;
#else
	cl->avgidle_ = 0;
	cl->offtime_ = (offtime * nsecPerByte) / 8;
#endif

	/*
	 * If CBQ's WRR is enabled, then initialize the class WRR state.
	 */
	if (ifd->wrr_) {
		ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment;
		rmc_wrr_set_weights(ifd);
	}
	splx(s);
	return 0;
}

/*
 * static void
 * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes
 * the appropriate round-robin weights for the CBQ weighted round-robin
 * algorithm.
 *
 * Returns:	NONE
 */

static void
rmc_wrr_set_weights(struct rm_ifdat *ifd)
{
	int i;
	struct rm_class *cl, *clh;

	for (i = 0; i < RM_MAXPRIO; i++) {
		/*
		 * This is inverted from that of the simulator to
		 * maintain precision.
		 */
		if (ifd->num_[i] == 0)
			ifd->M_[i] = 0;
		else
			ifd->M_[i] = ifd->alloc_[i] /
			    (ifd->num_[i] * ifd->maxpkt_);
		/*
		 * Compute the weighted allotment for each class.
		 * This takes the expensive div instruction out
		 * of the main loop for the wrr scheduling path.
		 * These only get recomputed when a class comes or
		 * goes.
		 */
		if (ifd->active_[i] != NULL) {
			clh = cl = ifd->active_[i];
			do {
				/* safe-guard for slow link or alloc_ == 0 */
				if (ifd->M_[i] == 0)
					cl->w_allotment_ = 0;
				else
					cl->w_allotment_ = cl->allotment_ /
					    ifd->M_[i];
				cl = cl->peer_;
			} while ((cl != NULL) && (cl != clh));
		}
	}
}
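/*
 * Worked example of the weight computation above (illustrative numbers):
 * two classes at one priority with allotments of 750000 and 250000
 * bytes/sec (so alloc_ = 1000000) on a link with maxpkt_ = 1500 give
 *
 *	M_[i]        = 1000000 / (2 * 1500) = 333
 *	w_allotment_ = 750000 / 333 = 2252   (for the 75% class)
 *	w_allotment_ = 250000 / 333 =  750   (for the 25% class)
 *
 * so one full round at this priority hands out about num_ * maxpkt_
 * (~3000) bytes, split in proportion to the configured shares.
 */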
int
rmc_get_weight(struct rm_ifdat *ifd, int pri)
{
	if ((pri >= 0) && (pri < RM_MAXPRIO))
		return (ifd->M_[pri]);
	else
		return 0;
}

/*
 * static void
 * rmc_depth_compute(struct rm_class *cl) - This function computes the
 * appropriate depth of class 'cl' and its ancestors.
 *
 * Returns:	NONE
 */

static void
rmc_depth_compute(struct rm_class *cl)
{
	rm_class_t *t = cl, *p;

	/*
	 * Recompute the depth for the branch of the tree.
	 */
	while (t != NULL) {
		p = t->parent_;
		if (p && (t->depth_ >= p->depth_)) {
			p->depth_ = t->depth_ + 1;
			t = p;
		} else
			t = NULL;
	}
}
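/*
 * Example: a leaf has depth 0 and each parent's depth is one more than
 * the maximum depth among its children, so for
 *
 *	root (depth 2)
 *	 +-- agency (depth 1)
 *	 |    +-- leaf (depth 0)
 *	 +-- leaf (depth 0)
 *
 * adding a new leaf under 'agency' walks up from the leaf, bumping an
 * ancestor only while its depth is not already larger than its child's.
 */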
/*
 * static void
 * rmc_depth_recompute(struct rm_class *cl) - This function re-computes
 * the depth of the tree after a class has been deleted.
 *
 * Returns:	NONE
 */

static void
rmc_depth_recompute(rm_class_t *cl)
{
#if 1 /* ALTQ */
	rm_class_t *p, *t;

	p = cl;
	while (p != NULL) {
		if ((t = p->children_) == NULL) {
			p->depth_ = 0;
		} else {
			int cdepth = 0;

			while (t != NULL) {
				if (t->depth_ > cdepth)
					cdepth = t->depth_;
				t = t->next_;
			}

			if (p->depth_ == cdepth + 1)
				/* no change to this parent */
				return;

			p->depth_ = cdepth + 1;
		}

		p = p->parent_;
	}
#else
	rm_class_t *t;

	if (cl->depth_ >= 1) {
		if (cl->children_ == NULL) {
			cl->depth_ = 0;
		} else if ((t = cl->children_) != NULL) {
			while (t != NULL) {
				if (t->children_ != NULL)
					rmc_depth_recompute(t);
				t = t->next_;
			}
		} else
			rmc_depth_compute(cl);
	}
#endif
}

/*
 * void
 * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This
 * function deletes a class from the link-sharing structure and frees
 * all resources associated with the class.
 *
 * Returns:	NONE
 */

void
rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl)
{
	struct rm_class *p, *head, *previous;
	int s;

	ASSERT(cl->children_ == NULL);

	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);

	s = splnet();
	/*
	 * Free packets in the packet queue.
	 * XXX - this may not be a desired behavior.  Packets should be
	 *       re-queued.
	 */
	rmc_dropall(cl);

	/*
	 * If the class has a parent, then remove the class from the
	 * parent's children chain.
	 */
	if (cl->parent_ != NULL) {
		head = cl->parent_->children_;
		p = previous = head;
		if (head->next_ == NULL) {
			ASSERT(head == cl);
			cl->parent_->children_ = NULL;
			cl->parent_->leaf_ = 1;
		} else while (p != NULL) {
			if (p == cl) {
				if (cl == head)
					cl->parent_->children_ = cl->next_;
				else
					previous->next_ = cl->next_;
				cl->next_ = NULL;
				p = NULL;
			} else {
				previous = p;
				p = p->next_;
			}
		}
	}

	/*
	 * Delete class from class priority peer list.
	 */
	if ((p = ifd->active_[cl->pri_]) != NULL) {
		/*
		 * If there is more than one member of this priority
		 * level, then look for class(cl) in the priority level.
		 */
		if (p != p->peer_) {
			while (p->peer_ != cl)
				p = p->peer_;
			p->peer_ = cl->peer_;

			if (ifd->active_[cl->pri_] == cl)
				ifd->active_[cl->pri_] = cl->peer_;
		} else {
			ASSERT(p == cl);
			ifd->active_[cl->pri_] = NULL;
		}
	}

	/*
	 * Recompute the WRR weights.
	 */
	if (ifd->wrr_) {
		ifd->alloc_[cl->pri_] -= cl->allotment_;
		ifd->num_[cl->pri_]--;
		rmc_wrr_set_weights(ifd);
	}

	/*
	 * Re-compute the depth of the tree.
	 */
#if 1 /* ALTQ */
	rmc_depth_recompute(cl->parent_);
#else
	rmc_depth_recompute(ifd->root_);
#endif

	splx(s);

	/*
	 * Free the class structure.
	 */
	if (cl->red_ != NULL) {
#ifdef ALTQ_RIO
		if (q_is_rio(cl->q_))
			rio_destroy((rio_t *)cl->red_);
#endif
#ifdef ALTQ_RED
		if (q_is_red(cl->q_))
			red_destroy(cl->red_);
#endif
	}
	free(cl->q_, M_DEVBUF);
	free(cl, M_DEVBUF);
}


/*
 * int
 * rmc_init(...) - Initialize the resource management data structures
 *	associated with the output portion of interface 'ifp'.  'ifd' is
 *	where the structures will be built (for backwards compatibility, the
 *	structures aren't kept in the ifnet struct).  'psecPerByte'
 *	gives the link speed (inverse of bandwidth) in picoseconds/byte.
 *	'restart' is the driver-specific routine that the generic 'delay
 *	until under limit' action will call to restart output.  `maxq'
 *	is the queue size of the 'link' & 'default' classes.  'maxqueued'
 *	is the maximum number of packets that the resource management
 *	code will allow to be queued 'downstream' (this is typically 1).
 *
 * Returns:	0 on success
 */

int
rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, uint64_t psecPerByte,
    void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle,
    int minidle, u_int offtime, int flags)
{
	int i, mtu;

	/*
	 * Initialize the CBQ tracing/debug facility.
	 */
	CBQTRACEINIT();

	mtu = ifq->altq_ifp->if_mtu;
	if (mtu < 1) {
		printf("altq: %s: invalid MTU (interface not initialized?)\n",
		    ifq->altq_ifp->if_xname);
		return EINVAL;
	}

	(void)memset((char *)ifd, 0, sizeof (*ifd));
	ifd->ifq_ = ifq;
	ifd->restart = restart;
	ifd->maxqueued_ = maxqueued;
	ifd->ps_per_byte_ = psecPerByte;
	ifd->maxpkt_ = mtu;
	ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0;
	ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0;
#if 1
	ifd->maxiftime_ = mtu * psecPerByte / 1000 / 1000 * 16;
	if ((int64_t)mtu * psecPerByte > (int64_t)10 * 1000000000)
		ifd->maxiftime_ /= 4;
#endif

	reset_cutoff(ifd);
	CBQTRACE(rmc_init, 'INIT', ifd->cutoff_);

	/*
	 * Initialize the CBQ's WRR state.
	 */
	for (i = 0; i < RM_MAXPRIO; i++) {
		ifd->alloc_[i] = 0;
		ifd->M_[i] = 0;
		ifd->num_[i] = 0;
		ifd->na_[i] = 0;
		ifd->active_[i] = NULL;
	}

	/*
	 * Initialize current packet state.
	 */
	ifd->qi_ = 0;
	ifd->qo_ = 0;
	for (i = 0; i < RM_MAXQUEUED; i++) {
		ifd->class_[i] = NULL;
		ifd->curlen_[i] = 0;
		ifd->borrowed_[i] = NULL;
	}

	/*
	 * Create the root class of the link-sharing structure.
	 */
	if ((ifd->root_ = rmc_newclass(0, ifd,
	    psecPerByte,
	    rmc_root_overlimit, maxq, 0, 0,
	    maxidle, minidle, offtime,
	    0, 0)) == NULL) {
		printf("rmc_init: root class not allocated\n");
		return ENOMEM;
	}
	ifd->root_->depth_ = 0;

	return 0;
}
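/*
 * Typical attach-time sequence (a sketch; 'cbq_restart' and 'link_pspb'
 * are hypothetical, and error handling is omitted):
 *
 *	rmc_init(ifq, ifd, link_pspb, cbq_restart, 64, 1,
 *	    maxidle, minidle, offtime, RMCF_WRR);
 *	cl = rmc_newclass(1, ifd, link_pspb, rmc_delay_action, 64,
 *	    ifd->root_, ifd->root_, maxidle, minidle, offtime, 0, 0);
 *
 * rmc_init() creates the root class itself; additional classes hang off
 * ifd->root_ (or deeper parents) via rmc_newclass().
 */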
/*
 * void
 * rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by
 *	mbuf 'm' to queue for resource class 'cl'.  This routine is called
 *	by a driver's if_output routine.  This routine must be called with
 *	output packet completion interrupts locked out (to avoid racing with
 *	rmc_dequeue_next).
 *
 * Returns:	0 on successful queueing
 *		-1 when packet drop occurs
 */
int
rmc_queue_packet(struct rm_class *cl, mbuf_t *m)
{
	struct timespec now;
	struct rm_ifdat *ifd = cl->ifdat_;
	int cpri = cl->pri_;
	int is_empty = qempty(cl->q_);

	RM_GETTIME(now);
	if (ifd->cutoff_ > 0) {
		if (TS_LT(&cl->undertime_, &now)) {
			if (ifd->cutoff_ > cl->depth_)
				ifd->cutoff_ = cl->depth_;
			CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
		}
#if 1 /* ALTQ */
		else {
			/*
			 * the class is overlimit. if the class has
			 * underlimit ancestors, set cutoff to the lowest
			 * depth among them.
			 */
			struct rm_class *borrow = cl->borrow_;

			while (borrow != NULL &&
			    borrow->depth_ < ifd->cutoff_) {
				if (TS_LT(&borrow->undertime_, &now)) {
					ifd->cutoff_ = borrow->depth_;
					CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_);
					break;
				}
				borrow = borrow->borrow_;
			}
		}
#else /* !ALTQ */
		else if ((ifd->cutoff_ > 1) && cl->borrow_) {
			if (TS_LT(&cl->borrow_->undertime_, &now)) {
				ifd->cutoff_ = cl->borrow_->depth_;
				CBQTRACE(rmc_queue_packet, 'ffob',
				    cl->borrow_->depth_);
			}
		}
#endif /* !ALTQ */
	}

	if (_rmc_addq(cl, m) < 0)
		/* failed */
		return -1;

	if (is_empty) {
		CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle);
		ifd->na_[cpri]++;
	}

	if (qlen(cl->q_) > qlimit(cl->q_)) {
		/* note: qlimit can be set to 0 or 1 */
		rmc_drop_action(cl);
		return -1;
	}
	return 0;
}
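/*
 * Sketch of the enqueue half of a CBQ driver (hypothetical caller; how
 * 'm' is classified to an rm_class is outside the scope of this file):
 *
 *	s = splnet();			// lock out completion interrupts
 *	cl = (classify m to an rm_class);
 *	if (rmc_queue_packet(cl, m) != 0)
 *		// m was dropped (queue overflow or drop action)
 *	splx(s);
 */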
/*
 * void
 * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timespec *now) - Check all
 *	classes to see if they are satisfied.
 */

static void
rmc_tl_satisfied(struct rm_ifdat *ifd, struct timespec *now)
{
	int i;
	rm_class_t *p, *bp;

	for (i = RM_MAXPRIO - 1; i >= 0; i--) {
		if ((bp = ifd->active_[i]) != NULL) {
			p = bp;
			do {
				if (!rmc_satisfied(p, now)) {
					ifd->cutoff_ = p->depth_;
					return;
				}
				p = p->peer_;
			} while (p != bp);
		}
	}

	reset_cutoff(ifd);
}

/*
 * rmc_satisfied - Return 1 if the class is satisfied, 0 otherwise.
 */

static int
rmc_satisfied(struct rm_class *cl, struct timespec *now)
{
	rm_class_t *p;

	if (cl == NULL)
		return 1;
	if (TS_LT(now, &cl->undertime_))
		return 1;
	if (cl->depth_ == 0) {
		if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_))
			return 0;
		else
			return 1;
	}
	if (cl->children_ != NULL) {
		p = cl->children_;
		while (p != NULL) {
			if (!rmc_satisfied(p, now))
				return 0;
			p = p->next_;
		}
	}

	return 1;
}

/*
 * Return 1 if class 'cl' is under limit or can borrow from a parent,
 * 0 if overlimit.  As a side-effect, this routine will invoke the
 * class overlimit action if the class is overlimit.
 */

static int
rmc_under_limit(struct rm_class *cl, struct timespec *now)
{
	rm_class_t *p = cl;
	rm_class_t *top;
	struct rm_ifdat *ifd = cl->ifdat_;
	int sleeping = cl->sleeping_;

	ifd->borrowed_[ifd->qi_] = NULL;
	/*
	 * If cl is the root class, then always return that it is
	 * underlimit.  Otherwise, check to see if the class is underlimit.
	 */
	if (cl->parent_ == NULL)
		return 1;

	if (!TS_LT(now, &cl->undertime_)) {
		/* Fast path: the given class is allowed to send packets */
		if (sleeping) {
			CALLOUT_STOP(&cl->callout_);
			cl->sleeping_ = 0;
			cl->undertime_.tv_sec = 0;
		}
		return 1;
	}

	top = NULL;
	do {
		if (((cl = cl->borrow_) == NULL) ||
		    (cl->depth_ > ifd->cutoff_)) {
			/* No need to call the delay action */
			if (sleeping)
				return 0;
#ifdef ADJUST_CUTOFF
			if (cl != NULL)
				/* cutoff is taking effect, just
				   return false without calling
				   the delay action. */
				return 0;
#endif
#ifdef BORROW_OFFTIME
			/*
			 * check if the class can borrow offtime too.
			 * borrow offtime from the top of the borrow
			 * chain if the top class is not overloaded.
			 */
			if (cl != NULL) {
				/* cutoff is taking effect, use this class as top. */
				top = cl;
				CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_);
			}
			if (top != NULL && top->avgidle_ == top->minidle_)
				top = NULL;
			p->overtime_ = *now;
			(p->overlimit)(p, top);
#else
			p->overtime_ = *now;
			(p->overlimit)(p, NULL);
#endif
			return 0;
		}
		top = cl;
	} while (cl->undertime_.tv_sec && TS_LT(now, &cl->undertime_));

	if (cl != p)
		ifd->borrowed_[ifd->qi_] = cl;
	return 1;
}

/*
 * _rmc_wrr_dequeue_next() - This is the scheduler for WRR as opposed to
 *	packet-by-packet round robin.
 *
 * The heart of the weighted round-robin scheduler, which decides which
 * class next gets to send a packet.  Highest priority first, then
 * weighted round-robin within priorities.
 *
 * Each able-to-send class gets to send until its byte allocation is
 * exhausted.  Thus, the active pointer is only changed after a class has
 * exhausted its allocation.
 *
 * If the scheduler finds no class that is underlimit or able to borrow,
 * then the first class found that had a nonzero queue and is allowed to
 * borrow gets to send.
 */
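/*
 * Example of the 'deficit' mechanism in the loop below: suppose a class's
 * w_allotment_ is 1000 but it overspent earlier, so even after the refill
 * its bytes_alloc_ is still <= 0 (say -200).  On the first pass the class
 * is skipped (and its allocation zeroed) but deficit is set to 1; the
 * same priority level is then rescanned with deficit == 2, which lets an
 * underlimit class send even without a positive allocation.
 */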
static mbuf_t *
_rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op)
{
	struct rm_class *cl = NULL, *first = NULL;
	u_int deficit;
	int cpri;
	mbuf_t *m;
	struct timespec now;

	RM_GETTIME(now);

	/*
	 * if the driver polls the top of the queue and then removes
	 * the polled packet, we must return the same packet.
	 */
	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
		cl = ifd->pollcache_;
		cpri = cl->pri_;
		if (ifd->efficient_) {
			/* check if this class is overlimit */
			if (cl->undertime_.tv_sec != 0 &&
			    rmc_under_limit(cl, &now) == 0)
				first = cl;
		}
		ifd->pollcache_ = NULL;
		goto _wrr_out;
	}
	else {
		/* mode == ALTDQ_POLL || pollcache == NULL */
		ifd->pollcache_ = NULL;
		ifd->borrowed_[ifd->qi_] = NULL;
	}
#ifdef ADJUST_CUTOFF
 _again:
#endif
	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
		if (ifd->na_[cpri] == 0)
			continue;
		deficit = 0;
		/*
		 * Loop through twice for a priority level, if some class
		 * was unable to send a packet the first round because
		 * of the weighted round-robin mechanism.
		 * During the second loop at this level, deficit==2.
		 * (This second loop is not needed if for every class,
		 * "M[cl->pri_])" times "cl->allotment" is greater than
		 * the byte size for the largest packet in the class.)
		 */
 _wrr_loop:
		cl = ifd->active_[cpri];
		ASSERT(cl != NULL);
		do {
			if ((deficit < 2) && (cl->bytes_alloc_ <= 0))
				cl->bytes_alloc_ += cl->w_allotment_;
			if (!qempty(cl->q_)) {
				if ((cl->undertime_.tv_sec == 0) ||
				    rmc_under_limit(cl, &now)) {
					if (cl->bytes_alloc_ > 0 || deficit > 1)
						goto _wrr_out;

					/* underlimit but no alloc */
					deficit = 1;
#if 1
					ifd->borrowed_[ifd->qi_] = NULL;
#endif
				}
				else if (first == NULL && cl->borrow_ != NULL)
					first = cl; /* borrowing candidate */
			}

			cl->bytes_alloc_ = 0;
			cl = cl->peer_;
		} while (cl != ifd->active_[cpri]);

		if (deficit == 1) {
			/* first loop found an underlimit class with deficit */
			/* Loop on same priority level, with new deficit.  */
			deficit = 2;
			goto _wrr_loop;
		}
	}

#ifdef ADJUST_CUTOFF
	/*
	 * no underlimit class found.  if cutoff is taking effect,
	 * increase cutoff and try again.
	 */
	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
		ifd->cutoff_++;
		CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_);
		goto _again;
	}
#endif /* ADJUST_CUTOFF */
	/*
	 * If LINK_EFFICIENCY is turned on, then the first overlimit
	 * class we encounter will send a packet if all the classes
	 * of the link-sharing structure are overlimit.
	 */
	reset_cutoff(ifd);
	CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_);

	if (!ifd->efficient_ || first == NULL)
		return NULL;

	cl = first;
	cpri = cl->pri_;
#if 0 /* too time-consuming for nothing */
	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);
	cl->sleeping_ = 0;
	cl->undertime_.tv_sec = 0;
#endif
	ifd->borrowed_[ifd->qi_] = cl->borrow_;
	ifd->cutoff_ = cl->borrow_->depth_;

	/*
	 * Dequeue the packet and do the bookkeeping...
	 */
 _wrr_out:
	if (op == ALTDQ_REMOVE) {
		m = _rmc_getq(cl);
		if (m == NULL)
			panic("_rmc_wrr_dequeue_next");
		if (qempty(cl->q_))
			ifd->na_[cpri]--;

		/*
		 * Update class statistics and link data.
		 */
		if (cl->bytes_alloc_ > 0)
			cl->bytes_alloc_ -= m_pktlen(m);

		if ((cl->bytes_alloc_ <= 0) || first == cl)
			ifd->active_[cl->pri_] = cl->peer_;
		else
			ifd->active_[cl->pri_] = cl;

		ifd->class_[ifd->qi_] = cl;
		ifd->curlen_[ifd->qi_] = m_pktlen(m);
		ifd->now_[ifd->qi_] = now;
		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
		ifd->queued_++;
	} else {
		/* mode == ALTDQ_POLL */
		m = _rmc_pollq(cl);
		ifd->pollcache_ = cl;
	}
	return m;
}
/*
 * Dequeue & return next packet from the highest priority class that
 * has a packet to send & has enough allocation to send it.  This
 * routine is called by a driver whenever it needs a new packet to
 * output.
 */
static mbuf_t *
_rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op)
{
	mbuf_t *m;
	int cpri;
	struct rm_class *cl, *first = NULL;
	struct timespec now;

	RM_GETTIME(now);

	/*
	 * if the driver polls the top of the queue and then removes
	 * the polled packet, we must return the same packet.
	 */
	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
		cl = ifd->pollcache_;
		cpri = cl->pri_;
		ifd->pollcache_ = NULL;
		goto _prr_out;
	} else {
		/* mode == ALTDQ_POLL || pollcache == NULL */
		ifd->pollcache_ = NULL;
		ifd->borrowed_[ifd->qi_] = NULL;
	}
#ifdef ADJUST_CUTOFF
 _again:
#endif
	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
		if (ifd->na_[cpri] == 0)
			continue;
		cl = ifd->active_[cpri];
		ASSERT(cl != NULL);
		do {
			if (!qempty(cl->q_)) {
				if ((cl->undertime_.tv_sec == 0) ||
				    rmc_under_limit(cl, &now))
					goto _prr_out;
				if (first == NULL && cl->borrow_ != NULL)
					first = cl;
			}
			cl = cl->peer_;
		} while (cl != ifd->active_[cpri]);
	}

#ifdef ADJUST_CUTOFF
	/*
	 * no underlimit class found.  if cutoff is taking effect, increase
	 * cutoff and try again.
	 */
	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
		ifd->cutoff_++;
		goto _again;
	}
#endif /* ADJUST_CUTOFF */
	/*
	 * If LINK_EFFICIENCY is turned on, then the first overlimit
	 * class we encounter will send a packet if all the classes
	 * of the link-sharing structure are overlimit.
	 */
	reset_cutoff(ifd);
	if (!ifd->efficient_ || first == NULL)
		return NULL;

	cl = first;
	cpri = cl->pri_;
#if 0 /* too time-consuming for nothing */
	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);
	cl->sleeping_ = 0;
	cl->undertime_.tv_sec = 0;
#endif
	ifd->borrowed_[ifd->qi_] = cl->borrow_;
	ifd->cutoff_ = cl->borrow_->depth_;

	/*
	 * Dequeue the packet and do the bookkeeping...
	 */
 _prr_out:
	if (op == ALTDQ_REMOVE) {
		m = _rmc_getq(cl);
		if (m == NULL)
			panic("_rmc_prr_dequeue_next");
		if (qempty(cl->q_))
			ifd->na_[cpri]--;

		ifd->active_[cpri] = cl->peer_;

		ifd->class_[ifd->qi_] = cl;
		ifd->curlen_[ifd->qi_] = m_pktlen(m);
		ifd->now_[ifd->qi_] = now;
		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
		ifd->queued_++;
	} else {
		/* mode == ALTDQ_POLL */
		m = _rmc_pollq(cl);
		ifd->pollcache_ = cl;
	}
	return m;
}

/*
 * mbuf_t *
 * rmc_dequeue_next(struct rm_ifdat *ifd, struct timespec *now) - this function
 *	is invoked by the packet driver to get the next packet to be
 *	dequeued and output on the link.  If WRR is enabled, then the
 *	WRR dequeue next routine will determine the next packet to be sent.
 *	Otherwise, packet-by-packet round robin is invoked.
 *
 * Returns:	NULL, if a packet is not available or if all
 *		classes are overlimit.
 *
 *		Otherwise, Pointer to the next packet.
 */

mbuf_t *
rmc_dequeue_next(struct rm_ifdat *ifd, int mode)
{
	if (ifd->queued_ >= ifd->maxqueued_)
		return NULL;
	else if (ifd->wrr_)
		return (_rmc_wrr_dequeue_next(ifd, mode));
	else
		return (_rmc_prr_dequeue_next(ifd, mode));
}
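/*
 * Sketch of the dequeue half of a CBQ driver (hypothetical caller),
 * showing the poll/remove contract honored via ifd->pollcache_:
 *
 *	m = rmc_dequeue_next(ifd, ALTDQ_POLL);		// peek only
 *	if (m != NULL && (driver has room)) {
 *		m = rmc_dequeue_next(ifd, ALTDQ_REMOVE); // same mbuf
 *		(start transmission of m)
 *		...
 *		rmc_update_class_util(ifd);	// on (estimated) completion
 *	}
 */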
/*
 * Update the utilization estimate for the packet that just completed.
 * The packet's class & the parent(s) of that class all get their
 * estimators updated.  This routine is called by the driver's output-
 * packet-completion interrupt service routine.
 */

/*
 * a macro to approximate "divide by 1000" that gives 0.000999,
 * if a value has enough effective digits.
 * (on pentium, mul takes 9 cycles but div takes 46!)
 */
#define	NSEC_TO_USEC(t)	(((t) >> 10) + ((t) >> 16) + ((t) >> 17))
/* Don't worry.  Recent compilers don't use div. */
#define	PSEC_TO_USEC(t)	((t) / 1000 / 1000)
void
rmc_update_class_util(struct rm_ifdat *ifd)
{
	int64_t idle, avgidle, pktlen;
	int64_t pkt_time;
	int64_t tidle;
	rm_class_t *cl, *cl0, *borrowed;
	rm_class_t *borrows;
	struct timespec *nowp;

	/*
	 * Get the most recent completed class.
	 */
	if ((cl = ifd->class_[ifd->qo_]) == NULL)
		return;

	cl0 = cl;
	pktlen = (int64_t)ifd->curlen_[ifd->qo_];
	borrowed = ifd->borrowed_[ifd->qo_];
	borrows = borrowed;

	PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);

	/*
	 * Run estimator on class and its ancestors.
	 */
	/*
	 * rmc_update_class_util is designed to be called when the
	 * transfer is completed from a xmit complete interrupt,
	 * but most drivers don't implement an upcall for that.
	 * so, just use estimated completion time.
	 * as a result, ifd->qi_ and ifd->qo_ are always synced.
	 */
	nowp = &ifd->now_[ifd->qo_];
	/* get pkt_time (for link) in nsec */
#if 1 /* use approximation */
	pkt_time = (int64_t)ifd->curlen_[ifd->qo_] * (int64_t)ifd->ps_per_byte_;
	pkt_time = PSEC_TO_NSEC(pkt_time);
#else
	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000;
#endif
	if (ifd->ifq_->altq_ifp->if_type == IFT_PPP) {
		if (TS_LT(nowp, &ifd->ifnow_)) {
			int iftime;

			/*
			 * make sure the estimated completion time does not go
			 * too far.  it can happen when the link layer supports
			 * data compression or the interface speed is set to
			 * a much lower value.
			 */
			TS_DELTA(&ifd->ifnow_, nowp, iftime);
			if (iftime+pkt_time < ifd->maxiftime_) {
				TS_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
			} else {
				TS_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_);
			}
		} else {
			TS_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
		}
	} else {
		if (TS_LT(nowp, &ifd->ifnow_)) {
			TS_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
		} else {
			TS_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
		}
	}

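	/*
	 * The loop below is the idle-time estimator.  Since avgidle_ is
	 * stored scaled up by 2^RM_FILTER_GAIN, the update
	 *
	 *	avgidle += idle - (avgidle >> RM_FILTER_GAIN)
	 *
	 * is an exponentially weighted moving average with gain
	 * w = 2^-RM_FILTER_GAIN:  avg_new = (1 - w) * avg_old + w * idle.
	 * A negative result means the class has been sending faster than
	 * its configured rate, i.e., it is overlimit.
	 */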
	while (cl != NULL) {
		TS_DELTA(&ifd->ifnow_, &cl->last_, idle);
		if (idle >= 2000000000)
			/*
			 * this class is idle enough, reset avgidle.
			 * (TS_DELTA returns 2000000000 ns when delta is large.)
			 */
			cl->avgidle_ = cl->maxidle_;

		/* get pkt_time (for class) in nsec */
#if 1 /* use approximation */
		pkt_time = pktlen * (int64_t)cl->ps_per_byte_;
		pkt_time = PSEC_TO_NSEC(pkt_time);
#else
		pkt_time = pktlen * cl->ns_per_byte_ / 1000;
#endif
		idle -= pkt_time;

		avgidle = cl->avgidle_;
		avgidle += idle - (avgidle >> RM_FILTER_GAIN);
		cl->avgidle_ = avgidle;

		/* Are we overlimit ? */
		if (avgidle <= 0) {
			CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle);
#if 1 /* ALTQ */
			/*
			 * need some lower bound for avgidle, otherwise
			 * a borrowing class gets unbounded penalty.
			 */
			if (avgidle < cl->minidle_)
				avgidle = cl->avgidle_ = cl->minidle_;
#endif
			/* set next idle to make avgidle 0 */
			tidle = pkt_time +
			    (((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN);
			TS_ADD_DELTA(nowp, tidle, &cl->undertime_);
			++cl->stats_.over;
		} else {
			cl->avgidle_ =
			    (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle;
			cl->undertime_.tv_sec = 0;
			if (cl->sleeping_) {
				CALLOUT_STOP(&cl->callout_);
				cl->sleeping_ = 0;
			}
		}

		if (borrows != NULL) {
			if (borrows != cl)
				++cl->stats_.borrows;
			else
				borrows = NULL;
		}
		cl->last_ = ifd->ifnow_;
		cl->last_pkttime_ = pkt_time;

#if 1
		if (cl->parent_ == NULL && cl != cl0) {
			/* take stats of root class */
			PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
		}
#endif

		cl = cl->parent_;
	}

	/*
	 * Check to see if cutoff needs to be set to a new level.
	 */
	cl = ifd->class_[ifd->qo_];
	if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) {
#if 1 /* ALTQ */
		if ((qlen(cl->q_) <= 0) || TS_LT(nowp, &borrowed->undertime_)) {
			rmc_tl_satisfied(ifd, nowp);
			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
		} else {
			ifd->cutoff_ = borrowed->depth_;
			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
		}
#else /* !ALTQ */
		if ((qlen(cl->q_) <= 1) || TS_LT(&now, &borrowed->undertime_)) {
			reset_cutoff(ifd);
#ifdef notdef
			rmc_tl_satisfied(ifd, &now);
#endif
			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
		} else {
			ifd->cutoff_ = borrowed->depth_;
			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
		}
#endif /* !ALTQ */
	}

	/*
	 * Release class slot
	 */
	ifd->borrowed_[ifd->qo_] = NULL;
	ifd->class_[ifd->qo_] = NULL;
	ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_;
	ifd->queued_--;
}
/*
 * void
 * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific)
 *	over-limit action routines.  These get invoked by rmc_under_limit()
 *	if a class with packets to send is over its bandwidth limit & can't
 *	borrow from a parent class.
 *
 * Returns:	NONE
 */

static void
rmc_drop_action(struct rm_class *cl)
{
	struct rm_ifdat *ifd = cl->ifdat_;

	ASSERT(qlen(cl->q_) > 0);
	_rmc_dropq(cl);
	if (qempty(cl->q_))
		ifd->na_[cl->pri_]--;
}

void
rmc_dropall(struct rm_class *cl)
{
	struct rm_ifdat *ifd = cl->ifdat_;

	if (!qempty(cl->q_)) {
		_flushq(cl->q_);

		ifd->na_[cl->pri_]--;
	}
}

#if (__FreeBSD_version > 300000)
static int tvhzto(struct timeval *);

static int
tvhzto(struct timeval *tv)
{
	struct timeval t2;

	getmicrotime(&t2);
	t2.tv_sec = tv->tv_sec - t2.tv_sec;
	t2.tv_usec = tv->tv_usec - t2.tv_usec;
	return tvtohz(&t2);
}
#endif /* __FreeBSD_version > 300000 */

/*
 * void
 * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ
 *	delay action routine.  It is invoked via rmc_under_limit when the
 *	packet is discovered to be overlimit.
 *
 * If the delay action is the result of the borrow class being overlimit,
 * then delay for the offtime of the borrowing class that is overlimit.
 *
 * Returns:	NONE
 */

void
rmc_delay_action(struct rm_class *cl, struct rm_class *borrow)
{
	int t;
	int64_t ndelay, extradelay;

	cl->stats_.overactions++;
	if (borrow != NULL)
		TS_DELTA(&borrow->undertime_, &cl->overtime_, ndelay);
	else
		TS_DELTA(&cl->undertime_, &cl->overtime_, ndelay);
#ifndef BORROW_OFFTIME
	ndelay += cl->offtime_;
#endif

	if (!cl->sleeping_) {
		CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle);
#ifdef BORROW_OFFTIME
		if (borrow != NULL)
			extradelay = borrow->offtime_;
		else
#endif
			extradelay = cl->offtime_;

#ifdef ALTQ
		/*
		 * XXX recalculate suspend time:
		 * current undertime is (tidle + pkt_time) calculated
		 * from the last transmission.
		 *	tidle: time required to bring avgidle back to 0
		 *	pkt_time: target waiting time for this class
		 * we need to replace pkt_time by offtime
		 */
		extradelay -= cl->last_pkttime_;
#endif
		if (extradelay > 0) {
			TS_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_);
			ndelay += extradelay;
		}

		cl->sleeping_ = 1;
		cl->stats_.delays++;

		/*
		 * Since packets are phased randomly with respect to the
		 * clock, 1 tick (the next clock tick) can be an arbitrarily
		 * short time so we have to wait for at least two ticks.
		 * NOTE:  If there's no other traffic, we need the timer as
		 * a 'backstop' to restart this class.
		 */
		if (NSEC_TO_USEC(ndelay) > tick * 2) {
#ifdef __FreeBSD__
			/* FreeBSD rounds up the tick */
			t = tvhzto(&cl->undertime_);
#else
			/* other BSDs round down the tick */
			t = tshzto(&cl->undertime_) + 1;
#endif
		} else
			t = 2;
		CALLOUT_RESET(&cl->callout_, t,
		    (timeout_t *)rmc_restart, (void *)cl);
	}
}
/*
 * void
 * rmc_restart() - is just a helper routine for rmc_delay_action -- it is
 *	called by the system timer code & is responsible for checking if the
 *	class is still sleeping (it might have been restarted as a side
 *	effect of the queue scan on a packet arrival) and, if so, restarting
 *	output for the class.  Inspecting the class state & restarting output
 *	require locking the class structure.  In general the driver is
 *	responsible for locking but this is the only routine that is not
 *	called directly or indirectly from the interface driver so it has
 *	to know about system locking conventions.  Under BSD, locking is done
 *	by raising IPL to splnet so that's what's implemented here.  On a
 *	different system this would probably need to be changed.
 *
 * Returns:	NONE
 */

static void
rmc_restart(struct rm_class *cl)
{
	struct rm_ifdat *ifd = cl->ifdat_;
	int s;

	s = splnet();
	if (cl->sleeping_) {
		cl->sleeping_ = 0;
		cl->undertime_.tv_sec = 0;

		if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) {
			CBQTRACE(rmc_restart, 'trts', cl->stats_.handle);
			(ifd->restart)(ifd->ifq_);
		}
	}
	splx(s);
}

/*
 * void
 * rmc_root_overlimit(struct rm_class *cl) - This is the generic overlimit
 *	handling routine for the root class of the link sharing structure.
 *
 * Returns:	NONE
 */

static void
rmc_root_overlimit(struct rm_class *cl,
    struct rm_class *borrow)
{
	panic("rmc_root_overlimit");
}

/*
 * Packet Queue handling routines.  Eventually, this is to localize the
 * effects on the code of whether queues are RED queues or droptail
 * queues.
 */

static int
_rmc_addq(rm_class_t *cl, mbuf_t *m)
{
#ifdef ALTQ_RIO
	if (q_is_rio(cl->q_))
		return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_);
#endif
#ifdef ALTQ_RED
	if (q_is_red(cl->q_))
		return red_addq(cl->red_, cl->q_, m, cl->pktattr_);
#endif /* ALTQ_RED */

	if (cl->flags_ & RMCF_CLEARDSCP)
		write_dsfield(m, cl->pktattr_, 0);

	_addq(cl->q_, m);
	return 0;
}

/* note: _rmc_dropq is not called for red */
static void
_rmc_dropq(rm_class_t *cl)
{
	mbuf_t *m;

	if ((m = _getq(cl->q_)) != NULL)
		m_freem(m);
}

static mbuf_t *
_rmc_getq(rm_class_t *cl)
{
#ifdef ALTQ_RIO
	if (q_is_rio(cl->q_))
		return rio_getq((rio_t *)cl->red_, cl->q_);
#endif
#ifdef ALTQ_RED
	if (q_is_red(cl->q_))
		return red_getq(cl->red_, cl->q_);
#endif
	return _getq(cl->q_);
}

static mbuf_t *
_rmc_pollq(rm_class_t *cl)
{
	return qhead(cl->q_);
}
#ifdef CBQ_TRACE

struct cbqtrace cbqtrace_buffer[NCBQTRACE+1];
struct cbqtrace *cbqtrace_ptr = NULL;
int cbqtrace_count;

/*
 * DDB hook to trace cbq events:
 *  the last 1024 events are held in a circular buffer.
 *  use "call cbqtrace_dump(N)" to display 20 events from Nth event.
 */
void cbqtrace_dump(int);
static char *rmc_funcname(void *);

static struct rmc_funcs {
	void *func;
	char *name;
} rmc_funcs[] =
{
	rmc_init,		"rmc_init",
	rmc_queue_packet,	"rmc_queue_packet",
	rmc_under_limit,	"rmc_under_limit",
	rmc_update_class_util,	"rmc_update_class_util",
	rmc_delay_action,	"rmc_delay_action",
	rmc_restart,		"rmc_restart",
	_rmc_wrr_dequeue_next,	"_rmc_wrr_dequeue_next",
	NULL,			NULL
};

static char *
rmc_funcname(void *func)
{
	struct rmc_funcs *fp;

	for (fp = rmc_funcs; fp->func != NULL; fp++)
		if (fp->func == func)
			return (fp->name);
	return ("unknown");
}

void
cbqtrace_dump(int counter)
{
	int i, *p;
	char *cp;

	counter = counter % NCBQTRACE;
	p = (int *)&cbqtrace_buffer[counter];

	for (i=0; i<20; i++) {
		printf("[0x%x] ", *p++);
		printf("%s: ", rmc_funcname((void *)*p++));
		cp = (char *)p++;
		printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]);
		printf("%d\n",*p++);

		if (p >= (int *)&cbqtrace_buffer[NCBQTRACE])
			p = (int *)cbqtrace_buffer;
	}
}
#endif /* CBQ_TRACE */
#endif /* ALTQ_CBQ */

#if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || defined(ALTQ_HFSC) || defined(ALTQ_PRIQ)
#if !defined(__GNUC__) || defined(ALTQ_DEBUG)

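/*
 * The class queues below are circular lists of mbufs chained through
 * m_nextpkt: qtail(q) points at the last packet and the last packet
 * points back at the head, so both ends are reachable in O(1).
 * E.g., after _addq() of a, b and c:
 *
 *	qtail(q) -> c -> a -> b -> c -> ...  (wraps around)
 */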
void
_addq(class_queue_t *q, mbuf_t *m)
{
	mbuf_t *m0;

	if ((m0 = qtail(q)) != NULL)
		m->m_nextpkt = m0->m_nextpkt;
	else
		m0 = m;
	m0->m_nextpkt = m;
	qtail(q) = m;
	qlen(q)++;
}

mbuf_t *
_getq(class_queue_t *q)
{
	mbuf_t *m, *m0;

	if ((m = qtail(q)) == NULL)
		return NULL;
	if ((m0 = m->m_nextpkt) != m)
		m->m_nextpkt = m0->m_nextpkt;
	else {
		ASSERT(qlen(q) == 1);
		qtail(q) = NULL;
	}
	qlen(q)--;
	m0->m_nextpkt = NULL;
	return m0;
}

/* drop a packet at the tail of the queue */
mbuf_t *
_getq_tail(class_queue_t *q)
{
	mbuf_t *m, *m0, *prev;

	if ((m = m0 = qtail(q)) == NULL)
		return NULL;
	do {
		prev = m0;
		m0 = m0->m_nextpkt;
	} while (m0 != m);
	prev->m_nextpkt = m->m_nextpkt;
	if (prev == m) {
		ASSERT(qlen(q) == 1);
		qtail(q) = NULL;
	} else
		qtail(q) = prev;
	qlen(q)--;
	m->m_nextpkt = NULL;
	return m;
}

/* randomly select a packet in the queue */
mbuf_t *
_getq_random(class_queue_t *q)
{
	struct mbuf *m;
	int i, n;

	if ((m = qtail(q)) == NULL)
		return NULL;
	if (m->m_nextpkt == m) {
		ASSERT(qlen(q) == 1);
		qtail(q) = NULL;
	} else {
		struct mbuf *prev = NULL;

		n = cprng_fast32() % qlen(q) + 1;
		for (i = 0; i < n; i++) {
			prev = m;
			m = m->m_nextpkt;
		}
		prev->m_nextpkt = m->m_nextpkt;
		if (m == qtail(q))
			qtail(q) = prev;
	}
	qlen(q)--;
	m->m_nextpkt = NULL;
	return m;
}

void
_removeq(class_queue_t *q, mbuf_t *m)
{
	mbuf_t *m0, *prev;

	m0 = qtail(q);
	do {
		prev = m0;
		m0 = m0->m_nextpkt;
	} while (m0 != m);
	prev->m_nextpkt = m->m_nextpkt;
	if (prev == m)
		qtail(q) = NULL;
	else if (qtail(q) == m)
		qtail(q) = prev;
	qlen(q)--;
}

void
_flushq(class_queue_t *q)
{
	mbuf_t *m;

	while ((m = _getq(q)) != NULL)
		m_freem(m);
	ASSERT(qlen(q) == 0);
}

#endif /* !__GNUC__ || ALTQ_DEBUG */
#endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ */