/*-
 *   BSD LICENSE
 *
 *   Copyright 2017 6WIND S.A.
 *   Copyright 2017 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/queue.h>
#include <sys/resource.h>

#include <rte_byteorder.h>
#include <rte_jhash.h>
#include <rte_malloc.h>
#include <rte_eth_tap.h>
#include <tap_flow.h>
#include <tap_autoconf.h>
#include <tap_tcmsgs.h>
#include <tap_rss.h>

#ifndef HAVE_TC_FLOWER
/*
 * For kernels < 4.2, this enum is not defined. Runtime checks will be made to
 * avoid sending TC messages the kernel cannot understand.
 */
enum {
	TCA_FLOWER_UNSPEC,
	TCA_FLOWER_CLASSID,
	TCA_FLOWER_INDEV,
	TCA_FLOWER_ACT,
	TCA_FLOWER_KEY_ETH_DST,		/* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_DST_MASK,	/* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_SRC,		/* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_SRC_MASK,	/* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_TYPE,	/* be16 */
	TCA_FLOWER_KEY_IP_PROTO,	/* u8 */
	TCA_FLOWER_KEY_IPV4_SRC,	/* be32 */
	TCA_FLOWER_KEY_IPV4_SRC_MASK,	/* be32 */
	TCA_FLOWER_KEY_IPV4_DST,	/* be32 */
	TCA_FLOWER_KEY_IPV4_DST_MASK,	/* be32 */
	TCA_FLOWER_KEY_IPV6_SRC,	/* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_SRC_MASK,	/* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_DST,	/* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_DST_MASK,	/* struct in6_addr */
	TCA_FLOWER_KEY_TCP_SRC,		/* be16 */
	TCA_FLOWER_KEY_TCP_DST,		/* be16 */
	TCA_FLOWER_KEY_UDP_SRC,		/* be16 */
	TCA_FLOWER_KEY_UDP_DST,		/* be16 */
};
#endif
#ifndef HAVE_TC_VLAN_ID
enum {
	/* TCA_FLOWER_FLAGS, */
	TCA_FLOWER_KEY_VLAN_ID = TCA_FLOWER_KEY_UDP_DST + 2,	/* be16 */
	TCA_FLOWER_KEY_VLAN_PRIO,	/* u8 */
	TCA_FLOWER_KEY_VLAN_ETH_TYPE,	/* be16 */
};
#endif
/*
 * For kernels < 4.2, BPF related enums may not be defined.
 * Runtime checks will be carried out to gracefully report on TC messages that
 * are rejected by the kernel. Rejection reasons may be due to:
 *   1. the enum is not defined
 *   2. the enum is defined but the kernel is not configured to support BPF
 *      system calls, BPF classifiers or BPF actions.
 */
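/*
 * Note: the fallback definitions above mirror the flower/BPF attribute values
 * from the kernel UAPI headers (linux/pkt_cls.h, linux/tc_act/tc_bpf.h) so
 * that this file still builds against older headers. Whether the running
 * kernel actually understands them is only known when it ACKs or rejects the
 * netlink request (see the tap_nl_recv_ack() error paths below).
 */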
#ifndef HAVE_TC_BPF
enum {
	TCA_BPF_UNSPEC,
	TCA_BPF_ACT,
	TCA_BPF_POLICE,
	TCA_BPF_CLASSID,
	TCA_BPF_OPS_LEN,
	TCA_BPF_OPS,
};
#endif
#ifndef HAVE_TC_BPF_FD
enum {
	TCA_BPF_FD = TCA_BPF_OPS + 1,
	TCA_BPF_NAME,
};
#endif
#ifndef HAVE_TC_ACT_BPF
#define tc_gen \
	__u32 index; \
	__u32 capab; \
	int action; \
	int refcnt; \
	int bindcnt

struct tc_act_bpf {
	tc_gen;
};

enum {
	TCA_ACT_BPF_UNSPEC,
	TCA_ACT_BPF_TM,
	TCA_ACT_BPF_PARMS,
	TCA_ACT_BPF_OPS_LEN,
	TCA_ACT_BPF_OPS,
};

#endif
#ifndef HAVE_TC_ACT_BPF_FD
enum {
	TCA_ACT_BPF_FD = TCA_ACT_BPF_OPS + 1,
	TCA_ACT_BPF_NAME,
};
#endif

/* RSS key management */
enum bpf_rss_key_e {
	KEY_CMD_GET = 1,
	KEY_CMD_RELEASE,
	KEY_CMD_INIT,
	KEY_CMD_DEINIT,
};

enum key_status_e {
	KEY_STAT_UNSPEC,
	KEY_STAT_USED,
	KEY_STAT_AVAILABLE,
};

#define ISOLATE_HANDLE 1

struct rte_flow {
	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
	struct rte_flow *remote_flow; /* associated remote flow */
	int bpf_fd[SEC_MAX]; /* list of BPF fds per ELF section */
	uint32_t key_idx; /* RSS rule key index into BPF map */
	struct nlmsg msg;
};

struct convert_data {
	uint16_t eth_type;
	uint16_t ip_proto;
	uint8_t vlan;
	struct rte_flow *flow;
};

struct remote_rule {
	struct rte_flow_attr attr;
	struct rte_flow_item items[2];
	struct rte_flow_action actions[2];
	int mirred;
};

struct action_data {
	char id[16];

	union {
		struct tc_gact gact;
		struct tc_mirred mirred;
		struct skbedit {
			struct tc_skbedit skbedit;
			uint16_t queue;
		} skbedit;
		struct bpf {
			struct tc_act_bpf bpf;
			int bpf_fd;
			const char *annotation;
		} bpf;
	};
};

static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data);
static int tap_flow_create_udp(const struct rte_flow_item *item, void *data);
static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data);
static int
tap_flow_validate(struct rte_eth_dev *dev,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error);

static struct rte_flow *
tap_flow_create(struct rte_eth_dev *dev,
		const struct rte_flow_attr *attr,
		const struct rte_flow_item items[],
		const struct rte_flow_action actions[],
		struct rte_flow_error *error);

static int
tap_flow_destroy(struct rte_eth_dev *dev,
		 struct rte_flow *flow,
		 struct rte_flow_error *error);

static int
tap_flow_isolate(struct rte_eth_dev *dev,
		 int set,
		 struct rte_flow_error *error);

static int bpf_rss_key(enum bpf_rss_key_e cmd, __u32 *key_idx);
static int rss_enable(struct pmd_internals *pmd,
		      const struct rte_flow_attr *attr,
		      struct rte_flow_error *error);
static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
			   const struct rte_flow_action_rss *rss,
			   struct rte_flow_error *error);
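/*
 * The ops table below is what a DPDK application ends up calling through the
 * generic rte_flow API: tap_dev_filter_ctrl() (at the end of this file)
 * returns a pointer to it for RTE_ETH_FILTER_GENERIC, so, for instance,
 * rte_flow_create() on a tap port resolves to tap_flow_create().
 */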

static const struct rte_flow_ops tap_flow_ops = {
	.validate = tap_flow_validate,
	.create = tap_flow_create,
	.destroy = tap_flow_destroy,
	.flush = tap_flow_flush,
	.isolate = tap_flow_isolate,
};

/* Static initializer for items. */
#define ITEMS(...) \
	(const enum rte_flow_item_type []){ \
		__VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
	}
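/*
 * For illustration, ITEMS(RTE_FLOW_ITEM_TYPE_UDP, RTE_FLOW_ITEM_TYPE_TCP)
 * expands to an anonymous, END-terminated compound literal:
 *
 *   (const enum rte_flow_item_type []){
 *           RTE_FLOW_ITEM_TYPE_UDP, RTE_FLOW_ITEM_TYPE_TCP,
 *           RTE_FLOW_ITEM_TYPE_END,
 *   }
 */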
/* Structure to generate a simple graph of layers supported by the NIC. */
struct tap_flow_items {
	/* Bit-mask corresponding to what is supported for this item. */
	const void *mask;
	const unsigned int mask_sz; /* Bit-mask size in bytes. */
	/*
	 * Bit-mask corresponding to the default mask, if none is provided
	 * along with the item.
	 */
	const void *default_mask;
	/**
	 * Conversion function from rte_flow to netlink attributes.
	 *
	 * @param item
	 *   rte_flow item to convert.
	 * @param data
	 *   Internal structure to store the conversion.
	 *
	 * @return
	 *   0 on success, negative value otherwise.
	 */
	int (*convert)(const struct rte_flow_item *item, void *data);
	/** List of possible following items. */
	const enum rte_flow_item_type *const items;
};

/* Graph of supported items and associated actions. */
static const struct tap_flow_items tap_flow_items[] = {
	[RTE_FLOW_ITEM_TYPE_END] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
	},
	[RTE_FLOW_ITEM_TYPE_ETH] = {
		.items = ITEMS(
			RTE_FLOW_ITEM_TYPE_VLAN,
			RTE_FLOW_ITEM_TYPE_IPV4,
			RTE_FLOW_ITEM_TYPE_IPV6),
		.mask = &(const struct rte_flow_item_eth){
			.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			.type = -1,
		},
		.mask_sz = sizeof(struct rte_flow_item_eth),
		.default_mask = &rte_flow_item_eth_mask,
		.convert = tap_flow_create_eth,
	},
	[RTE_FLOW_ITEM_TYPE_VLAN] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
			       RTE_FLOW_ITEM_TYPE_IPV6),
		.mask = &(const struct rte_flow_item_vlan){
			.tpid = -1,
			/* DEI matching is not supported */
#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
			.tci = 0xffef,
#else
			.tci = 0xefff,
#endif
		},
		.mask_sz = sizeof(struct rte_flow_item_vlan),
		.default_mask = &rte_flow_item_vlan_mask,
		.convert = tap_flow_create_vlan,
	},
	[RTE_FLOW_ITEM_TYPE_IPV4] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
			       RTE_FLOW_ITEM_TYPE_TCP),
		.mask = &(const struct rte_flow_item_ipv4){
			.hdr = {
				.src_addr = -1,
				.dst_addr = -1,
				.next_proto_id = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_ipv4),
		.default_mask = &rte_flow_item_ipv4_mask,
		.convert = tap_flow_create_ipv4,
	},
	[RTE_FLOW_ITEM_TYPE_IPV6] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
			       RTE_FLOW_ITEM_TYPE_TCP),
		.mask = &(const struct rte_flow_item_ipv6){
			.hdr = {
				.src_addr = {
					"\xff\xff\xff\xff\xff\xff\xff\xff"
					"\xff\xff\xff\xff\xff\xff\xff\xff",
				},
				.dst_addr = {
					"\xff\xff\xff\xff\xff\xff\xff\xff"
					"\xff\xff\xff\xff\xff\xff\xff\xff",
				},
				.proto = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_ipv6),
		.default_mask = &rte_flow_item_ipv6_mask,
		.convert = tap_flow_create_ipv6,
	},
	[RTE_FLOW_ITEM_TYPE_UDP] = {
		.mask = &(const struct rte_flow_item_udp){
			.hdr = {
				.src_port = -1,
				.dst_port = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_udp),
		.default_mask = &rte_flow_item_udp_mask,
		.convert = tap_flow_create_udp,
	},
	[RTE_FLOW_ITEM_TYPE_TCP] = {
		.mask = &(const struct rte_flow_item_tcp){
			.hdr = {
				.src_port = -1,
				.dst_port = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_tcp),
		.default_mask = &rte_flow_item_tcp_mask,
		.convert = tap_flow_create_tcp,
	},
};

/*
 * TC rules, by growing priority
 *
 *        Remote netdevice                Tap netdevice
 * +-------------+-------------+   +-------------+-------------+
 * |   Ingress   |   Egress    |   |   Ingress   |   Egress    |
 * |-------------|-------------|   |-------------|-------------|
 * |             |  \       /  |   |             |  REMOTE TX  | prio 1
 * |             |   \     /   |   |             |   \     /   | prio 2
 * |  EXPLICIT   |    \   /    |   |  EXPLICIT   |    \   /    |   .
 * |             |     \ /     |   |             |     \ /     |   .
 * |    RULES    |      X      |   |    RULES    |      X      |   .
 * |      .      |     / \     |   |      .      |     / \     |   .
 * |      .      |    /   \    |   |      .      |    /   \    |   .
 * |      .      |   /     \   |   |      .      |   /     \   |   .
 * |      .      |  /       \  |   |      .      |  /       \  |   .
 *
 *      ....            ....            ....            ....
 *
 * |      .      |  \       /  |   |      .      |  \       /  |   .
 * |      .      |   \     /   |   |      .      |   \     /   |   .
 * |             |    \   /    |   |             |    \   /    |
 * |  LOCAL_MAC  |     \ /     |   |    \   /    |     \ /     | last prio - 5
 * |   PROMISC   |      X      |   |     \ /     |      X      | last prio - 4
 * |  ALLMULTI   |     / \     |   |      X      |     / \     | last prio - 3
 * |  BROADCAST  |    /   \    |   |     / \     |    /   \    | last prio - 2
 * | BROADCASTV6 |   /     \   |   |    /   \    |   /     \   | last prio - 1
 * |     xx      |  /       \  |   |   ISOLATE   |  /       \  | last prio
 * +-------------+-------------+   +-------------+-------------+
 *
 * The implicit flow rules are stored in a list, with the last two mandatorily
 * being the REMOTE_TX and ISOLATE rules, e.g.:
 *
 * LOCAL_MAC -> BROADCAST -> BROADCASTV6 -> REMOTE_TX -> ISOLATE -> NULL
 *
 * This lets tap_flow_isolate() remove implicit rules by popping the list head
 * and destroying it, for as long as the rule applies to the remote netdevice.
 * The implicit rule for TX redirection is not removed, as isolate concerns
 * only incoming traffic.
 */
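/*
 * Rough sketch of the priority encoding used below and in priv_flow_process():
 * tcm_info packs the TC priority in its upper 16 bits and the matched
 * protocol (ETH_P_ALL) in the lower 16 bits, roughly
 *
 *   tcm_info = TC_H_MAKE(prio << 16, htons(ETH_P_ALL));
 *
 * Implicit rules use PRIORITY_MASK - <rule index> as priority so that they
 * always sit below (after) explicit rte_flow rules.
 */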

static struct remote_rule implicit_rte_flows[TAP_REMOTE_MAX_IDX] = {
	[TAP_REMOTE_LOCAL_MAC] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_LOCAL_MAC,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_REDIR,
	},
	[TAP_REMOTE_BROADCAST] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_BROADCAST,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
			.spec = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_BROADCASTV6] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_BROADCASTV6,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
			},
			.spec = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_PROMISC] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_PROMISC,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_VOID,
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_ALLMULTI] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_ALLMULTI,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
			},
			.spec = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_TX] = {
		.attr = {
			.group = 0,
			.priority = TAP_REMOTE_TX,
			.egress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_VOID,
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_ISOLATE] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_ISOLATE,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_VOID,
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
	},
};
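/*
 * In the implicit rules above, .mask selects which header fields flower will
 * match on and .spec gives the expected value: for instance,
 * TAP_REMOTE_BROADCAST matches the destination MAC ff:ff:ff:ff:ff:ff exactly,
 * while TAP_REMOTE_ALLMULTI only checks the multicast (group) bit via the
 * 01:00:00:00:00:00 mask.
 */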
/**
 * Make as many checks as possible on an Ethernet item, and if a flow is
 * provided, fill it appropriately with Ethernet info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_eth(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_eth *spec = item->spec;
	const struct rte_flow_item_eth *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask;
	/* TC does not support eth_type masking. Only accept if exact match. */
	if (mask->type && mask->type != 0xffff)
		return -1;
	if (!spec)
		return 0;
	/* store eth_type for consistency if ipv4/6 pattern item comes next */
	if (spec->type & mask->type)
		info->eth_type = spec->type;
	if (!flow)
		return 0;
	msg = &flow->msg;
	if (!is_zero_ether_addr(&spec->dst)) {
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, ETHER_ADDR_LEN,
			       &spec->dst.addr_bytes);
		tap_nlattr_add(&msg->nh,
			       TCA_FLOWER_KEY_ETH_DST_MASK, ETHER_ADDR_LEN,
			       &mask->dst.addr_bytes);
	}
	if (!is_zero_ether_addr(&mask->src)) {
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, ETHER_ADDR_LEN,
			       &spec->src.addr_bytes);
		tap_nlattr_add(&msg->nh,
			       TCA_FLOWER_KEY_ETH_SRC_MASK, ETHER_ADDR_LEN,
			       &mask->src.addr_bytes);
	}
	return 0;
}

/**
 * Make as many checks as possible on a VLAN item, and if a flow is provided,
 * fill it appropriately with VLAN info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_vlan *spec = item->spec;
	const struct rte_flow_item_vlan *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
	/* TC does not support tpid masking. Only accept if exact match. */
	if (mask->tpid && mask->tpid != 0xffff)
		return -1;
	/* Double-tagging not supported. */
	if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
		return -1;
	info->vlan = 1;
	if (!flow)
		return 0;
	msg = &flow->msg;
	msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q));
#define VLAN_PRIO(tci) ((tci) >> 13)
#define VLAN_ID(tci) ((tci) & 0xfff)
	if (!spec)
		return 0;
	if (spec->tci) {
		uint16_t tci = ntohs(spec->tci) & mask->tci;
		uint16_t prio = VLAN_PRIO(tci);
		uint8_t vid = VLAN_ID(tci);

		if (prio)
			tap_nlattr_add8(&msg->nh,
					TCA_FLOWER_KEY_VLAN_PRIO, prio);
		if (vid)
			tap_nlattr_add16(&msg->nh,
					 TCA_FLOWER_KEY_VLAN_ID, vid);
	}
	return 0;
}
/**
 * Make as many checks as possible on an IPv4 item, and if a flow is provided,
 * fill it appropriately with IPv4 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_ipv4 *spec = item->spec;
	const struct rte_flow_item_ipv4 *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
	/* check that previous eth type is compatible with ipv4 */
	if (info->eth_type && info->eth_type != htons(ETH_P_IP))
		return -1;
	/* store ip_proto for consistency if udp/tcp pattern item comes next */
	if (spec)
		info->ip_proto = spec->hdr.next_proto_id;
	if (!flow)
		return 0;
	msg = &flow->msg;
	if (!info->eth_type)
		info->eth_type = htons(ETH_P_IP);
	if (!spec)
		return 0;
	if (spec->hdr.dst_addr) {
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST,
				 spec->hdr.dst_addr);
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
				 mask->hdr.dst_addr);
	}
	if (spec->hdr.src_addr) {
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC,
				 spec->hdr.src_addr);
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
				 mask->hdr.src_addr);
	}
	if (spec->hdr.next_proto_id)
		tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO,
				spec->hdr.next_proto_id);
	return 0;
}

/**
 * Make as many checks as possible on an IPv6 item, and if a flow is provided,
 * fill it appropriately with IPv6 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_ipv6(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_ipv6 *spec = item->spec;
	const struct rte_flow_item_ipv6 *mask = item->mask;
	struct rte_flow *flow = info->flow;
	uint8_t empty_addr[16] = { 0 };
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask;
	/* check that previous eth type is compatible with ipv6 */
	if (info->eth_type && info->eth_type != htons(ETH_P_IPV6))
		return -1;
	/* store ip_proto for consistency if udp/tcp pattern item comes next */
	if (spec)
		info->ip_proto = spec->hdr.proto;
	if (!flow)
		return 0;
	msg = &flow->msg;
	if (!info->eth_type)
		info->eth_type = htons(ETH_P_IPV6);
	if (!spec)
		return 0;
	if (memcmp(spec->hdr.dst_addr, empty_addr, 16)) {
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST,
			       sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr);
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST_MASK,
			       sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr);
	}
	if (memcmp(spec->hdr.src_addr, empty_addr, 16)) {
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC,
			       sizeof(spec->hdr.src_addr), &spec->hdr.src_addr);
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
			       sizeof(mask->hdr.src_addr), &mask->hdr.src_addr);
	}
	if (spec->hdr.proto)
		tap_nlattr_add8(&msg->nh,
				TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto);
	return 0;
}
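/*
 * Note on byte order for the converters above and below: rte_flow spec/mask
 * values for addresses, ports and EtherType are already in network byte
 * order, and the flower attributes (be16/be32, as annotated in the enum at
 * the top of this file) expect the same, so values are copied through without
 * conversion.
 */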
/**
 * Make as many checks as possible on a UDP item, and if a flow is provided,
 * fill it appropriately with UDP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_udp(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_udp *spec = item->spec;
	const struct rte_flow_item_udp *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask;
	/* check that previous ip_proto is compatible with udp */
	if (info->ip_proto && info->ip_proto != IPPROTO_UDP)
		return -1;
	/* TC does not support UDP port masking. Only accept if exact match. */
	if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
	    (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
		return -1;
	if (!flow)
		return 0;
	msg = &flow->msg;
	tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP);
	if (!spec)
		return 0;
	if (spec->hdr.dst_port & mask->hdr.dst_port)
		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_DST,
				 spec->hdr.dst_port);
	if (spec->hdr.src_port & mask->hdr.src_port)
		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_SRC,
				 spec->hdr.src_port);
	return 0;
}

/**
 * Make as many checks as possible on a TCP item, and if a flow is provided,
 * fill it appropriately with TCP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_tcp(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_tcp *spec = item->spec;
	const struct rte_flow_item_tcp *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask;
	/* check that previous ip_proto is compatible with tcp */
	if (info->ip_proto && info->ip_proto != IPPROTO_TCP)
		return -1;
	/* TC does not support TCP port masking. Only accept if exact match. */
	if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
	    (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
		return -1;
	if (!flow)
		return 0;
	msg = &flow->msg;
	tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP);
	if (!spec)
		return 0;
	if (spec->hdr.dst_port & mask->hdr.dst_port)
		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_DST,
				 spec->hdr.dst_port);
	if (spec->hdr.src_port & mask->hdr.src_port)
		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_SRC,
				 spec->hdr.src_port);
	return 0;
}
/**
 * Check support for a given item.
 *
 * @param[in] item
 *   Item specification.
 * @param size
 *   Bit-mask size in bytes.
 * @param[in] supported_mask
 *   Bit-mask covering supported fields to compare with spec, last and mask in
 *   \item.
 * @param[in] default_mask
 *   Bit-mask default mask if none is provided in \item.
 *
 * @return
 *   0 on success.
 */
static int
tap_flow_item_validate(const struct rte_flow_item *item,
		       unsigned int size,
		       const uint8_t *supported_mask,
		       const uint8_t *default_mask)
{
	int ret = 0;

	/* An empty layer is allowed, as long as all fields are NULL */
	if (!item->spec && (item->mask || item->last))
		return -1;
	/* Is the item spec compatible with what the NIC supports? */
	if (item->spec && !item->mask) {
		unsigned int i;
		const uint8_t *spec = item->spec;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
		/* Is the default mask compatible with what the NIC supports? */
		for (i = 0; i < size; i++)
			if ((default_mask[i] | supported_mask[i]) !=
			    supported_mask[i])
				return -1;
	}
	/* Is the item last compatible with what the NIC supports? */
	if (item->last && !item->mask) {
		unsigned int i;
		const uint8_t *spec = item->last;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
	}
	/* Is the item mask compatible with what the NIC supports? */
	if (item->mask) {
		unsigned int i;
		const uint8_t *spec = item->mask;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
	}
	/*
	 * Once masked, are item spec and item last equal?
	 * TC does not support ranges, so anything else is invalid.
	 */
	if (item->spec && item->last) {
		uint8_t spec[size];
		uint8_t last[size];
		const uint8_t *apply = default_mask;
		unsigned int i;

		if (item->mask)
			apply = item->mask;
		for (i = 0; i < size; ++i) {
			spec[i] = ((const uint8_t *)item->spec)[i] & apply[i];
			last[i] = ((const uint8_t *)item->last)[i] & apply[i];
		}
		ret = memcmp(spec, last, size);
	}
	return ret;
}

/**
 * Configure the kernel with a TC action and its configured parameters
 * Handled actions: "gact", "mirred", "skbedit", "bpf"
 *
 * @param[in] flow
 *   Pointer to rte flow containing the netlink message
 *
 * @param[in, out] act_index
 *   Pointer to action sequence number in the TC command
 *
 * @param[in] adata
 *   Pointer to struct holding the action parameters
 *
 * @return
 *   -1 on failure, 0 on success
 */
static int
add_action(struct rte_flow *flow, size_t *act_index, struct action_data *adata)
{
	struct nlmsg *msg = &flow->msg;

	if (tap_nlattr_nested_start(msg, (*act_index)++) < 0)
		return -1;

	tap_nlattr_add(&msg->nh, TCA_ACT_KIND,
		       strlen(adata->id) + 1, adata->id);
	if (tap_nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
		return -1;
	if (strcmp("gact", adata->id) == 0) {
		tap_nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(adata->gact),
			       &adata->gact);
	} else if (strcmp("mirred", adata->id) == 0) {
		if (adata->mirred.eaction == TCA_EGRESS_MIRROR)
			adata->mirred.action = TC_ACT_PIPE;
		else /* REDIRECT */
			adata->mirred.action = TC_ACT_STOLEN;
		tap_nlattr_add(&msg->nh, TCA_MIRRED_PARMS,
			       sizeof(adata->mirred),
			       &adata->mirred);
	} else if (strcmp("skbedit", adata->id) == 0) {
		tap_nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS,
			       sizeof(adata->skbedit.skbedit),
			       &adata->skbedit.skbedit);
		tap_nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING,
				 adata->skbedit.queue);
	} else if (strcmp("bpf", adata->id) == 0) {
		tap_nlattr_add32(&msg->nh, TCA_ACT_BPF_FD, adata->bpf.bpf_fd);
		tap_nlattr_add(&msg->nh, TCA_ACT_BPF_NAME,
			       strlen(adata->bpf.annotation) + 1,
			       adata->bpf.annotation);
		tap_nlattr_add(&msg->nh, TCA_ACT_BPF_PARMS,
			       sizeof(adata->bpf.bpf),
			       &adata->bpf.bpf);
	} else {
		return -1;
	}
	tap_nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
	tap_nlattr_nested_finish(msg); /* nested act_index */
	return 0;
}
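/*
 * A sketch of the netlink layout produced by add_action() for one action,
 * e.g. a "gact" drop:
 *
 *   [ act_index (nest) ]
 *     [ TCA_ACT_KIND = "gact" ]
 *     [ TCA_ACT_OPTIONS (nest) ]
 *       [ TCA_GACT_PARMS = { .action = TC_ACT_SHOT, ... } ]
 *
 * add_actions() below wraps one such block per action inside the classifier's
 * own nest (TCA_FLOWER_ACT or TCA_BPF_ACT).
 */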
/**
 * Helper function to send a series of TC actions to the kernel
 *
 * @param[in] flow
 *   Pointer to rte flow containing the netlink message
 *
 * @param[in] nb_actions
 *   Number of actions in an array of action structs
 *
 * @param[in] data
 *   Pointer to an array of action structs
 *
 * @param[in] classifier_action
 *   The classifier on behalf of which the actions are configured
 *
 * @return
 *   -1 on failure, 0 on success
 */
static int
add_actions(struct rte_flow *flow, int nb_actions, struct action_data *data,
	    int classifier_action)
{
	struct nlmsg *msg = &flow->msg;
	size_t act_index = 1;
	int i;

	if (tap_nlattr_nested_start(msg, classifier_action) < 0)
		return -1;
	for (i = 0; i < nb_actions; i++)
		if (add_action(flow, &act_index, data + i) < 0)
			return -1;
	tap_nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
	return 0;
}
/**
 * Validate a flow supported by TC.
 * If flow param is not NULL, then also fill the netlink message inside.
 *
 * @param pmd
 *   Pointer to private structure.
 * @param[in] attr
 *   Flow rule attributes.
 * @param[in] pattern
 *   Pattern specification (list terminated by the END pattern item).
 * @param[in] actions
 *   Associated actions (list terminated by the END action).
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 * @param[in, out] flow
 *   Flow structure to update.
 * @param[in] mirred
 *   If set to TCA_EGRESS_REDIR, provided actions will be replaced with a
 *   redirection to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd.
 *   If set to TCA_EGRESS_MIRROR, provided actions will be replaced with a
 *   mirroring to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd. Matching packets will thus be duplicated.
 *   If set to 0, the standard behavior is to be used: set correct actions for
 *   the TC rule, and apply it on the tap netdevice.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
priv_flow_process(struct pmd_internals *pmd,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error,
		  struct rte_flow *flow,
		  int mirred)
{
	const struct tap_flow_items *cur_item = tap_flow_items;
	struct convert_data data = {
		.eth_type = 0,
		.ip_proto = 0,
		.flow = flow,
	};
	int action = 0; /* Only one action authorized for now */

	if (attr->group > MAX_GROUP) {
		rte_flow_error_set(
			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
			NULL, "group value too big: cannot exceed 15");
		return -rte_errno;
	}
	if (attr->priority > MAX_PRIORITY) {
		rte_flow_error_set(
			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
			NULL, "priority value too big");
		return -rte_errno;
	} else if (flow) {
		uint16_t group = attr->group << GROUP_SHIFT;
		uint16_t prio = group | (attr->priority +
					 RSS_PRIORITY_OFFSET + PRIORITY_OFFSET);
		flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
						 flow->msg.t.tcm_info);
	}
	if (flow) {
		if (mirred) {
			/*
			 * If attr->ingress, the rule applies on remote ingress
			 * to match incoming packets.
			 * If attr->egress, the rule applies on tap ingress (as
			 * seen from the kernel) to deal with packets going out
			 * from the DPDK app.
			 */
			flow->msg.t.tcm_parent = TC_H_MAKE(TC_H_INGRESS, 0);
		} else {
			/* Standard rule on tap egress (kernel standpoint). */
			flow->msg.t.tcm_parent =
				TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
		}
		/* use flower filter type */
		tap_nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"),
			       "flower");
		if (tap_nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0)
			goto exit_item_not_supported;
	}
	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
		const struct tap_flow_items *token = NULL;
		unsigned int i;
		int err = 0;

		if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
			continue;
		for (i = 0;
		     cur_item->items &&
		     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
		     ++i) {
			if (cur_item->items[i] == items->type) {
				token = &tap_flow_items[items->type];
				break;
			}
		}
		if (!token)
			goto exit_item_not_supported;
		cur_item = token;
		err = tap_flow_item_validate(
			items, cur_item->mask_sz,
			(const uint8_t *)cur_item->mask,
			(const uint8_t *)cur_item->default_mask);
		if (err)
			goto exit_item_not_supported;
		if (flow && cur_item->convert) {
			err = cur_item->convert(items, &data);
			if (err)
				goto exit_item_not_supported;
		}
	}
	if (flow) {
		if (data.vlan) {
			tap_nlattr_add16(&flow->msg.nh,
					 TCA_FLOWER_KEY_ETH_TYPE,
					 htons(ETH_P_8021Q));
			tap_nlattr_add16(&flow->msg.nh,
					 TCA_FLOWER_KEY_VLAN_ETH_TYPE,
					 data.eth_type ?
					 data.eth_type : htons(ETH_P_ALL));
		} else if (data.eth_type) {
			tap_nlattr_add16(&flow->msg.nh,
					 TCA_FLOWER_KEY_ETH_TYPE,
					 data.eth_type);
		}
	}
	if (mirred && flow) {
		struct action_data adata = {
			.id = "mirred",
			.mirred = {
				.eaction = mirred,
			},
		};

		/*
		 * If attr->egress && mirred, then this is a special
		 * case where the rule must be applied on the tap, to
		 * redirect packets coming from the DPDK App, out
		 * through the remote netdevice.
		 */
		adata.mirred.ifindex = attr->ingress ? pmd->if_index :
				       pmd->remote_if_index;
		if (mirred == TCA_EGRESS_MIRROR)
			adata.mirred.action = TC_ACT_PIPE;
		else
			adata.mirred.action = TC_ACT_STOLEN;
		if (add_actions(flow, 1, &adata, TCA_FLOWER_ACT) < 0)
			goto exit_action_not_supported;
		else
			goto end;
	}
	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
		int err = 0;

		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
			continue;
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (flow) {
				struct action_data adata = {
					.id = "gact",
					.gact = {
						.action = TC_ACT_SHOT,
					},
				};

				err = add_actions(flow, 1, &adata,
						  TCA_FLOWER_ACT);
			}
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (flow) {
				struct action_data adata = {
					.id = "gact",
					.gact = {
						/* continue */
						.action = TC_ACT_UNSPEC,
					},
				};

				err = add_actions(flow, 1, &adata,
						  TCA_FLOWER_ACT);
			}
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
			const struct rte_flow_action_queue *queue =
				(const struct rte_flow_action_queue *)
				actions->conf;

			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (!queue ||
			    (queue->index > pmd->dev->data->nb_rx_queues - 1))
				goto exit_action_not_supported;
			if (flow) {
				struct action_data adata = {
					.id = "skbedit",
					.skbedit = {
						.skbedit = {
							.action = TC_ACT_PIPE,
						},
						.queue = queue->index,
					},
				};

				err = add_actions(flow, 1, &adata,
						  TCA_FLOWER_ACT);
			}
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
			const struct rte_flow_action_rss *rss =
				(const struct rte_flow_action_rss *)
				actions->conf;

			if (action++)
				goto exit_action_not_supported;

			if (!pmd->rss_enabled) {
				err = rss_enable(pmd, attr, error);
				if (err)
					goto exit_action_not_supported;
			}
			if (flow && rss)
				err = rss_add_actions(flow, pmd, rss, error);
		} else {
			goto exit_action_not_supported;
		}
		if (err)
			goto exit_action_not_supported;
	}
end:
	if (flow)
		tap_nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
	return 0;
exit_item_not_supported:
	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
			   items, "item not supported");
	return -rte_errno;
exit_action_not_supported:
	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
			   actions, "action not supported");
	return -rte_errno;
}
/**
 * Validate a flow.
 *
 * @see rte_flow_validate()
 * @see rte_flow_ops
 */
static int
tap_flow_validate(struct rte_eth_dev *dev,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	return priv_flow_process(pmd, attr, items, actions, error, NULL, 0);
}

/**
 * Set a unique handle in a flow.
 *
 * The kernel supports TC rules with equal priority, as long as they use the
 * same matching fields (e.g.: dst mac and ipv4) with different values (and
 * full mask to ensure no collision is possible).
 * In those rules, the handle (uint32_t) is the part that uniquely identifies
 * each rule.
 *
 * On 32-bit architectures, the handle can simply be the flow's pointer
 * address. On 64-bit architectures, we rely on jhash(flow) to find a
 * (sufficiently) unique handle.
 *
 * @param[in, out] flow
 *   The flow that needs its handle set.
 */
static void
tap_flow_set_handle(struct rte_flow *flow)
{
	uint32_t handle = 0;

	if (sizeof(flow) > 4)
		handle = rte_jhash(&flow, sizeof(flow), 1);
	else
		handle = (uintptr_t)flow;
	/* must be at least 1 to avoid letting the kernel choose one for us */
	if (!handle)
		handle = 1;
	flow->msg.t.tcm_handle = handle;
}
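/*
 * Example: on a 64-bit build, sizeof(flow) is 8, so the handle becomes
 * rte_jhash(&flow, sizeof(flow), 1), i.e. a 32-bit hash of the pointer value.
 * A hash collision only matters if two rules also share priority and matching
 * fields, in which case the kernel rejects the second one (the "overlapping
 * rules" error path in tap_flow_create()).
 */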
/**
 * Create a flow.
 *
 * @see rte_flow_create()
 * @see rte_flow_ops
 */
static struct rte_flow *
tap_flow_create(struct rte_eth_dev *dev,
		const struct rte_flow_attr *attr,
		const struct rte_flow_item items[],
		const struct rte_flow_action actions[],
		struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct rte_flow *remote_flow = NULL;
	struct rte_flow *flow = NULL;
	struct nlmsg *msg = NULL;
	int err;

	if (!pmd->if_index) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL,
				   "can't create rule, ifindex not found");
		goto fail;
	}
	/*
	 * No rules configured through standard rte_flow should be set on the
	 * priorities used by implicit rules.
	 */
	if ((attr->group == MAX_GROUP) &&
	    attr->priority > (MAX_PRIORITY - TAP_REMOTE_MAX_IDX)) {
		rte_flow_error_set(
			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
			NULL, "priority value too big");
		goto fail;
	}
	flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
	if (!flow) {
		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL,
				   "cannot allocate memory for rte_flow");
		goto fail;
	}
	msg = &flow->msg;
	tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
	tap_flow_set_handle(flow);
	if (priv_flow_process(pmd, attr, items, actions, error, flow, 0))
		goto fail;
	err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
	if (err < 0) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "couldn't send request to kernel");
		goto fail;
	}
	err = tap_nl_recv_ack(pmd->nlsk_fd);
	if (err < 0) {
		RTE_LOG(ERR, PMD,
			"Kernel refused TC filter rule creation (%d): %s\n",
			errno, strerror(errno));
		rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL,
				   "overlapping rules or Kernel too old for flower support");
		goto fail;
	}
	LIST_INSERT_HEAD(&pmd->flows, flow, next);
	/*
	 * If a remote device is configured, a TC rule with identical items for
	 * matching must be set on that device, with a single action: redirect
	 * to the local pmd->if_index.
	 */
	if (pmd->remote_if_index) {
		remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
		if (!remote_flow) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
				"cannot allocate memory for rte_flow");
			goto fail;
		}
		msg = &remote_flow->msg;
		/* set the rule if_index for the remote netdevice */
		tc_init_msg(
			msg, pmd->remote_if_index, RTM_NEWTFILTER,
			NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
		msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
		tap_flow_set_handle(remote_flow);
		if (priv_flow_process(pmd, attr, items, NULL,
				      error, remote_flow, TCA_EGRESS_REDIR)) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "rte flow rule validation failed");
			goto fail;
		}
		err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
		if (err < 0) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "Failure sending nl request");
			goto fail;
		}
		err = tap_nl_recv_ack(pmd->nlsk_fd);
		if (err < 0) {
			RTE_LOG(ERR, PMD,
				"Kernel refused TC filter rule creation (%d): %s\n",
				errno, strerror(errno));
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL,
				"overlapping rules or Kernel too old for flower support");
			goto fail;
		}
		flow->remote_flow = remote_flow;
	}
	return flow;
fail:
	if (remote_flow)
		rte_free(remote_flow);
	if (flow)
		rte_free(flow);
	return NULL;
}
/**
 * Destroy a flow using pointer to pmd_internal.
 *
 * @param[in, out] pmd
 *   Pointer to private structure.
 * @param[in] flow
 *   Pointer to the flow to destroy.
 * @param[in, out] error
 *   Pointer to the flow error handler
 *
 * @return 0 if the flow could be destroyed, -1 otherwise.
 */
static int
tap_flow_destroy_pmd(struct pmd_internals *pmd,
		     struct rte_flow *flow,
		     struct rte_flow_error *error)
{
	struct rte_flow *remote_flow = flow->remote_flow;
	int i;
	int ret = 0;

	LIST_REMOVE(flow, next);
	flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	flow->msg.nh.nlmsg_type = RTM_DELTFILTER;

	ret = tap_nl_send(pmd->nlsk_fd, &flow->msg.nh);
	if (ret < 0) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "couldn't send request to kernel");
		goto end;
	}
	ret = tap_nl_recv_ack(pmd->nlsk_fd);
	/* If errno is ENOENT, the rule is already no longer in the kernel. */
	if (ret < 0 && errno == ENOENT)
		ret = 0;
	if (ret < 0) {
		RTE_LOG(ERR, PMD,
			"Kernel refused TC filter rule deletion (%d): %s\n",
			errno, strerror(errno));
		rte_flow_error_set(
			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
			"couldn't receive kernel ack to our request");
		goto end;
	}
	/* Close opened BPF file descriptors of this flow */
	for (i = 0; i < SEC_MAX; i++)
		if (flow->bpf_fd[i] != 0) {
			close(flow->bpf_fd[i]);
			flow->bpf_fd[i] = 0;
		}

	/* Release map key for this RSS rule */
	ret = bpf_rss_key(KEY_CMD_RELEASE, &flow->key_idx);
	if (ret < 0) {
		rte_flow_error_set(
			error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
			"Failed to release BPF RSS key");

		goto end;
	}

	if (remote_flow) {
		remote_flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
		remote_flow->msg.nh.nlmsg_type = RTM_DELTFILTER;

		ret = tap_nl_send(pmd->nlsk_fd, &remote_flow->msg.nh);
		if (ret < 0) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "Failure sending nl request");
			goto end;
		}
		ret = tap_nl_recv_ack(pmd->nlsk_fd);
		if (ret < 0 && errno == ENOENT)
			ret = 0;
		if (ret < 0) {
			RTE_LOG(ERR, PMD,
				"Kernel refused TC filter rule deletion (%d): %s\n",
				errno, strerror(errno));
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "Failure trying to receive nl ack");
			goto end;
		}
	}
end:
	if (remote_flow)
		rte_free(remote_flow);
	rte_free(flow);
	return ret;
}
/**
 * Destroy a flow.
 *
 * @see rte_flow_destroy()
 * @see rte_flow_ops
 */
static int
tap_flow_destroy(struct rte_eth_dev *dev,
		 struct rte_flow *flow,
		 struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	return tap_flow_destroy_pmd(pmd, flow, error);
}

/**
 * Enable/disable flow isolation.
 *
 * @see rte_flow_isolate()
 * @see rte_flow_ops
 */
static int
tap_flow_isolate(struct rte_eth_dev *dev,
		 int set,
		 struct rte_flow_error *error __rte_unused)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	if (set)
		pmd->flow_isolate = 1;
	else
		pmd->flow_isolate = 0;
	/*
	 * If the netdevice is there, set up appropriate flow rules
	 * immediately. Otherwise it will be set when bringing up the
	 * netdevice (tun_alloc).
	 */
	if (!pmd->rxq[0].fd)
		return 0;
	if (set) {
		struct rte_flow *flow;

		while (1) {
			flow = LIST_FIRST(&pmd->implicit_flows);
			if (!flow)
				break;
			/*
			 * Remove all implicit rules on the remote.
			 * Keep the local rule to redirect packets on TX.
			 * Keep also the last implicit local rule: ISOLATE.
			 */
			if (flow->msg.t.tcm_ifindex == pmd->if_index)
				break;
			if (tap_flow_destroy_pmd(pmd, flow, NULL) < 0)
				goto error;
		}
		/* Switch the TC rule according to pmd->flow_isolate */
		if (tap_flow_implicit_create(pmd, TAP_ISOLATE) == -1)
			goto error;
	} else {
		/* Switch the TC rule according to pmd->flow_isolate */
		if (tap_flow_implicit_create(pmd, TAP_ISOLATE) == -1)
			goto error;
		if (!pmd->remote_if_index)
			return 0;
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_TX) < 0)
			goto error;
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0)
			goto error;
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCAST) < 0)
			goto error;
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCASTV6) < 0)
			goto error;
		if (dev->data->promiscuous &&
		    tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC) < 0)
			goto error;
		if (dev->data->all_multicast &&
		    tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI) < 0)
			goto error;
	}
	return 0;
error:
	pmd->flow_isolate = 0;
	return rte_flow_error_set(
		error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
		"TC rule creation failed");
}
/**
 * Destroy all flows.
 *
 * @see rte_flow_flush()
 * @see rte_flow_ops
 */
int
tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct rte_flow *flow;

	while (!LIST_EMPTY(&pmd->flows)) {
		flow = LIST_FIRST(&pmd->flows);
		if (tap_flow_destroy(dev, flow, error) < 0)
			return -1;
	}
	return 0;
}

/**
 * Add an implicit flow rule on the remote device to make sure traffic gets to
 * the tap netdevice from there.
 *
 * @param pmd
 *   Pointer to private structure.
 * @param[in] idx
 *   The idx in the implicit_rte_flows array specifying which rule to apply.
 *
 * @return -1 if the rule couldn't be applied, 0 otherwise.
 */
int tap_flow_implicit_create(struct pmd_internals *pmd,
			     enum implicit_rule_index idx)
{
	uint16_t flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE;
	struct rte_flow_action *actions = implicit_rte_flows[idx].actions;
	struct rte_flow_action isolate_actions[2] = {
		[1] = {
			.type = RTE_FLOW_ACTION_TYPE_END,
		},
	};
	struct rte_flow_item *items = implicit_rte_flows[idx].items;
	struct rte_flow_attr *attr = &implicit_rte_flows[idx].attr;
	struct rte_flow_item_eth eth_local = { .type = 0 };
	uint16_t if_index = pmd->remote_if_index;
	struct rte_flow *remote_flow = NULL;
	struct nlmsg *msg = NULL;
	int err = 0;
	struct rte_flow_item items_local[2] = {
		[0] = {
			.type = items[0].type,
			.spec = &eth_local,
			.mask = items[0].mask,
		},
		[1] = {
			.type = items[1].type,
		}
	};

	remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
	if (!remote_flow) {
		RTE_LOG(ERR, PMD, "Cannot allocate memory for rte_flow\n");
		goto fail;
	}
	msg = &remote_flow->msg;
	if (idx == TAP_REMOTE_TX) {
		if_index = pmd->if_index;
	} else if (idx == TAP_ISOLATE) {
		if_index = pmd->if_index;
		/* Don't be exclusive for this rule, it can be changed later. */
		flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE;
		isolate_actions[0].type = pmd->flow_isolate ?
			RTE_FLOW_ACTION_TYPE_DROP :
			RTE_FLOW_ACTION_TYPE_PASSTHRU;
		actions = isolate_actions;
	} else if (idx == TAP_REMOTE_LOCAL_MAC) {
		/*
		 * eth addr couldn't be set in implicit_rte_flows[] as it is
		 * not known at compile time.
		 */
		memcpy(&eth_local.dst, &pmd->eth_addr, sizeof(pmd->eth_addr));
		items = items_local;
	}
	tc_init_msg(msg, if_index, RTM_NEWTFILTER, flags);
	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
	/*
	 * The ISOLATE rule is always present and must have a static handle, as
	 * its action is changed depending on whether the feature is enabled
	 * (DROP) or disabled (PASSTHRU).
	 */
	if (idx == TAP_ISOLATE)
		remote_flow->msg.t.tcm_handle = ISOLATE_HANDLE;
	else
		tap_flow_set_handle(remote_flow);
	if (priv_flow_process(pmd, attr, items, actions, NULL,
			      remote_flow, implicit_rte_flows[idx].mirred)) {
		RTE_LOG(ERR, PMD, "rte flow rule validation failed\n");
		goto fail;
	}
	err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
	if (err < 0) {
		RTE_LOG(ERR, PMD, "Failure sending nl request\n");
		goto fail;
	}
	err = tap_nl_recv_ack(pmd->nlsk_fd);
	if (err < 0) {
		RTE_LOG(ERR, PMD,
			"Kernel refused TC filter rule creation (%d): %s\n",
			errno, strerror(errno));
		goto fail;
	}
	LIST_INSERT_HEAD(&pmd->implicit_flows, remote_flow, next);
	return 0;
fail:
	if (remote_flow)
		rte_free(remote_flow);
	return -1;
}
/**
 * Remove a specific implicit flow rule on the remote device.
 *
 * @param[in, out] pmd
 *   Pointer to private structure.
 * @param[in] idx
 *   The idx in the implicit_rte_flows array specifying which rule to remove.
 *
 * @return -1 if the implicit rule couldn't be destroyed, 0 otherwise.
 */
int tap_flow_implicit_destroy(struct pmd_internals *pmd,
			      enum implicit_rule_index idx)
{
	struct rte_flow *remote_flow;
	int cur_prio = -1;
	int idx_prio = implicit_rte_flows[idx].attr.priority + PRIORITY_OFFSET;

	for (remote_flow = LIST_FIRST(&pmd->implicit_flows);
	     remote_flow;
	     remote_flow = LIST_NEXT(remote_flow, next)) {
		cur_prio = (remote_flow->msg.t.tcm_info >> 16) & PRIORITY_MASK;
		if (cur_prio != idx_prio)
			continue;
		return tap_flow_destroy_pmd(pmd, remote_flow, NULL);
	}
	return 0;
}

/**
 * Destroy all implicit flows.
 *
 * @see rte_flow_flush()
 */
int
tap_flow_implicit_flush(struct pmd_internals *pmd, struct rte_flow_error *error)
{
	struct rte_flow *remote_flow;

	while (!LIST_EMPTY(&pmd->implicit_flows)) {
		remote_flow = LIST_FIRST(&pmd->implicit_flows);
		if (tap_flow_destroy_pmd(pmd, remote_flow, error) < 0)
			return -1;
	}
	return 0;
}

#define MAX_RSS_KEYS 256
#define SEC_NAME_CLS_Q "cls_q"

const char *sec_name[SEC_MAX] = {
	[SEC_L3_L4] = "l3_l4",
};
/**
 * Enable RSS on tap: create TC rules for queuing.
 *
 * @param[in, out] pmd
 *   Pointer to private structure.
 *
 * @param[in] attr
 *   Pointer to rte_flow attributes to get the flow group.
 *
 * @param[out] error
 *   Pointer to error reporting if not NULL.
 *
 * @return 0 on success, negative value on failure.
 */
static int rss_enable(struct pmd_internals *pmd,
		      const struct rte_flow_attr *attr,
		      struct rte_flow_error *error)
{
	struct rte_flow *rss_flow = NULL;
	struct nlmsg *msg = NULL;
	/* 4096 is the maximum number of instructions for a BPF program */
	char annotation[64];
	int i;
	int err = 0;

	/* Remove the locked memory limit */
	struct rlimit memlock_limit = {
		.rlim_cur = RLIM_INFINITY,
		.rlim_max = RLIM_INFINITY,
	};
	setrlimit(RLIMIT_MEMLOCK, &memlock_limit);

	/* Get a new map key for a new RSS rule */
	err = bpf_rss_key(KEY_CMD_INIT, NULL);
	if (err < 0) {
		rte_flow_error_set(
			error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
			"Failed to initialize BPF RSS keys");

		return -1;
	}

	/* Create the BPF RSS map */
	pmd->map_fd = tap_flow_bpf_rss_map_create(sizeof(__u32), /* key size */
						  sizeof(struct rss_key),
						  MAX_RSS_KEYS);
	if (pmd->map_fd < 0) {
		RTE_LOG(ERR, PMD,
			"Failed to create BPF map (%d): %s\n",
			errno, strerror(errno));
		rte_flow_error_set(
			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
			"Kernel too old or not configured "
			"to support BPF maps");

		return -ENOTSUP;
	}

	/*
	 * Add a rule per queue to match reclassified packets and direct them
	 * to the correct queue.
	 */
	for (i = 0; i < pmd->dev->data->nb_rx_queues; i++) {
		pmd->bpf_fd[i] = tap_flow_bpf_cls_q(i);
		if (pmd->bpf_fd[i] < 0) {
			RTE_LOG(ERR, PMD,
				"Failed to load BPF section %s for queue %d",
				SEC_NAME_CLS_Q, i);
			rte_flow_error_set(
				error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL,
				"Kernel too old or not configured "
				"to support BPF programs loading");

			return -ENOTSUP;
		}

		rss_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
		if (!rss_flow) {
			RTE_LOG(ERR, PMD,
				"Cannot allocate memory for rte_flow");
			return -1;
		}
		msg = &rss_flow->msg;
		tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER, NLM_F_REQUEST |
			    NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
		msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
		tap_flow_set_handle(rss_flow);
		uint16_t group = attr->group << GROUP_SHIFT;
		uint16_t prio = group | (i + PRIORITY_OFFSET);
		msg->t.tcm_info = TC_H_MAKE(prio << 16, msg->t.tcm_info);
		msg->t.tcm_parent = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);

		tap_nlattr_add(&msg->nh, TCA_KIND, sizeof("bpf"), "bpf");
		if (tap_nlattr_nested_start(msg, TCA_OPTIONS) < 0)
			return -1;
		tap_nlattr_add32(&msg->nh, TCA_BPF_FD, pmd->bpf_fd[i]);
		snprintf(annotation, sizeof(annotation), "[%s%d]",
			 SEC_NAME_CLS_Q, i);
		tap_nlattr_add(&msg->nh, TCA_BPF_NAME, strlen(annotation) + 1,
			       annotation);
		/* Actions */
		{
			struct action_data adata = {
				.id = "skbedit",
				.skbedit = {
					.skbedit = {
						.action = TC_ACT_PIPE,
					},
					.queue = i,
				},
			};

			if (add_actions(rss_flow, 1, &adata, TCA_BPF_ACT) < 0)
				return -1;
		}
		tap_nlattr_nested_finish(msg); /* nested TCA_OPTIONS */

		/* Netlink message is now ready to be sent */
		if (tap_nl_send(pmd->nlsk_fd, &msg->nh) < 0)
			return -1;
		err = tap_nl_recv_ack(pmd->nlsk_fd);
		if (err < 0) {
			RTE_LOG(ERR, PMD,
				"Kernel refused TC filter rule creation (%d): %s\n",
				errno, strerror(errno));
			return err;
		}
		LIST_INSERT_HEAD(&pmd->rss_flows, rss_flow, next);
	}

	pmd->rss_enabled = 1;
	return err;
}
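/*
 * With the loop above, each RX queue i gets its own BPF classifier annotated
 * "[cls_q<i>]" (see the snprintf() on annotation), installed at priority
 * i + PRIORITY_OFFSET within the flow group, so packets reclassified by the
 * RSS program land on the right queue.
 */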
/**
 * Manage the BPF RSS keys repository with operations: init, get, release
 *
 * @param[in] cmd
 *   Command on RSS keys: init, get, release
 *
 * @param[in, out] key_idx
 *   Pointer to RSS Key index (out for get command, in for release command)
 *
 * @return -1 if the RSS keys couldn't be initialized, obtained or released,
 *   0 otherwise.
 */
static int bpf_rss_key(enum bpf_rss_key_e cmd, __u32 *key_idx)
{
	__u32 i;
	int err = -1;
	static __u32 num_used_keys;
	static __u32 rss_keys[MAX_RSS_KEYS] = {KEY_STAT_UNSPEC};
	static __u32 rss_keys_initialized;

	switch (cmd) {
	case KEY_CMD_GET:
		if (!rss_keys_initialized)
			break;

		if (num_used_keys == RTE_DIM(rss_keys))
			break;

		*key_idx = num_used_keys % RTE_DIM(rss_keys);
		while (rss_keys[*key_idx] == KEY_STAT_USED)
			*key_idx = (*key_idx + 1) % RTE_DIM(rss_keys);

		rss_keys[*key_idx] = KEY_STAT_USED;
		num_used_keys++;
		err = 0;
		break;

	case KEY_CMD_RELEASE:
		if (!rss_keys_initialized) {
			err = 0;
			break;
		}

		if (rss_keys[*key_idx] == KEY_STAT_USED) {
			rss_keys[*key_idx] = KEY_STAT_AVAILABLE;
			num_used_keys--;
			err = 0;
		}
		break;

	case KEY_CMD_INIT:
		for (i = 0; i < RTE_DIM(rss_keys); i++)
			rss_keys[i] = KEY_STAT_AVAILABLE;

		rss_keys_initialized = 1;
		num_used_keys = 0;
		err = 0;
		break;

	case KEY_CMD_DEINIT:
		for (i = 0; i < RTE_DIM(rss_keys); i++)
			rss_keys[i] = KEY_STAT_UNSPEC;

		rss_keys_initialized = 0;
		num_used_keys = 0;
		err = 0;
		break;

	default:
		break;
	}

	return err;
}
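/*
 * Typical key lifecycle, as used by the rest of this file:
 *   rss_enable()           -> bpf_rss_key(KEY_CMD_INIT, NULL)
 *   rss_add_actions()      -> bpf_rss_key(KEY_CMD_GET, &flow->key_idx)
 *   tap_flow_destroy_pmd() -> bpf_rss_key(KEY_CMD_RELEASE, &flow->key_idx)
 */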
/**
 * Add RSS hash calculations and queue selection
 *
 * @param[in, out] pmd
 *   Pointer to internal structure. Used to set/get RSS map fd
 *
 * @param[in] rss
 *   Pointer to RSS flow actions
 *
 * @param[out] error
 *   Pointer to error reporting if not NULL.
 *
 * @return 0 on success, negative value on failure
 */
static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
			   const struct rte_flow_action_rss *rss,
			   struct rte_flow_error *error)
{
	/* 4096 is the maximum number of instructions for a BPF program */
	int i;
	int err;
	struct rss_key rss_entry = { .hash_fields = 0,
				     .key_size = 0 };

	/* Get a new map key for a new RSS rule */
	err = bpf_rss_key(KEY_CMD_GET, &flow->key_idx);
	if (err < 0) {
		rte_flow_error_set(
			error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
			"Failed to get BPF RSS key");

		return -1;
	}

	/* Update RSS map entry with queues */
	rss_entry.nb_queues = rss->num;
	for (i = 0; i < rss->num; i++)
		rss_entry.queues[i] = rss->queue[i];
	rss_entry.hash_fields =
		(1 << HASH_FIELD_IPV4_L3_L4) | (1 << HASH_FIELD_IPV6_L3_L4);

	/* Add this RSS entry to map */
	err = tap_flow_bpf_update_rss_elem(pmd->map_fd,
					   &flow->key_idx, &rss_entry);

	if (err) {
		RTE_LOG(ERR, PMD,
			"Failed to update BPF map entry #%u (%d): %s\n",
			flow->key_idx, errno, strerror(errno));
		rte_flow_error_set(
			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
			"Kernel too old or not configured "
			"to support BPF maps updates");

		return -ENOTSUP;
	}


	/*
	 * Load bpf rules to calculate hash for this key_idx
	 */

	flow->bpf_fd[SEC_L3_L4] =
		tap_flow_bpf_calc_l3_l4_hash(flow->key_idx, pmd->map_fd);
	if (flow->bpf_fd[SEC_L3_L4] < 0) {
		RTE_LOG(ERR, PMD,
			"Failed to load BPF section %s (%d): %s\n",
			sec_name[SEC_L3_L4], errno, strerror(errno));
		rte_flow_error_set(
			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
			"Kernel too old or not configured "
			"to support BPF program loading");

		return -ENOTSUP;
	}

	/* Actions */
	{
		struct action_data adata[] = {
			{
				.id = "bpf",
				.bpf = {
					.bpf_fd = flow->bpf_fd[SEC_L3_L4],
					.annotation = sec_name[SEC_L3_L4],
					.bpf = {
						.action = TC_ACT_PIPE,
					},
				},
			},
		};

		if (add_actions(flow, RTE_DIM(adata), adata,
				TCA_FLOWER_ACT) < 0)
			return -1;
	}

	return 0;
}

/**
 * Manage filter operations.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param filter_type
 *   Filter type.
 * @param filter_op
 *   Operation to perform.
 * @param arg
 *   Pointer to operation-specific structure.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
tap_dev_filter_ctrl(struct rte_eth_dev *dev,
		    enum rte_filter_type filter_type,
		    enum rte_filter_op filter_op,
		    void *arg)
{
	switch (filter_type) {
	case RTE_ETH_FILTER_GENERIC:
		if (filter_op != RTE_ETH_FILTER_GET)
			return -EINVAL;
		*(const void **)arg = &tap_flow_ops;
		return 0;
	default:
		RTE_LOG(ERR, PMD, "%p: filter type (%d) not supported\n",
			(void *)dev, filter_type);
	}
	return -EINVAL;
}