/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2017 6WIND S.A.
 * Copyright 2017 Mellanox Technologies, Ltd
 */

#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/queue.h>
#include <sys/resource.h>

#include <rte_byteorder.h>
#include <rte_jhash.h>
#include <rte_random.h>
#include <rte_malloc.h>
#include <rte_eth_tap.h>
#include <tap_flow.h>
#include <tap_autoconf.h>
#include <tap_tcmsgs.h>
#include <tap_rss.h>

#ifndef HAVE_TC_FLOWER
/*
 * For kernels < 4.2, this enum is not defined. Runtime checks will be made to
 * avoid sending TC messages the kernel cannot understand.
 */
enum {
	TCA_FLOWER_UNSPEC,
	TCA_FLOWER_CLASSID,
	TCA_FLOWER_INDEV,
	TCA_FLOWER_ACT,
	TCA_FLOWER_KEY_ETH_DST,		/* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_DST_MASK,	/* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_SRC,		/* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_SRC_MASK,	/* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_TYPE,	/* be16 */
	TCA_FLOWER_KEY_IP_PROTO,	/* u8 */
	TCA_FLOWER_KEY_IPV4_SRC,	/* be32 */
	TCA_FLOWER_KEY_IPV4_SRC_MASK,	/* be32 */
	TCA_FLOWER_KEY_IPV4_DST,	/* be32 */
	TCA_FLOWER_KEY_IPV4_DST_MASK,	/* be32 */
	TCA_FLOWER_KEY_IPV6_SRC,	/* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_SRC_MASK,	/* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_DST,	/* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_DST_MASK,	/* struct in6_addr */
	TCA_FLOWER_KEY_TCP_SRC,		/* be16 */
	TCA_FLOWER_KEY_TCP_DST,		/* be16 */
	TCA_FLOWER_KEY_UDP_SRC,		/* be16 */
	TCA_FLOWER_KEY_UDP_DST,		/* be16 */
};
#endif
#ifndef HAVE_TC_VLAN_ID
enum {
	/* TCA_FLOWER_FLAGS, */
	TCA_FLOWER_KEY_VLAN_ID = TCA_FLOWER_KEY_UDP_DST + 2,	/* be16 */
	TCA_FLOWER_KEY_VLAN_PRIO,	/* u8 */
	TCA_FLOWER_KEY_VLAN_ETH_TYPE,	/* be16 */
};
#endif
/*
 * For kernels < 4.2 BPF related enums may not be defined.
 * Runtime checks will be carried out to gracefully report on TC messages that
 * are rejected by the kernel. Rejection reasons may be due to:
 *   1. enum is not defined
 *   2. enum is defined but kernel is not configured to support BPF system
 *      calls, BPF classifications or BPF actions.
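 *
 * Note: the fallback definitions below simply mirror the numbering of the
 * corresponding kernel UAPI enums, so the netlink attributes built in this
 * file keep the values newer kernels expect even when the build headers
 * predate them.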
67 */ 68 #ifndef HAVE_TC_BPF 69 enum { 70 TCA_BPF_UNSPEC, 71 TCA_BPF_ACT, 72 TCA_BPF_POLICE, 73 TCA_BPF_CLASSID, 74 TCA_BPF_OPS_LEN, 75 TCA_BPF_OPS, 76 }; 77 #endif 78 #ifndef HAVE_TC_BPF_FD 79 enum { 80 TCA_BPF_FD = TCA_BPF_OPS + 1, 81 TCA_BPF_NAME, 82 }; 83 #endif 84 #ifndef HAVE_TC_ACT_BPF 85 #define tc_gen \ 86 __u32 index; \ 87 __u32 capab; \ 88 int action; \ 89 int refcnt; \ 90 int bindcnt 91 92 struct tc_act_bpf { 93 tc_gen; 94 }; 95 96 enum { 97 TCA_ACT_BPF_UNSPEC, 98 TCA_ACT_BPF_TM, 99 TCA_ACT_BPF_PARMS, 100 TCA_ACT_BPF_OPS_LEN, 101 TCA_ACT_BPF_OPS, 102 }; 103 104 #endif 105 #ifndef HAVE_TC_ACT_BPF_FD 106 enum { 107 TCA_ACT_BPF_FD = TCA_ACT_BPF_OPS + 1, 108 TCA_ACT_BPF_NAME, 109 }; 110 #endif 111 112 /* RSS key management */ 113 enum bpf_rss_key_e { 114 KEY_CMD_GET = 1, 115 KEY_CMD_RELEASE, 116 KEY_CMD_INIT, 117 KEY_CMD_DEINIT, 118 }; 119 120 enum key_status_e { 121 KEY_STAT_UNSPEC, 122 KEY_STAT_USED, 123 KEY_STAT_AVAILABLE, 124 }; 125 126 #define ISOLATE_HANDLE 1 127 #define REMOTE_PROMISCUOUS_HANDLE 2 128 129 struct rte_flow { 130 LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */ 131 struct rte_flow *remote_flow; /* associated remote flow */ 132 int bpf_fd[SEC_MAX]; /* list of bfs fds per ELF section */ 133 uint32_t key_idx; /* RSS rule key index into BPF map */ 134 struct nlmsg msg; 135 }; 136 137 struct convert_data { 138 uint16_t eth_type; 139 uint16_t ip_proto; 140 uint8_t vlan; 141 struct rte_flow *flow; 142 }; 143 144 struct remote_rule { 145 struct rte_flow_attr attr; 146 struct rte_flow_item items[2]; 147 struct rte_flow_action actions[2]; 148 int mirred; 149 }; 150 151 struct action_data { 152 char id[16]; 153 154 union { 155 struct tc_gact gact; 156 struct tc_mirred mirred; 157 struct skbedit { 158 struct tc_skbedit skbedit; 159 uint16_t queue; 160 } skbedit; 161 struct bpf { 162 struct tc_act_bpf bpf; 163 int bpf_fd; 164 const char *annotation; 165 } bpf; 166 }; 167 }; 168 169 static int tap_flow_create_eth(const struct rte_flow_item *item, void *data); 170 static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data); 171 static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data); 172 static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data); 173 static int tap_flow_create_udp(const struct rte_flow_item *item, void *data); 174 static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data); 175 static int 176 tap_flow_validate(struct rte_eth_dev *dev, 177 const struct rte_flow_attr *attr, 178 const struct rte_flow_item items[], 179 const struct rte_flow_action actions[], 180 struct rte_flow_error *error); 181 182 static struct rte_flow * 183 tap_flow_create(struct rte_eth_dev *dev, 184 const struct rte_flow_attr *attr, 185 const struct rte_flow_item items[], 186 const struct rte_flow_action actions[], 187 struct rte_flow_error *error); 188 189 static void 190 tap_flow_free(struct pmd_internals *pmd, 191 struct rte_flow *flow); 192 193 static int 194 tap_flow_destroy(struct rte_eth_dev *dev, 195 struct rte_flow *flow, 196 struct rte_flow_error *error); 197 198 static int 199 tap_flow_isolate(struct rte_eth_dev *dev, 200 int set, 201 struct rte_flow_error *error); 202 203 static int bpf_rss_key(enum bpf_rss_key_e cmd, __u32 *key_idx); 204 static int rss_enable(struct pmd_internals *pmd, 205 const struct rte_flow_attr *attr, 206 struct rte_flow_error *error); 207 static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd, 208 const struct rte_flow_action_rss *rss, 
209 struct rte_flow_error *error); 210 211 static const struct rte_flow_ops tap_flow_ops = { 212 .validate = tap_flow_validate, 213 .create = tap_flow_create, 214 .destroy = tap_flow_destroy, 215 .flush = tap_flow_flush, 216 .isolate = tap_flow_isolate, 217 }; 218 219 /* Static initializer for items. */ 220 #define ITEMS(...) \ 221 (const enum rte_flow_item_type []){ \ 222 __VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \ 223 } 224 225 /* Structure to generate a simple graph of layers supported by the NIC. */ 226 struct tap_flow_items { 227 /* Bit-mask corresponding to what is supported for this item. */ 228 const void *mask; 229 const unsigned int mask_sz; /* Bit-mask size in bytes. */ 230 /* 231 * Bit-mask corresponding to the default mask, if none is provided 232 * along with the item. 233 */ 234 const void *default_mask; 235 /** 236 * Conversion function from rte_flow to netlink attributes. 237 * 238 * @param item 239 * rte_flow item to convert. 240 * @param data 241 * Internal structure to store the conversion. 242 * 243 * @return 244 * 0 on success, negative value otherwise. 245 */ 246 int (*convert)(const struct rte_flow_item *item, void *data); 247 /** List of possible following items. */ 248 const enum rte_flow_item_type *const items; 249 }; 250 251 /* Graph of supported items and associated actions. */ 252 static const struct tap_flow_items tap_flow_items[] = { 253 [RTE_FLOW_ITEM_TYPE_END] = { 254 .items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH), 255 }, 256 [RTE_FLOW_ITEM_TYPE_ETH] = { 257 .items = ITEMS( 258 RTE_FLOW_ITEM_TYPE_VLAN, 259 RTE_FLOW_ITEM_TYPE_IPV4, 260 RTE_FLOW_ITEM_TYPE_IPV6), 261 .mask = &(const struct rte_flow_item_eth){ 262 .hdr.dst_addr.addr_bytes = "\xff\xff\xff\xff\xff\xff", 263 .hdr.src_addr.addr_bytes = "\xff\xff\xff\xff\xff\xff", 264 .hdr.ether_type = -1, 265 }, 266 .mask_sz = sizeof(struct rte_flow_item_eth), 267 .default_mask = &rte_flow_item_eth_mask, 268 .convert = tap_flow_create_eth, 269 }, 270 [RTE_FLOW_ITEM_TYPE_VLAN] = { 271 .items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4, 272 RTE_FLOW_ITEM_TYPE_IPV6), 273 .mask = &(const struct rte_flow_item_vlan){ 274 /* DEI matching is not supported */ 275 #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN 276 .hdr.vlan_tci = 0xffef, 277 #else 278 .hdr.vlan_tci = 0xefff, 279 #endif 280 .hdr.eth_proto = -1, 281 }, 282 .mask_sz = sizeof(struct rte_flow_item_vlan), 283 .default_mask = &rte_flow_item_vlan_mask, 284 .convert = tap_flow_create_vlan, 285 }, 286 [RTE_FLOW_ITEM_TYPE_IPV4] = { 287 .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP, 288 RTE_FLOW_ITEM_TYPE_TCP), 289 .mask = &(const struct rte_flow_item_ipv4){ 290 .hdr = { 291 .src_addr = -1, 292 .dst_addr = -1, 293 .next_proto_id = -1, 294 }, 295 }, 296 .mask_sz = sizeof(struct rte_flow_item_ipv4), 297 .default_mask = &rte_flow_item_ipv4_mask, 298 .convert = tap_flow_create_ipv4, 299 }, 300 [RTE_FLOW_ITEM_TYPE_IPV6] = { 301 .items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP, 302 RTE_FLOW_ITEM_TYPE_TCP), 303 .mask = &(const struct rte_flow_item_ipv6){ 304 .hdr = { 305 .src_addr = { 306 "\xff\xff\xff\xff\xff\xff\xff\xff" 307 "\xff\xff\xff\xff\xff\xff\xff\xff", 308 }, 309 .dst_addr = { 310 "\xff\xff\xff\xff\xff\xff\xff\xff" 311 "\xff\xff\xff\xff\xff\xff\xff\xff", 312 }, 313 .proto = -1, 314 }, 315 }, 316 .mask_sz = sizeof(struct rte_flow_item_ipv6), 317 .default_mask = &rte_flow_item_ipv6_mask, 318 .convert = tap_flow_create_ipv6, 319 }, 320 [RTE_FLOW_ITEM_TYPE_UDP] = { 321 .mask = &(const struct rte_flow_item_udp){ 322 .hdr = { 323 .src_port = -1, 324 .dst_port = -1, 325 }, 326 }, 327 .mask_sz = sizeof(struct 
rte_flow_item_udp),
		.default_mask = &rte_flow_item_udp_mask,
		.convert = tap_flow_create_udp,
	},
	[RTE_FLOW_ITEM_TYPE_TCP] = {
		.mask = &(const struct rte_flow_item_tcp){
			.hdr = {
				.src_port = -1,
				.dst_port = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_tcp),
		.default_mask = &rte_flow_item_tcp_mask,
		.convert = tap_flow_create_tcp,
	},
};

/*
 * TC rules, by growing priority
 *
 *        Remote netdevice          Tap netdevice
 * +-------------+-------------+   +-------------+-------------+
 * |   Ingress   |   Egress    |   |   Ingress   |   Egress    |
 * |-------------|-------------|   |-------------|-------------|
 * |             |  \       /  |   |             |  REMOTE TX  | prio 1
 * |             |   \     /   |   |             |   \     /   | prio 2
 * |  EXPLICIT   |    \   /    |   |  EXPLICIT   |    \   /    |   .
 * |             |     \ /     |   |             |     \ /     |   .
 * |    RULES    |      X      |   |    RULES    |      X      |   .
 * |      .      |     / \     |   |      .      |     / \     |   .
 * |      .      |    /   \    |   |      .      |    /   \    |   .
 * |      .      |   /     \   |   |      .      |   /     \   |   .
 * |      .      |  /       \  |   |      .      |  /       \  |   .
 *
 *      ....           ....           ....           ....
 *
 * |      .      |  \       /  |   |      .      |  \       /  |   .
 * |      .      |   \     /   |   |      .      |   \     /   |   .
 * |             |    \   /    |   |             |    \   /    |
 * |  LOCAL_MAC  |     \ /     |   |    \   /    |     \ /     | last prio - 5
 * |   PROMISC   |      X      |   |     \ /     |      X      | last prio - 4
 * |   ALLMULTI  |     / \     |   |      X      |     / \     | last prio - 3
 * |  BROADCAST  |    /   \    |   |     / \     |    /   \    | last prio - 2
 * | BROADCASTV6 |   /     \   |   |    /   \    |   /     \   | last prio - 1
 * |     xx      |  /       \  |   |   ISOLATE   |  /       \  | last prio
 * +-------------+-------------+   +-------------+-------------+
 *
 * The implicit flow rules are stored in a list, with the last two mandatorily
 * being the ISOLATE and REMOTE_TX rules, e.g.:
 *
 * LOCAL_MAC -> BROADCAST -> BROADCASTV6 -> REMOTE_TX -> ISOLATE -> NULL
 *
 * That enables tap_flow_isolate() to remove implicit rules by popping the list
 * head and removing entries as long as they apply to the remote netdevice. The
 * implicit rule for TX redirection is not removed, as isolate concerns only
 * incoming traffic.
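 *
 * For instance, once isolation is enabled, the rules applying to the remote
 * netdevice are popped one by one and only the local ones typically remain:
 *
 * REMOTE_TX -> ISOLATE -> NULL
 *
 * (illustrative; the exact content depends on which implicit rules were
 * installed, see tap_flow_isolate() below)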
383 */ 384 385 static struct remote_rule implicit_rte_flows[TAP_REMOTE_MAX_IDX] = { 386 [TAP_REMOTE_LOCAL_MAC] = { 387 .attr = { 388 .group = MAX_GROUP, 389 .priority = PRIORITY_MASK - TAP_REMOTE_LOCAL_MAC, 390 .ingress = 1, 391 }, 392 .items[0] = { 393 .type = RTE_FLOW_ITEM_TYPE_ETH, 394 .mask = &(const struct rte_flow_item_eth){ 395 .hdr.dst_addr.addr_bytes = "\xff\xff\xff\xff\xff\xff", 396 }, 397 }, 398 .items[1] = { 399 .type = RTE_FLOW_ITEM_TYPE_END, 400 }, 401 .mirred = TCA_EGRESS_REDIR, 402 }, 403 [TAP_REMOTE_BROADCAST] = { 404 .attr = { 405 .group = MAX_GROUP, 406 .priority = PRIORITY_MASK - TAP_REMOTE_BROADCAST, 407 .ingress = 1, 408 }, 409 .items[0] = { 410 .type = RTE_FLOW_ITEM_TYPE_ETH, 411 .mask = &(const struct rte_flow_item_eth){ 412 .hdr.dst_addr.addr_bytes = "\xff\xff\xff\xff\xff\xff", 413 }, 414 .spec = &(const struct rte_flow_item_eth){ 415 .hdr.dst_addr.addr_bytes = "\xff\xff\xff\xff\xff\xff", 416 }, 417 }, 418 .items[1] = { 419 .type = RTE_FLOW_ITEM_TYPE_END, 420 }, 421 .mirred = TCA_EGRESS_MIRROR, 422 }, 423 [TAP_REMOTE_BROADCASTV6] = { 424 .attr = { 425 .group = MAX_GROUP, 426 .priority = PRIORITY_MASK - TAP_REMOTE_BROADCASTV6, 427 .ingress = 1, 428 }, 429 .items[0] = { 430 .type = RTE_FLOW_ITEM_TYPE_ETH, 431 .mask = &(const struct rte_flow_item_eth){ 432 .hdr.dst_addr.addr_bytes = "\x33\x33\x00\x00\x00\x00", 433 }, 434 .spec = &(const struct rte_flow_item_eth){ 435 .hdr.dst_addr.addr_bytes = "\x33\x33\x00\x00\x00\x00", 436 }, 437 }, 438 .items[1] = { 439 .type = RTE_FLOW_ITEM_TYPE_END, 440 }, 441 .mirred = TCA_EGRESS_MIRROR, 442 }, 443 [TAP_REMOTE_PROMISC] = { 444 .attr = { 445 .group = MAX_GROUP, 446 .priority = PRIORITY_MASK - TAP_REMOTE_PROMISC, 447 .ingress = 1, 448 }, 449 .items[0] = { 450 .type = RTE_FLOW_ITEM_TYPE_VOID, 451 }, 452 .items[1] = { 453 .type = RTE_FLOW_ITEM_TYPE_END, 454 }, 455 .mirred = TCA_EGRESS_MIRROR, 456 }, 457 [TAP_REMOTE_ALLMULTI] = { 458 .attr = { 459 .group = MAX_GROUP, 460 .priority = PRIORITY_MASK - TAP_REMOTE_ALLMULTI, 461 .ingress = 1, 462 }, 463 .items[0] = { 464 .type = RTE_FLOW_ITEM_TYPE_ETH, 465 .mask = &(const struct rte_flow_item_eth){ 466 .hdr.dst_addr.addr_bytes = "\x01\x00\x00\x00\x00\x00", 467 }, 468 .spec = &(const struct rte_flow_item_eth){ 469 .hdr.dst_addr.addr_bytes = "\x01\x00\x00\x00\x00\x00", 470 }, 471 }, 472 .items[1] = { 473 .type = RTE_FLOW_ITEM_TYPE_END, 474 }, 475 .mirred = TCA_EGRESS_MIRROR, 476 }, 477 [TAP_REMOTE_TX] = { 478 .attr = { 479 .group = 0, 480 .priority = TAP_REMOTE_TX, 481 .egress = 1, 482 }, 483 .items[0] = { 484 .type = RTE_FLOW_ITEM_TYPE_VOID, 485 }, 486 .items[1] = { 487 .type = RTE_FLOW_ITEM_TYPE_END, 488 }, 489 .mirred = TCA_EGRESS_MIRROR, 490 }, 491 [TAP_ISOLATE] = { 492 .attr = { 493 .group = MAX_GROUP, 494 .priority = PRIORITY_MASK - TAP_ISOLATE, 495 .ingress = 1, 496 }, 497 .items[0] = { 498 .type = RTE_FLOW_ITEM_TYPE_VOID, 499 }, 500 .items[1] = { 501 .type = RTE_FLOW_ITEM_TYPE_END, 502 }, 503 }, 504 }; 505 506 /** 507 * Make as much checks as possible on an Ethernet item, and if a flow is 508 * provided, fill it appropriately with Ethernet info. 509 * 510 * @param[in] item 511 * Item specification. 512 * @param[in, out] data 513 * Additional data structure to tell next layers we've been here. 514 * 515 * @return 516 * 0 if checks are alright, -1 otherwise. 
517 */ 518 static int 519 tap_flow_create_eth(const struct rte_flow_item *item, void *data) 520 { 521 struct convert_data *info = (struct convert_data *)data; 522 const struct rte_flow_item_eth *spec = item->spec; 523 const struct rte_flow_item_eth *mask = item->mask; 524 struct rte_flow *flow = info->flow; 525 struct nlmsg *msg; 526 527 /* use default mask if none provided */ 528 if (!mask) 529 mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask; 530 /* TC does not support eth_type masking. Only accept if exact match. */ 531 if (mask->hdr.ether_type && mask->hdr.ether_type != 0xffff) 532 return -1; 533 if (!spec) 534 return 0; 535 /* store eth_type for consistency if ipv4/6 pattern item comes next */ 536 if (spec->hdr.ether_type & mask->hdr.ether_type) 537 info->eth_type = spec->hdr.ether_type; 538 if (!flow) 539 return 0; 540 msg = &flow->msg; 541 if (!rte_is_zero_ether_addr(&mask->hdr.dst_addr)) { 542 tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, 543 RTE_ETHER_ADDR_LEN, 544 &spec->hdr.dst_addr.addr_bytes); 545 tap_nlattr_add(&msg->nh, 546 TCA_FLOWER_KEY_ETH_DST_MASK, RTE_ETHER_ADDR_LEN, 547 &mask->hdr.dst_addr.addr_bytes); 548 } 549 if (!rte_is_zero_ether_addr(&mask->hdr.src_addr)) { 550 tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, 551 RTE_ETHER_ADDR_LEN, 552 &spec->hdr.src_addr.addr_bytes); 553 tap_nlattr_add(&msg->nh, 554 TCA_FLOWER_KEY_ETH_SRC_MASK, RTE_ETHER_ADDR_LEN, 555 &mask->hdr.src_addr.addr_bytes); 556 } 557 return 0; 558 } 559 560 /** 561 * Make as much checks as possible on a VLAN item, and if a flow is provided, 562 * fill it appropriately with VLAN info. 563 * 564 * @param[in] item 565 * Item specification. 566 * @param[in, out] data 567 * Additional data structure to tell next layers we've been here. 568 * 569 * @return 570 * 0 if checks are alright, -1 otherwise. 571 */ 572 static int 573 tap_flow_create_vlan(const struct rte_flow_item *item, void *data) 574 { 575 struct convert_data *info = (struct convert_data *)data; 576 const struct rte_flow_item_vlan *spec = item->spec; 577 const struct rte_flow_item_vlan *mask = item->mask; 578 struct rte_flow *flow = info->flow; 579 struct nlmsg *msg; 580 581 /* use default mask if none provided */ 582 if (!mask) 583 mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask; 584 /* Outer TPID cannot be matched. */ 585 if (info->eth_type) 586 return -1; 587 /* Double-tagging not supported. */ 588 if (info->vlan) 589 return -1; 590 info->vlan = 1; 591 if (mask->hdr.eth_proto) { 592 /* TC does not support partial eth_type masking */ 593 if (mask->hdr.eth_proto != RTE_BE16(0xffff)) 594 return -1; 595 info->eth_type = spec->hdr.eth_proto; 596 } 597 if (!flow) 598 return 0; 599 msg = &flow->msg; 600 msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q)); 601 #define VLAN_PRIO(tci) ((tci) >> 13) 602 #define VLAN_ID(tci) ((tci) & 0xfff) 603 if (!spec) 604 return 0; 605 if (spec->hdr.vlan_tci) { 606 uint16_t tci = ntohs(spec->hdr.vlan_tci) & mask->hdr.vlan_tci; 607 uint16_t prio = VLAN_PRIO(tci); 608 uint8_t vid = VLAN_ID(tci); 609 610 if (prio) 611 tap_nlattr_add8(&msg->nh, 612 TCA_FLOWER_KEY_VLAN_PRIO, prio); 613 if (vid) 614 tap_nlattr_add16(&msg->nh, 615 TCA_FLOWER_KEY_VLAN_ID, vid); 616 } 617 return 0; 618 } 619 620 /** 621 * Make as much checks as possible on an IPv4 item, and if a flow is provided, 622 * fill it appropriately with IPv4 info. 623 * 624 * @param[in] item 625 * Item specification. 626 * @param[in, out] data 627 * Additional data structure to tell next layers we've been here. 
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_ipv4 *spec = item->spec;
	const struct rte_flow_item_ipv4 *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
	/* check that previous eth type is compatible with ipv4 */
	if (info->eth_type && info->eth_type != htons(ETH_P_IP))
		return -1;
	/* store ip_proto for consistency if udp/tcp pattern item comes next */
	if (spec)
		info->ip_proto = spec->hdr.next_proto_id;
	if (!flow)
		return 0;
	msg = &flow->msg;
	if (!info->eth_type)
		info->eth_type = htons(ETH_P_IP);
	if (!spec)
		return 0;
	if (mask->hdr.dst_addr) {
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST,
				 spec->hdr.dst_addr);
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
				 mask->hdr.dst_addr);
	}
	if (mask->hdr.src_addr) {
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC,
				 spec->hdr.src_addr);
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
				 mask->hdr.src_addr);
	}
	if (spec->hdr.next_proto_id)
		tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO,
				spec->hdr.next_proto_id);
	return 0;
}

/**
 * Make as many checks as possible on an IPv6 item, and if a flow is provided,
 * fill it appropriately with IPv6 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
686 */ 687 static int 688 tap_flow_create_ipv6(const struct rte_flow_item *item, void *data) 689 { 690 struct convert_data *info = (struct convert_data *)data; 691 const struct rte_flow_item_ipv6 *spec = item->spec; 692 const struct rte_flow_item_ipv6 *mask = item->mask; 693 struct rte_flow *flow = info->flow; 694 uint8_t empty_addr[16] = { 0 }; 695 struct nlmsg *msg; 696 697 /* use default mask if none provided */ 698 if (!mask) 699 mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask; 700 /* check that previous eth type is compatible with ipv6 */ 701 if (info->eth_type && info->eth_type != htons(ETH_P_IPV6)) 702 return -1; 703 /* store ip_proto for consistency if udp/tcp pattern item comes next */ 704 if (spec) 705 info->ip_proto = spec->hdr.proto; 706 if (!flow) 707 return 0; 708 msg = &flow->msg; 709 if (!info->eth_type) 710 info->eth_type = htons(ETH_P_IPV6); 711 if (!spec) 712 return 0; 713 if (memcmp(mask->hdr.dst_addr, empty_addr, 16)) { 714 tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST, 715 sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr); 716 tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST_MASK, 717 sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr); 718 } 719 if (memcmp(mask->hdr.src_addr, empty_addr, 16)) { 720 tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC, 721 sizeof(spec->hdr.src_addr), &spec->hdr.src_addr); 722 tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK, 723 sizeof(mask->hdr.src_addr), &mask->hdr.src_addr); 724 } 725 if (spec->hdr.proto) 726 tap_nlattr_add8(&msg->nh, 727 TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto); 728 return 0; 729 } 730 731 /** 732 * Make as much checks as possible on a UDP item, and if a flow is provided, 733 * fill it appropriately with UDP info. 734 * 735 * @param[in] item 736 * Item specification. 737 * @param[in, out] data 738 * Additional data structure to tell next layers we've been here. 739 * 740 * @return 741 * 0 if checks are alright, -1 otherwise. 742 */ 743 static int 744 tap_flow_create_udp(const struct rte_flow_item *item, void *data) 745 { 746 struct convert_data *info = (struct convert_data *)data; 747 const struct rte_flow_item_udp *spec = item->spec; 748 const struct rte_flow_item_udp *mask = item->mask; 749 struct rte_flow *flow = info->flow; 750 struct nlmsg *msg; 751 752 /* use default mask if none provided */ 753 if (!mask) 754 mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask; 755 /* check that previous ip_proto is compatible with udp */ 756 if (info->ip_proto && info->ip_proto != IPPROTO_UDP) 757 return -1; 758 /* TC does not support UDP port masking. Only accept if exact match. */ 759 if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) || 760 (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff)) 761 return -1; 762 if (!flow) 763 return 0; 764 msg = &flow->msg; 765 tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP); 766 if (!spec) 767 return 0; 768 if (mask->hdr.dst_port) 769 tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_DST, 770 spec->hdr.dst_port); 771 if (mask->hdr.src_port) 772 tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_SRC, 773 spec->hdr.src_port); 774 return 0; 775 } 776 777 /** 778 * Make as much checks as possible on a TCP item, and if a flow is provided, 779 * fill it appropriately with TCP info. 780 * 781 * @param[in] item 782 * Item specification. 783 * @param[in, out] data 784 * Additional data structure to tell next layers we've been here. 785 * 786 * @return 787 * 0 if checks are alright, -1 otherwise. 
788 */ 789 static int 790 tap_flow_create_tcp(const struct rte_flow_item *item, void *data) 791 { 792 struct convert_data *info = (struct convert_data *)data; 793 const struct rte_flow_item_tcp *spec = item->spec; 794 const struct rte_flow_item_tcp *mask = item->mask; 795 struct rte_flow *flow = info->flow; 796 struct nlmsg *msg; 797 798 /* use default mask if none provided */ 799 if (!mask) 800 mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask; 801 /* check that previous ip_proto is compatible with tcp */ 802 if (info->ip_proto && info->ip_proto != IPPROTO_TCP) 803 return -1; 804 /* TC does not support TCP port masking. Only accept if exact match. */ 805 if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) || 806 (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff)) 807 return -1; 808 if (!flow) 809 return 0; 810 msg = &flow->msg; 811 tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP); 812 if (!spec) 813 return 0; 814 if (mask->hdr.dst_port) 815 tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_DST, 816 spec->hdr.dst_port); 817 if (mask->hdr.src_port) 818 tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_SRC, 819 spec->hdr.src_port); 820 return 0; 821 } 822 823 /** 824 * Check support for a given item. 825 * 826 * @param[in] item 827 * Item specification. 828 * @param size 829 * Bit-Mask size in bytes. 830 * @param[in] supported_mask 831 * Bit-mask covering supported fields to compare with spec, last and mask in 832 * \item. 833 * @param[in] default_mask 834 * Bit-mask default mask if none is provided in \item. 835 * 836 * @return 837 * 0 on success. 838 */ 839 static int 840 tap_flow_item_validate(const struct rte_flow_item *item, 841 unsigned int size, 842 const uint8_t *supported_mask, 843 const uint8_t *default_mask) 844 { 845 int ret = 0; 846 847 /* An empty layer is allowed, as long as all fields are NULL */ 848 if (!item->spec && (item->mask || item->last)) 849 return -1; 850 /* Is the item spec compatible with what the NIC supports? */ 851 if (item->spec && !item->mask) { 852 unsigned int i; 853 const uint8_t *spec = item->spec; 854 855 for (i = 0; i < size; ++i) 856 if ((spec[i] | supported_mask[i]) != supported_mask[i]) 857 return -1; 858 /* Is the default mask compatible with what the NIC supports? */ 859 for (i = 0; i < size; i++) 860 if ((default_mask[i] | supported_mask[i]) != 861 supported_mask[i]) 862 return -1; 863 } 864 /* Is the item last compatible with what the NIC supports? */ 865 if (item->last && !item->mask) { 866 unsigned int i; 867 const uint8_t *spec = item->last; 868 869 for (i = 0; i < size; ++i) 870 if ((spec[i] | supported_mask[i]) != supported_mask[i]) 871 return -1; 872 } 873 /* Is the item mask compatible with what the NIC supports? */ 874 if (item->mask) { 875 unsigned int i; 876 const uint8_t *spec = item->mask; 877 878 for (i = 0; i < size; ++i) 879 if ((spec[i] | supported_mask[i]) != supported_mask[i]) 880 return -1; 881 } 882 /** 883 * Once masked, Are item spec and item last equal? 884 * TC does not support range so anything else is invalid. 
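 * For example, with an IPv4 destination of spec = 10.0.0.1 and last = 10.0.0.5,
 * a 255.255.255.0 mask is accepted (both sides mask to 10.0.0.0), whereas a
 * 255.255.255.255 mask makes the memcmp() below return non-zero and the item
 * is rejected.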
885 */ 886 if (item->spec && item->last) { 887 uint8_t spec[size]; 888 uint8_t last[size]; 889 const uint8_t *apply = default_mask; 890 unsigned int i; 891 892 if (item->mask) 893 apply = item->mask; 894 for (i = 0; i < size; ++i) { 895 spec[i] = ((const uint8_t *)item->spec)[i] & apply[i]; 896 last[i] = ((const uint8_t *)item->last)[i] & apply[i]; 897 } 898 ret = memcmp(spec, last, size); 899 } 900 return ret; 901 } 902 903 /** 904 * Configure the kernel with a TC action and its configured parameters 905 * Handled actions: "gact", "mirred", "skbedit", "bpf" 906 * 907 * @param[in] flow 908 * Pointer to rte flow containing the netlink message 909 * 910 * @param[in, out] act_index 911 * Pointer to action sequence number in the TC command 912 * 913 * @param[in] adata 914 * Pointer to struct holding the action parameters 915 * 916 * @return 917 * -1 on failure, 0 on success 918 */ 919 static int 920 add_action(struct rte_flow *flow, size_t *act_index, struct action_data *adata) 921 { 922 struct nlmsg *msg = &flow->msg; 923 924 if (tap_nlattr_nested_start(msg, (*act_index)++) < 0) 925 return -1; 926 927 tap_nlattr_add(&msg->nh, TCA_ACT_KIND, 928 strlen(adata->id) + 1, adata->id); 929 if (tap_nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0) 930 return -1; 931 if (strcmp("gact", adata->id) == 0) { 932 tap_nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(adata->gact), 933 &adata->gact); 934 } else if (strcmp("mirred", adata->id) == 0) { 935 if (adata->mirred.eaction == TCA_EGRESS_MIRROR) 936 adata->mirred.action = TC_ACT_PIPE; 937 else /* REDIRECT */ 938 adata->mirred.action = TC_ACT_STOLEN; 939 tap_nlattr_add(&msg->nh, TCA_MIRRED_PARMS, 940 sizeof(adata->mirred), 941 &adata->mirred); 942 } else if (strcmp("skbedit", adata->id) == 0) { 943 tap_nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS, 944 sizeof(adata->skbedit.skbedit), 945 &adata->skbedit.skbedit); 946 tap_nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING, 947 adata->skbedit.queue); 948 } else if (strcmp("bpf", adata->id) == 0) { 949 tap_nlattr_add32(&msg->nh, TCA_ACT_BPF_FD, adata->bpf.bpf_fd); 950 tap_nlattr_add(&msg->nh, TCA_ACT_BPF_NAME, 951 strlen(adata->bpf.annotation) + 1, 952 adata->bpf.annotation); 953 tap_nlattr_add(&msg->nh, TCA_ACT_BPF_PARMS, 954 sizeof(adata->bpf.bpf), 955 &adata->bpf.bpf); 956 } else { 957 return -1; 958 } 959 tap_nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */ 960 tap_nlattr_nested_finish(msg); /* nested act_index */ 961 return 0; 962 } 963 964 /** 965 * Helper function to send a series of TC actions to the kernel 966 * 967 * @param[in] flow 968 * Pointer to rte flow containing the netlink message 969 * 970 * @param[in] nb_actions 971 * Number of actions in an array of action structs 972 * 973 * @param[in] data 974 * Pointer to an array of action structs 975 * 976 * @param[in] classifier_actions 977 * The classifier on behave of which the actions are configured 978 * 979 * @return 980 * -1 on failure, 0 on success 981 */ 982 static int 983 add_actions(struct rte_flow *flow, int nb_actions, struct action_data *data, 984 int classifier_action) 985 { 986 struct nlmsg *msg = &flow->msg; 987 size_t act_index = 1; 988 int i; 989 990 if (tap_nlattr_nested_start(msg, classifier_action) < 0) 991 return -1; 992 for (i = 0; i < nb_actions; i++) 993 if (add_action(flow, &act_index, data + i) < 0) 994 return -1; 995 tap_nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */ 996 return 0; 997 } 998 999 /** 1000 * Validate a flow supported by TC. 1001 * If flow param is not NULL, then also fill the netlink message inside. 
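 * For example, a rule matching ETH / IPV4 with a QUEUE action typically ends
 * up as a "flower" filter carrying the matching keys as netlink attributes,
 * plus a "skbedit" action that sets the queue mapping (see the item and
 * action handling below).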
1002 * 1003 * @param pmd 1004 * Pointer to private structure. 1005 * @param[in] attr 1006 * Flow rule attributes. 1007 * @param[in] pattern 1008 * Pattern specification (list terminated by the END pattern item). 1009 * @param[in] actions 1010 * Associated actions (list terminated by the END action). 1011 * @param[out] error 1012 * Perform verbose error reporting if not NULL. 1013 * @param[in, out] flow 1014 * Flow structure to update. 1015 * @param[in] mirred 1016 * If set to TCA_EGRESS_REDIR, provided actions will be replaced with a 1017 * redirection to the tap netdevice, and the TC rule will be configured 1018 * on the remote netdevice in pmd. 1019 * If set to TCA_EGRESS_MIRROR, provided actions will be replaced with a 1020 * mirroring to the tap netdevice, and the TC rule will be configured 1021 * on the remote netdevice in pmd. Matching packets will thus be duplicated. 1022 * If set to 0, the standard behavior is to be used: set correct actions for 1023 * the TC rule, and apply it on the tap netdevice. 1024 * 1025 * @return 1026 * 0 on success, a negative errno value otherwise and rte_errno is set. 1027 */ 1028 static int 1029 priv_flow_process(struct pmd_internals *pmd, 1030 const struct rte_flow_attr *attr, 1031 const struct rte_flow_item items[], 1032 const struct rte_flow_action actions[], 1033 struct rte_flow_error *error, 1034 struct rte_flow *flow, 1035 int mirred) 1036 { 1037 const struct tap_flow_items *cur_item = tap_flow_items; 1038 struct convert_data data = { 1039 .eth_type = 0, 1040 .ip_proto = 0, 1041 .flow = flow, 1042 }; 1043 int action = 0; /* Only one action authorized for now */ 1044 1045 if (attr->transfer) { 1046 rte_flow_error_set( 1047 error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER, 1048 NULL, "transfer is not supported"); 1049 return -rte_errno; 1050 } 1051 if (attr->group > MAX_GROUP) { 1052 rte_flow_error_set( 1053 error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP, 1054 NULL, "group value too big: cannot exceed 15"); 1055 return -rte_errno; 1056 } 1057 if (attr->priority > MAX_PRIORITY) { 1058 rte_flow_error_set( 1059 error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, 1060 NULL, "priority value too big"); 1061 return -rte_errno; 1062 } else if (flow) { 1063 uint16_t group = attr->group << GROUP_SHIFT; 1064 uint16_t prio = group | (attr->priority + 1065 RSS_PRIORITY_OFFSET + PRIORITY_OFFSET); 1066 flow->msg.t.tcm_info = TC_H_MAKE(prio << 16, 1067 flow->msg.t.tcm_info); 1068 } 1069 if (flow) { 1070 if (mirred) { 1071 /* 1072 * If attr->ingress, the rule applies on remote ingress 1073 * to match incoming packets 1074 * If attr->egress, the rule applies on tap ingress (as 1075 * seen from the kernel) to deal with packets going out 1076 * from the DPDK app. 1077 */ 1078 flow->msg.t.tcm_parent = TC_H_MAKE(TC_H_INGRESS, 0); 1079 } else { 1080 /* Standard rule on tap egress (kernel standpoint). 
 */
			flow->msg.t.tcm_parent =
				TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
		}
		/* use flower filter type */
		tap_nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
		if (tap_nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0) {
			rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_ACTION,
					   actions, "could not allocate netlink msg");
			goto exit_return_error;
		}
	}
	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
		const struct tap_flow_items *token = NULL;
		unsigned int i;
		int err = 0;

		if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
			continue;
		for (i = 0;
		     cur_item->items &&
		     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
		     ++i) {
			if (cur_item->items[i] == items->type) {
				token = &tap_flow_items[items->type];
				break;
			}
		}
		if (!token)
			goto exit_item_not_supported;
		cur_item = token;
		err = tap_flow_item_validate(
			items, cur_item->mask_sz,
			(const uint8_t *)cur_item->mask,
			(const uint8_t *)cur_item->default_mask);
		if (err)
			goto exit_item_not_supported;
		if (flow && cur_item->convert) {
			err = cur_item->convert(items, &data);
			if (err)
				goto exit_item_not_supported;
		}
	}
	if (flow) {
		if (data.vlan) {
			tap_nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
					 htons(ETH_P_8021Q));
			tap_nlattr_add16(&flow->msg.nh,
					 TCA_FLOWER_KEY_VLAN_ETH_TYPE,
					 data.eth_type ?
					 data.eth_type : htons(ETH_P_ALL));
		} else if (data.eth_type) {
			tap_nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
					 data.eth_type);
		}
	}
	if (mirred && flow) {
		struct action_data adata = {
			.id = "mirred",
			.mirred = {
				.eaction = mirred,
			},
		};

		/*
		 * If attr->egress && mirred, then this is a special
		 * case where the rule must be applied on the tap, to
		 * redirect packets coming from the DPDK App, out
		 * through the remote netdevice.
		 */
		adata.mirred.ifindex = attr->ingress ?
pmd->if_index : 1151 pmd->remote_if_index; 1152 if (mirred == TCA_EGRESS_MIRROR) 1153 adata.mirred.action = TC_ACT_PIPE; 1154 else 1155 adata.mirred.action = TC_ACT_STOLEN; 1156 if (add_actions(flow, 1, &adata, TCA_FLOWER_ACT) < 0) 1157 goto exit_action_not_supported; 1158 else 1159 goto end; 1160 } 1161 actions: 1162 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) { 1163 int err = 0; 1164 1165 if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) { 1166 continue; 1167 } else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) { 1168 if (action) 1169 goto exit_action_not_supported; 1170 action = 1; 1171 if (flow) { 1172 struct action_data adata = { 1173 .id = "gact", 1174 .gact = { 1175 .action = TC_ACT_SHOT, 1176 }, 1177 }; 1178 1179 err = add_actions(flow, 1, &adata, 1180 TCA_FLOWER_ACT); 1181 } 1182 } else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) { 1183 if (action) 1184 goto exit_action_not_supported; 1185 action = 1; 1186 if (flow) { 1187 struct action_data adata = { 1188 .id = "gact", 1189 .gact = { 1190 /* continue */ 1191 .action = TC_ACT_UNSPEC, 1192 }, 1193 }; 1194 1195 err = add_actions(flow, 1, &adata, 1196 TCA_FLOWER_ACT); 1197 } 1198 } else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) { 1199 const struct rte_flow_action_queue *queue = 1200 (const struct rte_flow_action_queue *) 1201 actions->conf; 1202 1203 if (action) 1204 goto exit_action_not_supported; 1205 action = 1; 1206 if (queue->index >= pmd->dev->data->nb_rx_queues) { 1207 rte_flow_error_set(error, ERANGE, 1208 RTE_FLOW_ERROR_TYPE_ACTION, actions, 1209 "queue index out of range"); 1210 goto exit_return_error; 1211 } 1212 if (flow) { 1213 struct action_data adata = { 1214 .id = "skbedit", 1215 .skbedit = { 1216 .skbedit = { 1217 .action = TC_ACT_PIPE, 1218 }, 1219 .queue = queue->index, 1220 }, 1221 }; 1222 1223 err = add_actions(flow, 1, &adata, 1224 TCA_FLOWER_ACT); 1225 } 1226 } else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) { 1227 const struct rte_flow_action_rss *rss = 1228 (const struct rte_flow_action_rss *) 1229 actions->conf; 1230 1231 if (action++) 1232 goto exit_action_not_supported; 1233 1234 if (!pmd->rss_enabled) { 1235 err = rss_enable(pmd, attr, error); 1236 if (err) 1237 goto exit_return_error; 1238 } 1239 if (flow) 1240 err = rss_add_actions(flow, pmd, rss, error); 1241 } else { 1242 goto exit_action_not_supported; 1243 } 1244 if (err) 1245 goto exit_return_error; 1246 } 1247 /* When fate is unknown, drop traffic. */ 1248 if (!action) { 1249 static const struct rte_flow_action drop[] = { 1250 { .type = RTE_FLOW_ACTION_TYPE_DROP, }, 1251 { .type = RTE_FLOW_ACTION_TYPE_END, }, 1252 }; 1253 1254 actions = drop; 1255 goto actions; 1256 } 1257 end: 1258 if (flow) 1259 tap_nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */ 1260 return 0; 1261 exit_item_not_supported: 1262 rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, 1263 items, "item not supported"); 1264 return -rte_errno; 1265 exit_action_not_supported: 1266 rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, 1267 actions, "action not supported"); 1268 exit_return_error: 1269 return -rte_errno; 1270 } 1271 1272 1273 1274 /** 1275 * Validate a flow. 
1276 * 1277 * @see rte_flow_validate() 1278 * @see rte_flow_ops 1279 */ 1280 static int 1281 tap_flow_validate(struct rte_eth_dev *dev, 1282 const struct rte_flow_attr *attr, 1283 const struct rte_flow_item items[], 1284 const struct rte_flow_action actions[], 1285 struct rte_flow_error *error) 1286 { 1287 struct pmd_internals *pmd = dev->data->dev_private; 1288 1289 return priv_flow_process(pmd, attr, items, actions, error, NULL, 0); 1290 } 1291 1292 /** 1293 * Set a unique handle in a flow. 1294 * 1295 * The kernel supports TC rules with equal priority, as long as they use the 1296 * same matching fields (e.g.: dst mac and ipv4) with different values (and 1297 * full mask to ensure no collision is possible). 1298 * In those rules, the handle (uint32_t) is the part that would identify 1299 * specifically each rule. 1300 * 1301 * Use jhash of the flow pointer to make a unique handle. 1302 * 1303 * @param[in, out] flow 1304 * The flow that needs its handle set. 1305 */ 1306 static void 1307 tap_flow_set_handle(struct rte_flow *flow) 1308 { 1309 union { 1310 struct rte_flow *flow; 1311 uint32_t words[sizeof(flow) / sizeof(uint32_t)]; 1312 } tmp = { 1313 .flow = flow, 1314 }; 1315 uint32_t handle; 1316 static uint64_t hash_seed; 1317 1318 if (hash_seed == 0) 1319 hash_seed = rte_rand(); 1320 1321 handle = rte_jhash_32b(tmp.words, sizeof(flow) / sizeof(uint32_t), hash_seed); 1322 1323 /* must be at least 1 to avoid letting the kernel choose one for us */ 1324 if (!handle) 1325 handle = 1; 1326 flow->msg.t.tcm_handle = handle; 1327 } 1328 1329 /** 1330 * Free the flow opened file descriptors and allocated memory 1331 * 1332 * @param[in] flow 1333 * Pointer to the flow to free 1334 * 1335 */ 1336 static void 1337 tap_flow_free(struct pmd_internals *pmd, struct rte_flow *flow) 1338 { 1339 int i; 1340 1341 if (!flow) 1342 return; 1343 1344 if (pmd->rss_enabled) { 1345 /* Close flow BPF file descriptors */ 1346 for (i = 0; i < SEC_MAX; i++) 1347 if (flow->bpf_fd[i] != 0) { 1348 close(flow->bpf_fd[i]); 1349 flow->bpf_fd[i] = 0; 1350 } 1351 1352 /* Release the map key for this RSS rule */ 1353 bpf_rss_key(KEY_CMD_RELEASE, &flow->key_idx); 1354 flow->key_idx = 0; 1355 } 1356 1357 /* Free flow allocated memory */ 1358 rte_free(flow); 1359 } 1360 1361 /** 1362 * Create a flow. 1363 * 1364 * @see rte_flow_create() 1365 * @see rte_flow_ops 1366 */ 1367 static struct rte_flow * 1368 tap_flow_create(struct rte_eth_dev *dev, 1369 const struct rte_flow_attr *attr, 1370 const struct rte_flow_item items[], 1371 const struct rte_flow_action actions[], 1372 struct rte_flow_error *error) 1373 { 1374 struct pmd_internals *pmd = dev->data->dev_private; 1375 struct rte_flow *remote_flow = NULL; 1376 struct rte_flow *flow = NULL; 1377 struct nlmsg *msg = NULL; 1378 int err; 1379 1380 if (!pmd->if_index) { 1381 rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, 1382 NULL, 1383 "can't create rule, ifindex not found"); 1384 goto fail; 1385 } 1386 /* 1387 * No rules configured through standard rte_flow should be set on the 1388 * priorities used by implicit rules. 
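 * (In practice this reserves priorities above MAX_PRIORITY - TAP_REMOTE_MAX_IDX
 * in group MAX_GROUP, which is what the check below enforces.)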
1389 */ 1390 if ((attr->group == MAX_GROUP) && 1391 attr->priority > (MAX_PRIORITY - TAP_REMOTE_MAX_IDX)) { 1392 rte_flow_error_set( 1393 error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, 1394 NULL, "priority value too big"); 1395 goto fail; 1396 } 1397 flow = rte_zmalloc(__func__, sizeof(struct rte_flow), 0); 1398 if (!flow) { 1399 rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, 1400 NULL, "cannot allocate memory for rte_flow"); 1401 goto fail; 1402 } 1403 msg = &flow->msg; 1404 tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER, 1405 NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE); 1406 msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL)); 1407 tap_flow_set_handle(flow); 1408 if (priv_flow_process(pmd, attr, items, actions, error, flow, 0)) 1409 goto fail; 1410 err = tap_nl_send(pmd->nlsk_fd, &msg->nh); 1411 if (err < 0) { 1412 rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, 1413 NULL, "couldn't send request to kernel"); 1414 goto fail; 1415 } 1416 err = tap_nl_recv_ack(pmd->nlsk_fd); 1417 if (err < 0) { 1418 TAP_LOG(ERR, 1419 "Kernel refused TC filter rule creation (%d): %s", 1420 errno, strerror(errno)); 1421 rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE, 1422 NULL, 1423 "overlapping rules or Kernel too old for flower support"); 1424 goto fail; 1425 } 1426 LIST_INSERT_HEAD(&pmd->flows, flow, next); 1427 /** 1428 * If a remote device is configured, a TC rule with identical items for 1429 * matching must be set on that device, with a single action: redirect 1430 * to the local pmd->if_index. 1431 */ 1432 if (pmd->remote_if_index) { 1433 remote_flow = rte_zmalloc(__func__, sizeof(struct rte_flow), 0); 1434 if (!remote_flow) { 1435 rte_flow_error_set( 1436 error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, 1437 "cannot allocate memory for rte_flow"); 1438 goto fail; 1439 } 1440 msg = &remote_flow->msg; 1441 /* set the rule if_index for the remote netdevice */ 1442 tc_init_msg( 1443 msg, pmd->remote_if_index, RTM_NEWTFILTER, 1444 NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE); 1445 msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL)); 1446 tap_flow_set_handle(remote_flow); 1447 if (priv_flow_process(pmd, attr, items, NULL, 1448 error, remote_flow, TCA_EGRESS_REDIR)) { 1449 rte_flow_error_set( 1450 error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, 1451 NULL, "rte flow rule validation failed"); 1452 goto fail; 1453 } 1454 err = tap_nl_send(pmd->nlsk_fd, &msg->nh); 1455 if (err < 0) { 1456 rte_flow_error_set( 1457 error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, 1458 NULL, "Failure sending nl request"); 1459 goto fail; 1460 } 1461 err = tap_nl_recv_ack(pmd->nlsk_fd); 1462 if (err < 0) { 1463 TAP_LOG(ERR, 1464 "Kernel refused TC filter rule creation (%d): %s", 1465 errno, strerror(errno)); 1466 rte_flow_error_set( 1467 error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, 1468 NULL, 1469 "overlapping rules or Kernel too old for flower support"); 1470 goto fail; 1471 } 1472 flow->remote_flow = remote_flow; 1473 } 1474 return flow; 1475 fail: 1476 rte_free(remote_flow); 1477 if (flow) 1478 tap_flow_free(pmd, flow); 1479 return NULL; 1480 } 1481 1482 /** 1483 * Destroy a flow using pointer to pmd_internal. 1484 * 1485 * @param[in, out] pmd 1486 * Pointer to private structure. 1487 * @param[in] flow 1488 * Pointer to the flow to destroy. 1489 * @param[in, out] error 1490 * Pointer to the flow error handler 1491 * 1492 * @return 0 if the flow could be destroyed, -1 otherwise. 
1493 */ 1494 static int 1495 tap_flow_destroy_pmd(struct pmd_internals *pmd, 1496 struct rte_flow *flow, 1497 struct rte_flow_error *error) 1498 { 1499 struct rte_flow *remote_flow = flow->remote_flow; 1500 int ret = 0; 1501 1502 LIST_REMOVE(flow, next); 1503 flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; 1504 flow->msg.nh.nlmsg_type = RTM_DELTFILTER; 1505 1506 ret = tap_nl_send(pmd->nlsk_fd, &flow->msg.nh); 1507 if (ret < 0) { 1508 rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, 1509 NULL, "couldn't send request to kernel"); 1510 goto end; 1511 } 1512 ret = tap_nl_recv_ack(pmd->nlsk_fd); 1513 /* If errno is ENOENT, the rule is already no longer in the kernel. */ 1514 if (ret < 0 && errno == ENOENT) 1515 ret = 0; 1516 if (ret < 0) { 1517 TAP_LOG(ERR, 1518 "Kernel refused TC filter rule deletion (%d): %s", 1519 errno, strerror(errno)); 1520 rte_flow_error_set( 1521 error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, 1522 "couldn't receive kernel ack to our request"); 1523 goto end; 1524 } 1525 1526 if (remote_flow) { 1527 remote_flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; 1528 remote_flow->msg.nh.nlmsg_type = RTM_DELTFILTER; 1529 1530 ret = tap_nl_send(pmd->nlsk_fd, &remote_flow->msg.nh); 1531 if (ret < 0) { 1532 rte_flow_error_set( 1533 error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, 1534 NULL, "Failure sending nl request"); 1535 goto end; 1536 } 1537 ret = tap_nl_recv_ack(pmd->nlsk_fd); 1538 if (ret < 0 && errno == ENOENT) 1539 ret = 0; 1540 if (ret < 0) { 1541 TAP_LOG(ERR, 1542 "Kernel refused TC filter rule deletion (%d): %s", 1543 errno, strerror(errno)); 1544 rte_flow_error_set( 1545 error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, 1546 NULL, "Failure trying to receive nl ack"); 1547 goto end; 1548 } 1549 } 1550 end: 1551 rte_free(remote_flow); 1552 tap_flow_free(pmd, flow); 1553 return ret; 1554 } 1555 1556 /** 1557 * Destroy a flow. 1558 * 1559 * @see rte_flow_destroy() 1560 * @see rte_flow_ops 1561 */ 1562 static int 1563 tap_flow_destroy(struct rte_eth_dev *dev, 1564 struct rte_flow *flow, 1565 struct rte_flow_error *error) 1566 { 1567 struct pmd_internals *pmd = dev->data->dev_private; 1568 1569 return tap_flow_destroy_pmd(pmd, flow, error); 1570 } 1571 1572 /** 1573 * Enable/disable flow isolation. 1574 * 1575 * @see rte_flow_isolate() 1576 * @see rte_flow_ops 1577 */ 1578 static int 1579 tap_flow_isolate(struct rte_eth_dev *dev, 1580 int set, 1581 struct rte_flow_error *error __rte_unused) 1582 { 1583 struct pmd_internals *pmd = dev->data->dev_private; 1584 struct pmd_process_private *process_private = dev->process_private; 1585 1586 /* normalize 'set' variable to contain 0 or 1 values */ 1587 if (set) 1588 set = 1; 1589 /* if already in the right isolation mode - nothing to do */ 1590 if ((set ^ pmd->flow_isolate) == 0) 1591 return 0; 1592 /* mark the isolation mode for tap_flow_implicit_create() */ 1593 pmd->flow_isolate = set; 1594 /* 1595 * If netdevice is there, setup appropriate flow rules immediately. 1596 * Otherwise it will be set when bringing up the netdevice (tun_alloc). 1597 */ 1598 if (!process_private->rxq_fds[0]) 1599 return 0; 1600 if (set) { 1601 struct rte_flow *remote_flow; 1602 1603 while (1) { 1604 remote_flow = LIST_FIRST(&pmd->implicit_flows); 1605 if (!remote_flow) 1606 break; 1607 /* 1608 * Remove all implicit rules on the remote. 1609 * Keep the local rule to redirect packets on TX. 1610 * Keep also the last implicit local rule: ISOLATE. 
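 * Both of those are installed with pmd->if_index as their TC ifindex,
 * which is what the check below relies on to stop popping.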
1611 */ 1612 if (remote_flow->msg.t.tcm_ifindex == pmd->if_index) 1613 break; 1614 if (tap_flow_destroy_pmd(pmd, remote_flow, NULL) < 0) 1615 goto error; 1616 } 1617 /* Switch the TC rule according to pmd->flow_isolate */ 1618 if (tap_flow_implicit_create(pmd, TAP_ISOLATE) == -1) 1619 goto error; 1620 } else { 1621 /* Switch the TC rule according to pmd->flow_isolate */ 1622 if (tap_flow_implicit_create(pmd, TAP_ISOLATE) == -1) 1623 goto error; 1624 if (!pmd->remote_if_index) 1625 return 0; 1626 if (tap_flow_implicit_create(pmd, TAP_REMOTE_TX) < 0) 1627 goto error; 1628 if (tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0) 1629 goto error; 1630 if (tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCAST) < 0) 1631 goto error; 1632 if (tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCASTV6) < 0) 1633 goto error; 1634 if (dev->data->promiscuous && 1635 tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC) < 0) 1636 goto error; 1637 if (dev->data->all_multicast && 1638 tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI) < 0) 1639 goto error; 1640 } 1641 return 0; 1642 error: 1643 pmd->flow_isolate = 0; 1644 return rte_flow_error_set( 1645 error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, 1646 "TC rule creation failed"); 1647 } 1648 1649 /** 1650 * Destroy all flows. 1651 * 1652 * @see rte_flow_flush() 1653 * @see rte_flow_ops 1654 */ 1655 int 1656 tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error) 1657 { 1658 struct pmd_internals *pmd = dev->data->dev_private; 1659 struct rte_flow *flow; 1660 1661 while (!LIST_EMPTY(&pmd->flows)) { 1662 flow = LIST_FIRST(&pmd->flows); 1663 if (tap_flow_destroy(dev, flow, error) < 0) 1664 return -1; 1665 } 1666 return 0; 1667 } 1668 1669 /** 1670 * Add an implicit flow rule on the remote device to make sure traffic gets to 1671 * the tap netdevice from there. 1672 * 1673 * @param pmd 1674 * Pointer to private structure. 1675 * @param[in] idx 1676 * The idx in the implicit_rte_flows array specifying which rule to apply. 1677 * 1678 * @return -1 if the rule couldn't be applied, 0 otherwise. 1679 */ 1680 int tap_flow_implicit_create(struct pmd_internals *pmd, 1681 enum implicit_rule_index idx) 1682 { 1683 uint16_t flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE; 1684 struct rte_flow_action *actions = implicit_rte_flows[idx].actions; 1685 struct rte_flow_action isolate_actions[2] = { 1686 [1] = { 1687 .type = RTE_FLOW_ACTION_TYPE_END, 1688 }, 1689 }; 1690 struct rte_flow_item *items = implicit_rte_flows[idx].items; 1691 struct rte_flow_attr *attr = &implicit_rte_flows[idx].attr; 1692 struct rte_flow_item_eth eth_local = { .hdr.ether_type = 0 }; 1693 unsigned int if_index = pmd->remote_if_index; 1694 struct rte_flow *remote_flow = NULL; 1695 struct nlmsg *msg = NULL; 1696 int err = 0; 1697 struct rte_flow_item items_local[2] = { 1698 [0] = { 1699 .type = items[0].type, 1700 .spec = ð_local, 1701 .mask = items[0].mask, 1702 }, 1703 [1] = { 1704 .type = items[1].type, 1705 } 1706 }; 1707 1708 remote_flow = rte_zmalloc(__func__, sizeof(struct rte_flow), 0); 1709 if (!remote_flow) { 1710 TAP_LOG(ERR, "Cannot allocate memory for rte_flow"); 1711 goto fail; 1712 } 1713 msg = &remote_flow->msg; 1714 if (idx == TAP_REMOTE_TX) { 1715 if_index = pmd->if_index; 1716 } else if (idx == TAP_ISOLATE) { 1717 if_index = pmd->if_index; 1718 /* Don't be exclusive for this rule, it can be changed later. */ 1719 flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE; 1720 isolate_actions[0].type = pmd->flow_isolate ? 
1721 RTE_FLOW_ACTION_TYPE_DROP : 1722 RTE_FLOW_ACTION_TYPE_PASSTHRU; 1723 actions = isolate_actions; 1724 } else if (idx == TAP_REMOTE_LOCAL_MAC) { 1725 /* 1726 * eth addr couldn't be set in implicit_rte_flows[] as it is not 1727 * known at compile time. 1728 */ 1729 memcpy(ð_local.hdr.dst_addr, &pmd->eth_addr, sizeof(pmd->eth_addr)); 1730 items = items_local; 1731 } 1732 tc_init_msg(msg, if_index, RTM_NEWTFILTER, flags); 1733 msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL)); 1734 /* 1735 * The ISOLATE rule is always present and must have a static handle, as 1736 * the action is changed whether the feature is enabled (DROP) or 1737 * disabled (PASSTHRU). 1738 * There is just one REMOTE_PROMISCUOUS rule in all cases. It should 1739 * have a static handle such that adding it twice will fail with EEXIST 1740 * with any kernel version. Remark: old kernels may falsely accept the 1741 * same REMOTE_PROMISCUOUS rules if they had different handles. 1742 */ 1743 if (idx == TAP_ISOLATE) 1744 remote_flow->msg.t.tcm_handle = ISOLATE_HANDLE; 1745 else if (idx == TAP_REMOTE_PROMISC) 1746 remote_flow->msg.t.tcm_handle = REMOTE_PROMISCUOUS_HANDLE; 1747 else 1748 tap_flow_set_handle(remote_flow); 1749 if (priv_flow_process(pmd, attr, items, actions, NULL, 1750 remote_flow, implicit_rte_flows[idx].mirred)) { 1751 TAP_LOG(ERR, "rte flow rule validation failed"); 1752 goto fail; 1753 } 1754 err = tap_nl_send(pmd->nlsk_fd, &msg->nh); 1755 if (err < 0) { 1756 TAP_LOG(ERR, "Failure sending nl request"); 1757 goto fail; 1758 } 1759 err = tap_nl_recv_ack(pmd->nlsk_fd); 1760 if (err < 0) { 1761 /* Silently ignore re-entering existing rule */ 1762 if (errno == EEXIST) 1763 goto success; 1764 TAP_LOG(ERR, 1765 "Kernel refused TC filter rule creation (%d): %s", 1766 errno, strerror(errno)); 1767 goto fail; 1768 } 1769 LIST_INSERT_HEAD(&pmd->implicit_flows, remote_flow, next); 1770 success: 1771 return 0; 1772 fail: 1773 rte_free(remote_flow); 1774 return -1; 1775 } 1776 1777 /** 1778 * Remove specific implicit flow rule on the remote device. 1779 * 1780 * @param[in, out] pmd 1781 * Pointer to private structure. 1782 * @param[in] idx 1783 * The idx in the implicit_rte_flows array specifying which rule to remove. 1784 * 1785 * @return -1 if one of the implicit rules couldn't be created, 0 otherwise. 1786 */ 1787 int tap_flow_implicit_destroy(struct pmd_internals *pmd, 1788 enum implicit_rule_index idx) 1789 { 1790 struct rte_flow *remote_flow; 1791 int cur_prio = -1; 1792 int idx_prio = implicit_rte_flows[idx].attr.priority + PRIORITY_OFFSET; 1793 1794 for (remote_flow = LIST_FIRST(&pmd->implicit_flows); 1795 remote_flow; 1796 remote_flow = LIST_NEXT(remote_flow, next)) { 1797 cur_prio = (remote_flow->msg.t.tcm_info >> 16) & PRIORITY_MASK; 1798 if (cur_prio != idx_prio) 1799 continue; 1800 return tap_flow_destroy_pmd(pmd, remote_flow, NULL); 1801 } 1802 return 0; 1803 } 1804 1805 /** 1806 * Destroy all implicit flows. 
1807 * 1808 * @see rte_flow_flush() 1809 */ 1810 int 1811 tap_flow_implicit_flush(struct pmd_internals *pmd, struct rte_flow_error *error) 1812 { 1813 struct rte_flow *remote_flow; 1814 1815 while (!LIST_EMPTY(&pmd->implicit_flows)) { 1816 remote_flow = LIST_FIRST(&pmd->implicit_flows); 1817 if (tap_flow_destroy_pmd(pmd, remote_flow, error) < 0) 1818 return -1; 1819 } 1820 return 0; 1821 } 1822 1823 #define MAX_RSS_KEYS 256 1824 #define KEY_IDX_OFFSET (3 * MAX_RSS_KEYS) 1825 #define SEC_NAME_CLS_Q "cls_q" 1826 1827 static const char *sec_name[SEC_MAX] = { 1828 [SEC_L3_L4] = "l3_l4", 1829 }; 1830 1831 /** 1832 * Enable RSS on tap: create TC rules for queuing. 1833 * 1834 * @param[in, out] pmd 1835 * Pointer to private structure. 1836 * 1837 * @param[in] attr 1838 * Pointer to rte_flow to get flow group 1839 * 1840 * @param[out] error 1841 * Pointer to error reporting if not NULL. 1842 * 1843 * @return 0 on success, negative value on failure. 1844 */ 1845 static int rss_enable(struct pmd_internals *pmd, 1846 const struct rte_flow_attr *attr, 1847 struct rte_flow_error *error) 1848 { 1849 struct rte_flow *rss_flow = NULL; 1850 struct nlmsg *msg = NULL; 1851 /* 4096 is the maximum number of instructions for a BPF program */ 1852 char annotation[64]; 1853 int i; 1854 int err = 0; 1855 1856 /* unlimit locked memory */ 1857 struct rlimit memlock_limit = { 1858 .rlim_cur = RLIM_INFINITY, 1859 .rlim_max = RLIM_INFINITY, 1860 }; 1861 setrlimit(RLIMIT_MEMLOCK, &memlock_limit); 1862 1863 /* Get a new map key for a new RSS rule */ 1864 err = bpf_rss_key(KEY_CMD_INIT, NULL); 1865 if (err < 0) { 1866 rte_flow_error_set( 1867 error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, 1868 "Failed to initialize BPF RSS keys"); 1869 1870 return -1; 1871 } 1872 1873 /* 1874 * Create BPF RSS MAP 1875 */ 1876 pmd->map_fd = tap_flow_bpf_rss_map_create(sizeof(__u32), /* key size */ 1877 sizeof(struct rss_key), 1878 MAX_RSS_KEYS); 1879 if (pmd->map_fd < 0) { 1880 TAP_LOG(ERR, 1881 "Failed to create BPF map (%d): %s", 1882 errno, strerror(errno)); 1883 rte_flow_error_set( 1884 error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, 1885 "Kernel too old or not configured " 1886 "to support BPF maps"); 1887 1888 return -ENOTSUP; 1889 } 1890 1891 /* 1892 * Add a rule per queue to match reclassified packets and direct them to 1893 * the correct queue. 
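 * Each rule attaches the BPF classifier loaded from the "cls_q" section for
 * queue i, together with a "skbedit" action setting queue_mapping to i (see
 * the action block below).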

	/*
	 * Add a rule per queue to match reclassified packets and direct them
	 * to the correct queue.
	 */
	for (i = 0; i < pmd->dev->data->nb_rx_queues; i++) {
		pmd->bpf_fd[i] = tap_flow_bpf_cls_q(i);
		if (pmd->bpf_fd[i] < 0) {
			TAP_LOG(ERR,
				"Failed to load BPF section %s for queue %d",
				SEC_NAME_CLS_Q, i);
			rte_flow_error_set(
				error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL,
				"Kernel too old or not configured "
				"to support BPF program loading");

			return -ENOTSUP;
		}

		rss_flow = rte_zmalloc(__func__, sizeof(struct rte_flow), 0);
		if (!rss_flow) {
			TAP_LOG(ERR,
				"Cannot allocate memory for rte_flow");
			return -1;
		}
		msg = &rss_flow->msg;
		tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER, NLM_F_REQUEST |
			    NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
		msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
		tap_flow_set_handle(rss_flow);
		uint16_t group = attr->group << GROUP_SHIFT;
		uint16_t prio = group | (i + PRIORITY_OFFSET);
		msg->t.tcm_info = TC_H_MAKE(prio << 16, msg->t.tcm_info);
		msg->t.tcm_parent = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);

		tap_nlattr_add(&msg->nh, TCA_KIND, sizeof("bpf"), "bpf");
		if (tap_nlattr_nested_start(msg, TCA_OPTIONS) < 0)
			return -1;
		tap_nlattr_add32(&msg->nh, TCA_BPF_FD, pmd->bpf_fd[i]);
		snprintf(annotation, sizeof(annotation), "[%s%d]",
			 SEC_NAME_CLS_Q, i);
		tap_nlattr_add(&msg->nh, TCA_BPF_NAME, strlen(annotation) + 1,
			       annotation);
		/* Actions */
		{
			struct action_data adata = {
				.id = "skbedit",
				.skbedit = {
					.skbedit = {
						.action = TC_ACT_PIPE,
					},
					.queue = i,
				},
			};
			if (add_actions(rss_flow, 1, &adata, TCA_BPF_ACT) < 0)
				return -1;
		}
		tap_nlattr_nested_finish(msg); /* nested TCA_OPTIONS */

		/* Netlink message is now ready to be sent */
		if (tap_nl_send(pmd->nlsk_fd, &msg->nh) < 0)
			return -1;
		err = tap_nl_recv_ack(pmd->nlsk_fd);
		if (err < 0) {
			TAP_LOG(ERR,
				"Kernel refused TC filter rule creation (%d): %s",
				errno, strerror(errno));
			return err;
		}
		LIST_INSERT_HEAD(&pmd->rss_flows, rss_flow, next);
	}

	pmd->rss_enabled = 1;
	return err;
}
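
/*
 * Expected lifecycle of the key repository handled below (illustrative
 * sequence; the release and deinit calls are made from the flow destroy and
 * teardown paths, outside this excerpt):
 *
 *	bpf_rss_key(KEY_CMD_INIT, NULL);              once, when RSS is enabled
 *	bpf_rss_key(KEY_CMD_GET, &flow->key_idx);     for each new RSS flow
 *	bpf_rss_key(KEY_CMD_RELEASE, &flow->key_idx); when a flow is destroyed
 *	bpf_rss_key(KEY_CMD_DEINIT, NULL);            when the keys are dropped
 */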

/**
 * Manage the BPF RSS key repository: init, get, release and deinit.
 *
 * @param[in] cmd
 *   Command on the RSS keys: init, get, release or deinit.
 *
 * @param[in, out] key_idx
 *   Pointer to the RSS key index (out for the get command, in for the release
 *   command).
 *
 * @return -1 if a key couldn't be obtained, 0 otherwise.
 */
static int bpf_rss_key(enum bpf_rss_key_e cmd, __u32 *key_idx)
{
	__u32 i;
	int err = 0;
	static __u32 num_used_keys;
	static __u32 rss_keys[MAX_RSS_KEYS] = {KEY_STAT_UNSPEC};
	static __u32 rss_keys_initialized;
	__u32 key;

	switch (cmd) {
	case KEY_CMD_GET:
		if (!rss_keys_initialized) {
			err = -1;
			break;
		}

		if (num_used_keys == RTE_DIM(rss_keys)) {
			err = -1;
			break;
		}

		*key_idx = num_used_keys % RTE_DIM(rss_keys);
		while (rss_keys[*key_idx] == KEY_STAT_USED)
			*key_idx = (*key_idx + 1) % RTE_DIM(rss_keys);

		rss_keys[*key_idx] = KEY_STAT_USED;

		/*
		 * Add an offset to key_idx in order to handle a mixture of
		 * RSS and non-RSS flows.
		 * If a non-RSS flow is destroyed, it has an eBPF map
		 * index of 0 (initialized on flow creation) and might
		 * unintentionally remove RSS entry 0 from the eBPF map.
		 * To avoid this issue, add an offset to the real index
		 * during a KEY_CMD_GET operation and subtract this offset
		 * during a KEY_CMD_RELEASE operation in order to restore
		 * the real index.
		 */
		*key_idx += KEY_IDX_OFFSET;
		num_used_keys++;
		break;

	case KEY_CMD_RELEASE:
		if (!rss_keys_initialized)
			break;

		/*
		 * Subtract the offset to restore the real key index.
		 * If a non-RSS flow falsely tries to release map entry 0,
		 * the offset subtraction will yield an out-of-range index
		 * and the release operation will be silently ignored.
		 */
		key = *key_idx - KEY_IDX_OFFSET;
		if (key >= RTE_DIM(rss_keys))
			break;

		if (rss_keys[key] == KEY_STAT_USED) {
			rss_keys[key] = KEY_STAT_AVAILABLE;
			num_used_keys--;
		}
		break;

	case KEY_CMD_INIT:
		for (i = 0; i < RTE_DIM(rss_keys); i++)
			rss_keys[i] = KEY_STAT_AVAILABLE;

		rss_keys_initialized = 1;
		num_used_keys = 0;
		break;

	case KEY_CMD_DEINIT:
		for (i = 0; i < RTE_DIM(rss_keys); i++)
			rss_keys[i] = KEY_STAT_UNSPEC;

		rss_keys_initialized = 0;
		num_used_keys = 0;
		break;

	default:
		break;
	}

	return err;
}

/**
 * Add RSS hash calculations and queue selection.
 *
 * @param[in, out] flow
 *   Pointer to the flow to attach the BPF action to; receives the loaded
 *   BPF program fd and the RSS key index.
 *
 * @param[in, out] pmd
 *   Pointer to internal structure, used to get the RSS map fd.
 *
 * @param[in] rss
 *   Pointer to the RSS flow action.
 *
 * @param[out] error
 *   Pointer to error reporting if not NULL.
 *
 * @return 0 on success, negative value on failure.
 */
static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
			   const struct rte_flow_action_rss *rss,
			   struct rte_flow_error *error)
{
	/* 4096 is the maximum number of instructions for a BPF program */
	unsigned int i;
	int err;
	struct rss_key rss_entry = { .hash_fields = 0,
				     .key_size = 0 };

	/* Check supported RSS features */
	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
		return rte_flow_error_set
			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
			 "non-default RSS hash functions are not supported");
	if (rss->level)
		return rte_flow_error_set
			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
			 "a nonzero RSS encapsulation level is not supported");

	/* Get a new map key for a new RSS rule */
	err = bpf_rss_key(KEY_CMD_GET, &flow->key_idx);
	if (err < 0) {
		rte_flow_error_set(
			error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
			"Failed to get BPF RSS key");

		return -1;
	}

	/* Update RSS map entry with queues */
	rss_entry.nb_queues = rss->queue_num;
	for (i = 0; i < rss->queue_num; i++)
		rss_entry.queues[i] = rss->queue[i];
	rss_entry.hash_fields =
		(1 << HASH_FIELD_IPV4_L3_L4) | (1 << HASH_FIELD_IPV6_L3_L4);

	/* Add this RSS entry to the map */
	err = tap_flow_bpf_update_rss_elem(pmd->map_fd,
					   &flow->key_idx, &rss_entry);
	if (err) {
		TAP_LOG(ERR,
			"Failed to update BPF map entry #%u (%d): %s",
			flow->key_idx, errno, strerror(errno));
		rte_flow_error_set(
			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
			"Kernel too old or not configured "
			"to support BPF map updates");

		return -ENOTSUP;
	}
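
	/*
	 * Illustrative content of the map entry written above for an RSS
	 * action spreading traffic over queues {0, 2}: nb_queues = 2,
	 * queues[] = {0, 2}, and hash_fields with both the IPv4 and IPv6
	 * L3+L4 bits set. The "l3_l4" BPF section loaded below is expected
	 * to look this entry up by key_idx at run time and select one of the
	 * listed queues from the computed hash.
	 */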

	/*
	 * Load the BPF program that computes the hash for this key_idx.
	 */
	flow->bpf_fd[SEC_L3_L4] =
		tap_flow_bpf_calc_l3_l4_hash(flow->key_idx, pmd->map_fd);
	if (flow->bpf_fd[SEC_L3_L4] < 0) {
		TAP_LOG(ERR,
			"Failed to load BPF section %s (%d): %s",
			sec_name[SEC_L3_L4], errno, strerror(errno));
		rte_flow_error_set(
			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
			"Kernel too old or not configured "
			"to support BPF program loading");

		return -ENOTSUP;
	}

	/* Actions */
	{
		struct action_data adata[] = {
			{
				.id = "bpf",
				.bpf = {
					.bpf_fd = flow->bpf_fd[SEC_L3_L4],
					.annotation = sec_name[SEC_L3_L4],
					.bpf = {
						.action = TC_ACT_PIPE,
					},
				},
			},
		};

		if (add_actions(flow, RTE_DIM(adata), adata,
				TCA_FLOWER_ACT) < 0)
			return -1;
	}

	return 0;
}

/**
 * Get rte_flow operations.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param ops
 *   Pointer to operation-specific structure.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
tap_dev_flow_ops_get(struct rte_eth_dev *dev __rte_unused,
		     const struct rte_flow_ops **ops)
{
	*ops = &tap_flow_ops;
	return 0;
}