/*-
 *   BSD LICENSE
 *
 *   Copyright 2017 6WIND S.A.
 *   Copyright 2017 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <errno.h>
#include <string.h>
#include <sys/queue.h>

#include <rte_byteorder.h>
#include <rte_jhash.h>
#include <rte_malloc.h>
#include <rte_eth_tap.h>
#include <tap_flow.h>
#include <tap_autoconf.h>
#include <tap_tcmsgs.h>

#ifndef HAVE_TC_FLOWER
/*
 * For kernels < 4.2, this enum is not defined. Runtime checks will be made to
 * avoid sending TC messages the kernel cannot understand.
 */
enum {
	TCA_FLOWER_UNSPEC,
	TCA_FLOWER_CLASSID,
	TCA_FLOWER_INDEV,
	TCA_FLOWER_ACT,
	TCA_FLOWER_KEY_ETH_DST,		/* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_DST_MASK,	/* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_SRC,		/* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_SRC_MASK,	/* ETH_ALEN */
	TCA_FLOWER_KEY_ETH_TYPE,	/* be16 */
	TCA_FLOWER_KEY_IP_PROTO,	/* u8 */
	TCA_FLOWER_KEY_IPV4_SRC,	/* be32 */
	TCA_FLOWER_KEY_IPV4_SRC_MASK,	/* be32 */
	TCA_FLOWER_KEY_IPV4_DST,	/* be32 */
	TCA_FLOWER_KEY_IPV4_DST_MASK,	/* be32 */
	TCA_FLOWER_KEY_IPV6_SRC,	/* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_SRC_MASK,	/* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_DST,	/* struct in6_addr */
	TCA_FLOWER_KEY_IPV6_DST_MASK,	/* struct in6_addr */
	TCA_FLOWER_KEY_TCP_SRC,		/* be16 */
	TCA_FLOWER_KEY_TCP_DST,		/* be16 */
	TCA_FLOWER_KEY_UDP_SRC,		/* be16 */
	TCA_FLOWER_KEY_UDP_DST,		/* be16 */
};
#endif
#ifndef HAVE_TC_VLAN_ID
enum {
	/* TCA_FLOWER_FLAGS, */
	TCA_FLOWER_KEY_VLAN_ID = TCA_FLOWER_KEY_UDP_DST + 2,	/* be16 */
	TCA_FLOWER_KEY_VLAN_PRIO,	/* u8 */
	TCA_FLOWER_KEY_VLAN_ETH_TYPE,	/* be16 */
};
#endif

#define ISOLATE_HANDLE 1

struct rte_flow {
	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
	struct rte_flow *remote_flow; /* associated remote flow */
	struct nlmsg msg;
};

struct convert_data {
	uint16_t eth_type;
	uint16_t ip_proto;
	uint8_t vlan;
	struct rte_flow *flow;
};

struct remote_rule {
	struct rte_flow_attr attr;
	struct rte_flow_item items[2];
	struct rte_flow_action actions[2];
	int mirred;
};

static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
static int tap_flow_create_ipv6(const struct rte_flow_item *item, void *data);
static int tap_flow_create_udp(const struct rte_flow_item *item, void *data);
static int tap_flow_create_tcp(const struct rte_flow_item *item, void *data);
static int
tap_flow_validate(struct rte_eth_dev *dev,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error);

static struct rte_flow *
tap_flow_create(struct rte_eth_dev *dev,
		const struct rte_flow_attr *attr,
		const struct rte_flow_item items[],
		const struct rte_flow_action actions[],
		struct rte_flow_error *error);

static int
tap_flow_destroy(struct rte_eth_dev *dev,
		 struct rte_flow *flow,
		 struct rte_flow_error *error);

static int
tap_flow_isolate(struct rte_eth_dev *dev,
		 int set,
		 struct rte_flow_error *error);

static const struct rte_flow_ops tap_flow_ops = {
	.validate = tap_flow_validate,
	.create = tap_flow_create,
	.destroy = tap_flow_destroy,
	.flush = tap_flow_flush,
	.isolate = tap_flow_isolate,
};

/* Static initializer for items. */
#define ITEMS(...) \
	(const enum rte_flow_item_type []){ \
		__VA_ARGS__, RTE_FLOW_ITEM_TYPE_END, \
	}

/* Structure to generate a simple graph of layers supported by the NIC. */
struct tap_flow_items {
	/* Bit-mask corresponding to what is supported for this item. */
	const void *mask;
	const unsigned int mask_sz; /* Bit-mask size in bytes. */
	/*
	 * Bit-mask corresponding to the default mask, if none is provided
	 * along with the item.
	 */
	const void *default_mask;
	/**
	 * Conversion function from rte_flow to netlink attributes.
	 *
	 * @param item
	 *   rte_flow item to convert.
	 * @param data
	 *   Internal structure to store the conversion.
	 *
	 * @return
	 *   0 on success, negative value otherwise.
	 */
	int (*convert)(const struct rte_flow_item *item, void *data);
	/** List of possible following items. */
	const enum rte_flow_item_type *const items;
};

/* Graph of supported items and associated actions. */
static const struct tap_flow_items tap_flow_items[] = {
	[RTE_FLOW_ITEM_TYPE_END] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_ETH),
	},
	[RTE_FLOW_ITEM_TYPE_ETH] = {
		.items = ITEMS(
			RTE_FLOW_ITEM_TYPE_VLAN,
			RTE_FLOW_ITEM_TYPE_IPV4,
			RTE_FLOW_ITEM_TYPE_IPV6),
		.mask = &(const struct rte_flow_item_eth){
			.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			.type = -1,
		},
		.mask_sz = sizeof(struct rte_flow_item_eth),
		.default_mask = &rte_flow_item_eth_mask,
		.convert = tap_flow_create_eth,
	},
	[RTE_FLOW_ITEM_TYPE_VLAN] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_IPV4,
			       RTE_FLOW_ITEM_TYPE_IPV6),
		.mask = &(const struct rte_flow_item_vlan){
			.tpid = -1,
			/* DEI matching is not supported */
#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
			.tci = 0xffef,
#else
			.tci = 0xefff,
#endif
		},
		.mask_sz = sizeof(struct rte_flow_item_vlan),
		.default_mask = &rte_flow_item_vlan_mask,
		.convert = tap_flow_create_vlan,
	},
	[RTE_FLOW_ITEM_TYPE_IPV4] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
			       RTE_FLOW_ITEM_TYPE_TCP),
		.mask = &(const struct rte_flow_item_ipv4){
			.hdr = {
				.src_addr = -1,
				.dst_addr = -1,
				.next_proto_id = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_ipv4),
		.default_mask = &rte_flow_item_ipv4_mask,
		.convert = tap_flow_create_ipv4,
	},
	[RTE_FLOW_ITEM_TYPE_IPV6] = {
		.items = ITEMS(RTE_FLOW_ITEM_TYPE_UDP,
			       RTE_FLOW_ITEM_TYPE_TCP),
		.mask = &(const struct rte_flow_item_ipv6){
			.hdr = {
				.src_addr = {
					"\xff\xff\xff\xff\xff\xff\xff\xff"
					"\xff\xff\xff\xff\xff\xff\xff\xff",
				},
				.dst_addr = {
					"\xff\xff\xff\xff\xff\xff\xff\xff"
					"\xff\xff\xff\xff\xff\xff\xff\xff",
				},
				.proto = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_ipv6),
		.default_mask = &rte_flow_item_ipv6_mask,
		.convert = tap_flow_create_ipv6,
	},
	[RTE_FLOW_ITEM_TYPE_UDP] = {
		.mask = &(const struct rte_flow_item_udp){
			.hdr = {
				.src_port = -1,
				.dst_port = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_udp),
		.default_mask = &rte_flow_item_udp_mask,
		.convert = tap_flow_create_udp,
	},
	[RTE_FLOW_ITEM_TYPE_TCP] = {
		.mask = &(const struct rte_flow_item_tcp){
			.hdr = {
				.src_port = -1,
				.dst_port = -1,
			},
		},
		.mask_sz = sizeof(struct rte_flow_item_tcp),
		.default_mask = &rte_flow_item_tcp_mask,
		.convert = tap_flow_create_tcp,
	},
};
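
/*
 * Illustration only (not used by the code): a pattern such as
 *   ETH -> IPV4 -> UDP -> END
 * is accepted because each item appears in the ->items list of its
 * predecessor in the graph above; priv_flow_process() walks this graph and
 * rejects any item that does not follow it.
 */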

/*
 * TC rules, by growing priority
 *
 *        Remote netdevice                  Tap netdevice
 * +-------------+-------------+  +-------------+-------------+
 * |   Ingress   |   Egress    |  |   Ingress   |   Egress    |
 * |-------------|-------------|  |-------------|-------------|
 * |             |  \       /  |  |             |  REMOTE TX  | prio 1
 * |             |   \     /   |  |             |   \     /   | prio 2
 * |  EXPLICIT   |    \   /    |  |  EXPLICIT   |    \   /    |   .
 * |             |     \ /     |  |             |     \ /     |   .
 * |    RULES    |      X      |  |    RULES    |      X      |   .
 * |      .      |     / \     |  |      .      |     / \     |   .
 * |      .      |    /   \    |  |      .      |    /   \    |   .
 * |      .      |   /     \   |  |      .      |   /     \   |   .
 * |      .      |  /       \  |  |      .      |  /       \  |   .
 *
 *      ....           ....           ....           ....
 *
 * |      .      |  \       /  |  |      .      |  \       /  |   .
 * |      .      |   \     /   |  |      .      |   \     /   |   .
 * |             |    \   /    |  |             |    \   /    |
 * |  LOCAL_MAC  |     \ /     |  |    \   /    |     \ /     | last prio - 5
 * |   PROMISC   |      X      |  |     \ /     |      X      | last prio - 4
 * |   ALLMULTI  |     / \     |  |      X      |     / \     | last prio - 3
 * |  BROADCAST  |    /   \    |  |     / \     |    /   \    | last prio - 2
 * | BROADCASTV6 |   /     \   |  |    /   \    |   /     \   | last prio - 1
 * |     xx      |  /       \  |  |   ISOLATE   |  /       \  | last prio
 * +-------------+-------------+  +-------------+-------------+
 *
 * The implicit flow rules are stored in a list, with the ISOLATE and
 * REMOTE_TX rules mandatorily being the last two. e.g.:
 *
 * LOCAL_MAC -> BROADCAST -> BROADCASTV6 -> REMOTE_TX -> ISOLATE -> NULL
 *
 * That enables tap_flow_isolate() to remove implicit rules by popping the
 * list head and removing it, as long as the rule applies to the remote
 * netdevice. The implicit rule for TX redirection is not removed, as isolate
 * concerns only incoming traffic.
 */
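
/*
 * Note on the mapping to the diagram above: each implicit ingress rule below
 * uses group MAX_GROUP and priority PRIORITY_MASK minus its
 * implicit_rule_index, so it always sorts after explicit rte_flow rules;
 * TAP_REMOTE_TX is the exception, using group 0 on egress.
 */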

static struct remote_rule implicit_rte_flows[TAP_REMOTE_MAX_IDX] = {
	[TAP_REMOTE_LOCAL_MAC] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_LOCAL_MAC,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_REDIR,
	},
	[TAP_REMOTE_BROADCAST] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_BROADCAST,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
			.spec = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_BROADCASTV6] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_BROADCASTV6,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
			},
			.spec = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_PROMISC] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_PROMISC,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_VOID,
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_ALLMULTI] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_REMOTE_ALLMULTI,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_ETH,
			.mask = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
			},
			.spec = &(const struct rte_flow_item_eth){
				.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
			},
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_REMOTE_TX] = {
		.attr = {
			.group = 0,
			.priority = TAP_REMOTE_TX,
			.egress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_VOID,
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
		.mirred = TCA_EGRESS_MIRROR,
	},
	[TAP_ISOLATE] = {
		.attr = {
			.group = MAX_GROUP,
			.priority = PRIORITY_MASK - TAP_ISOLATE,
			.ingress = 1,
		},
		.items[0] = {
			.type = RTE_FLOW_ITEM_TYPE_VOID,
		},
		.items[1] = {
			.type = RTE_FLOW_ITEM_TYPE_END,
		},
	},
};

/**
 * Perform as many checks as possible on an Ethernet item, and if a flow is
 * provided, fill it appropriately with Ethernet info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_eth(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_eth *spec = item->spec;
	const struct rte_flow_item_eth *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_ETH].default_mask;
	/* TC does not support eth_type masking. Only accept if exact match. */
	if (mask->type && mask->type != 0xffff)
		return -1;
	if (!spec)
		return 0;
	/* store eth_type for consistency if ipv4/6 pattern item comes next */
	if (spec->type & mask->type)
		info->eth_type = spec->type;
	if (!flow)
		return 0;
	msg = &flow->msg;
	if (!is_zero_ether_addr(&spec->dst)) {
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_DST, ETHER_ADDR_LEN,
			       &spec->dst.addr_bytes);
		tap_nlattr_add(&msg->nh,
			       TCA_FLOWER_KEY_ETH_DST_MASK, ETHER_ADDR_LEN,
			       &mask->dst.addr_bytes);
	}
	if (!is_zero_ether_addr(&mask->src)) {
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_ETH_SRC, ETHER_ADDR_LEN,
			       &spec->src.addr_bytes);
		tap_nlattr_add(&msg->nh,
			       TCA_FLOWER_KEY_ETH_SRC_MASK, ETHER_ADDR_LEN,
			       &mask->src.addr_bytes);
	}
	return 0;
}

/**
 * Perform as many checks as possible on a VLAN item, and if a flow is
 * provided, fill it appropriately with VLAN info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_vlan(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_vlan *spec = item->spec;
	const struct rte_flow_item_vlan *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_VLAN].default_mask;
	/* TC does not support tpid masking. Only accept if exact match. */
	if (mask->tpid && mask->tpid != 0xffff)
		return -1;
	/* Double-tagging not supported. */
	if (spec && mask->tpid && spec->tpid != htons(ETH_P_8021Q))
		return -1;
	info->vlan = 1;
	if (!flow)
		return 0;
	msg = &flow->msg;
	msg->t.tcm_info = TC_H_MAKE(msg->t.tcm_info, htons(ETH_P_8021Q));
#define VLAN_PRIO(tci) ((tci) >> 13)
#define VLAN_ID(tci) ((tci) & 0xfff)
	if (!spec)
		return 0;
	if (spec->tci) {
		uint16_t tci = ntohs(spec->tci) & mask->tci;
		uint16_t prio = VLAN_PRIO(tci);
		uint8_t vid = VLAN_ID(tci);

		if (prio)
			tap_nlattr_add8(&msg->nh,
					TCA_FLOWER_KEY_VLAN_PRIO, prio);
		if (vid)
			tap_nlattr_add16(&msg->nh,
					 TCA_FLOWER_KEY_VLAN_ID, vid);
	}
	return 0;
}

/**
 * Perform as many checks as possible on an IPv4 item, and if a flow is
 * provided, fill it appropriately with IPv4 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_ipv4(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_ipv4 *spec = item->spec;
	const struct rte_flow_item_ipv4 *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV4].default_mask;
	/* check that previous eth type is compatible with ipv4 */
	if (info->eth_type && info->eth_type != htons(ETH_P_IP))
		return -1;
	/* store ip_proto for consistency if udp/tcp pattern item comes next */
	if (spec)
		info->ip_proto = spec->hdr.next_proto_id;
	if (!flow)
		return 0;
	msg = &flow->msg;
	if (!info->eth_type)
		info->eth_type = htons(ETH_P_IP);
	if (!spec)
		return 0;
	if (spec->hdr.dst_addr) {
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST,
				 spec->hdr.dst_addr);
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_DST_MASK,
				 mask->hdr.dst_addr);
	}
	if (spec->hdr.src_addr) {
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC,
				 spec->hdr.src_addr);
		tap_nlattr_add32(&msg->nh, TCA_FLOWER_KEY_IPV4_SRC_MASK,
				 mask->hdr.src_addr);
	}
	if (spec->hdr.next_proto_id)
		tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO,
				spec->hdr.next_proto_id);
	return 0;
}

/**
 * Perform as many checks as possible on an IPv6 item, and if a flow is
 * provided, fill it appropriately with IPv6 info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_ipv6(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_ipv6 *spec = item->spec;
	const struct rte_flow_item_ipv6 *mask = item->mask;
	struct rte_flow *flow = info->flow;
	uint8_t empty_addr[16] = { 0 };
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_IPV6].default_mask;
	/* check that previous eth type is compatible with ipv6 */
	if (info->eth_type && info->eth_type != htons(ETH_P_IPV6))
		return -1;
	/* store ip_proto for consistency if udp/tcp pattern item comes next */
	if (spec)
		info->ip_proto = spec->hdr.proto;
	if (!flow)
		return 0;
	msg = &flow->msg;
	if (!info->eth_type)
		info->eth_type = htons(ETH_P_IPV6);
	if (!spec)
		return 0;
	if (memcmp(spec->hdr.dst_addr, empty_addr, 16)) {
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST,
			       sizeof(spec->hdr.dst_addr), &spec->hdr.dst_addr);
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_DST_MASK,
			       sizeof(mask->hdr.dst_addr), &mask->hdr.dst_addr);
	}
	if (memcmp(spec->hdr.src_addr, empty_addr, 16)) {
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC,
			       sizeof(spec->hdr.src_addr), &spec->hdr.src_addr);
		tap_nlattr_add(&msg->nh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
			       sizeof(mask->hdr.src_addr), &mask->hdr.src_addr);
	}
	if (spec->hdr.proto)
		tap_nlattr_add8(&msg->nh,
				TCA_FLOWER_KEY_IP_PROTO, spec->hdr.proto);
	return 0;
}

/**
 * Perform as many checks as possible on a UDP item, and if a flow is
 * provided, fill it appropriately with UDP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_udp(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_udp *spec = item->spec;
	const struct rte_flow_item_udp *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_UDP].default_mask;
	/* check that previous ip_proto is compatible with udp */
	if (info->ip_proto && info->ip_proto != IPPROTO_UDP)
		return -1;
	/* TC does not support UDP port masking. Only accept if exact match. */
	if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
	    (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
		return -1;
	if (!flow)
		return 0;
	msg = &flow->msg;
	tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_UDP);
	if (!spec)
		return 0;
	if (spec->hdr.dst_port & mask->hdr.dst_port)
		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_DST,
				 spec->hdr.dst_port);
	if (spec->hdr.src_port & mask->hdr.src_port)
		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_UDP_SRC,
				 spec->hdr.src_port);
	return 0;
}

/**
 * Perform as many checks as possible on a TCP item, and if a flow is
 * provided, fill it appropriately with TCP info.
 *
 * @param[in] item
 *   Item specification.
 * @param[in, out] data
 *   Additional data structure to tell next layers we've been here.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
tap_flow_create_tcp(const struct rte_flow_item *item, void *data)
{
	struct convert_data *info = (struct convert_data *)data;
	const struct rte_flow_item_tcp *spec = item->spec;
	const struct rte_flow_item_tcp *mask = item->mask;
	struct rte_flow *flow = info->flow;
	struct nlmsg *msg;

	/* use default mask if none provided */
	if (!mask)
		mask = tap_flow_items[RTE_FLOW_ITEM_TYPE_TCP].default_mask;
	/* check that previous ip_proto is compatible with tcp */
	if (info->ip_proto && info->ip_proto != IPPROTO_TCP)
		return -1;
	/* TC does not support TCP port masking. Only accept if exact match. */
	if ((mask->hdr.src_port && mask->hdr.src_port != 0xffff) ||
	    (mask->hdr.dst_port && mask->hdr.dst_port != 0xffff))
		return -1;
	if (!flow)
		return 0;
	msg = &flow->msg;
	tap_nlattr_add8(&msg->nh, TCA_FLOWER_KEY_IP_PROTO, IPPROTO_TCP);
	if (!spec)
		return 0;
	if (spec->hdr.dst_port & mask->hdr.dst_port)
		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_DST,
				 spec->hdr.dst_port);
	if (spec->hdr.src_port & mask->hdr.src_port)
		tap_nlattr_add16(&msg->nh, TCA_FLOWER_KEY_TCP_SRC,
				 spec->hdr.src_port);
	return 0;
}

/**
 * Check support for a given item.
 *
 * @param[in] item
 *   Item specification.
 * @param size
 *   Bit-mask size in bytes.
 * @param[in] supported_mask
 *   Bit-mask covering supported fields to compare with spec, last and mask in
 *   \item.
 * @param[in] default_mask
 *   Bit-mask default mask if none is provided in \item.
 *
 * @return
 *   0 on success.
 */
static int
tap_flow_item_validate(const struct rte_flow_item *item,
		       unsigned int size,
		       const uint8_t *supported_mask,
		       const uint8_t *default_mask)
{
	int ret = 0;

	/* An empty layer is allowed, as long as all fields are NULL */
	if (!item->spec && (item->mask || item->last))
		return -1;
	/* Is the item spec compatible with what the NIC supports? */
	if (item->spec && !item->mask) {
		unsigned int i;
		const uint8_t *spec = item->spec;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
		/* Is the default mask compatible with what the NIC supports? */
		for (i = 0; i < size; i++)
			if ((default_mask[i] | supported_mask[i]) !=
			    supported_mask[i])
				return -1;
	}
	/* Is the item last compatible with what the NIC supports? */
	if (item->last && !item->mask) {
		unsigned int i;
		const uint8_t *spec = item->last;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
	}
	/* Is the item mask compatible with what the NIC supports? */
	if (item->mask) {
		unsigned int i;
		const uint8_t *spec = item->mask;

		for (i = 0; i < size; ++i)
			if ((spec[i] | supported_mask[i]) != supported_mask[i])
				return -1;
	}
	/**
	 * Once masked, are item spec and item last equal?
	 * TC does not support range so anything else is invalid.
	 */
	if (item->spec && item->last) {
		uint8_t spec[size];
		uint8_t last[size];
		const uint8_t *apply = default_mask;
		unsigned int i;

		if (item->mask)
			apply = item->mask;
		for (i = 0; i < size; ++i) {
			spec[i] = ((const uint8_t *)item->spec)[i] & apply[i];
			last[i] = ((const uint8_t *)item->last)[i] & apply[i];
		}
		ret = memcmp(spec, last, size);
	}
	return ret;
}

/**
 * Transform a DROP/PASSTHRU action item in the provided flow for TC.
 *
 * @param[in, out] flow
 *   Flow to be filled.
 * @param[in] action
 *   Appropriate action to be set in the TCA_GACT_PARMS structure.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
add_action_gact(struct rte_flow *flow, int action)
{
	struct nlmsg *msg = &flow->msg;
	size_t act_index = 1;
	struct tc_gact p = {
		.action = action
	};

	if (tap_nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
		return -1;
	if (tap_nlattr_nested_start(msg, act_index++) < 0)
		return -1;
	tap_nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("gact"), "gact");
	if (tap_nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
		return -1;
	tap_nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(p), &p);
	tap_nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
	tap_nlattr_nested_finish(msg); /* nested act_index */
	tap_nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
	return 0;
}

/**
 * Transform a MIRRED action item in the provided flow for TC.
 *
 * @param[in, out] flow
 *   Flow to be filled.
 * @param[in] ifindex
 *   Netdevice ifindex, where to mirror/redirect packet to.
 * @param[in] action_type
 *   Either TCA_EGRESS_REDIR for redirection or TCA_EGRESS_MIRROR for mirroring.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
add_action_mirred(struct rte_flow *flow, uint16_t ifindex, uint16_t action_type)
{
	struct nlmsg *msg = &flow->msg;
	size_t act_index = 1;
	struct tc_mirred p = {
		.eaction = action_type,
		.ifindex = ifindex,
	};

	if (tap_nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
		return -1;
	if (tap_nlattr_nested_start(msg, act_index++) < 0)
		return -1;
	tap_nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("mirred"), "mirred");
	if (tap_nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
		return -1;
	if (action_type == TCA_EGRESS_MIRROR)
		p.action = TC_ACT_PIPE;
	else /* REDIRECT */
		p.action = TC_ACT_STOLEN;
	tap_nlattr_add(&msg->nh, TCA_MIRRED_PARMS, sizeof(p), &p);
	tap_nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
	tap_nlattr_nested_finish(msg); /* nested act_index */
	tap_nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
	return 0;
}

/**
 * Transform a QUEUE action item in the provided flow for TC.
 *
 * @param[in, out] flow
 *   Flow to be filled.
 * @param[in] queue
 *   Queue id to use.
 *
 * @return
 *   0 if checks are alright, -1 otherwise.
 */
static int
add_action_skbedit(struct rte_flow *flow, uint16_t queue)
{
	struct nlmsg *msg = &flow->msg;
	size_t act_index = 1;
	struct tc_skbedit p = {
		.action = TC_ACT_PIPE
	};

	if (tap_nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
		return -1;
	if (tap_nlattr_nested_start(msg, act_index++) < 0)
		return -1;
	tap_nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("skbedit"), "skbedit");
	if (tap_nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
		return -1;
	tap_nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS, sizeof(p), &p);
	tap_nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING, queue);
	tap_nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
	tap_nlattr_nested_finish(msg); /* nested act_index */
	tap_nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
	return 0;
}

/**
 * Validate a flow supported by TC.
 * If flow param is not NULL, then also fill the netlink message inside.
 *
 * @param pmd
 *   Pointer to private structure.
 * @param[in] attr
 *   Flow rule attributes.
 * @param[in] pattern
 *   Pattern specification (list terminated by the END pattern item).
 * @param[in] actions
 *   Associated actions (list terminated by the END action).
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 * @param[in, out] flow
 *   Flow structure to update.
 * @param[in] mirred
 *   If set to TCA_EGRESS_REDIR, provided actions will be replaced with a
 *   redirection to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd.
 *   If set to TCA_EGRESS_MIRROR, provided actions will be replaced with a
 *   mirroring to the tap netdevice, and the TC rule will be configured
 *   on the remote netdevice in pmd. Matching packets will thus be duplicated.
 *   If set to 0, the standard behavior is to be used: set correct actions for
 *   the TC rule, and apply it on the tap netdevice.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
priv_flow_process(struct pmd_internals *pmd,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error,
		  struct rte_flow *flow,
		  int mirred)
{
	const struct tap_flow_items *cur_item = tap_flow_items;
	struct convert_data data = {
		.eth_type = 0,
		.ip_proto = 0,
		.flow = flow,
	};
	int action = 0; /* Only one action authorized for now */

	if (attr->group > MAX_GROUP) {
		rte_flow_error_set(
			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
			NULL, "group value too big: cannot exceed 15");
		return -rte_errno;
	}
	if (attr->priority > MAX_PRIORITY) {
		rte_flow_error_set(
			error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
			NULL, "priority value too big");
		return -rte_errno;
	} else if (flow) {
		uint16_t group = attr->group << GROUP_SHIFT;
		uint16_t prio = group | (attr->priority + PRIORITY_OFFSET);

		flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
						 flow->msg.t.tcm_info);
	}
	if (flow) {
		if (mirred) {
			/*
			 * If attr->ingress, the rule applies on remote ingress
			 * to match incoming packets.
			 * If attr->egress, the rule applies on tap ingress (as
			 * seen from the kernel) to deal with packets going out
			 * from the DPDK app.
			 */
			flow->msg.t.tcm_parent = TC_H_MAKE(TC_H_INGRESS, 0);
		} else {
			/* Standard rule on tap egress (kernel standpoint). */
			flow->msg.t.tcm_parent =
				TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
		}
		/* use flower filter type */
		tap_nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
		if (tap_nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0)
			goto exit_item_not_supported;
	}
	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
		const struct tap_flow_items *token = NULL;
		unsigned int i;
		int err = 0;

		if (items->type == RTE_FLOW_ITEM_TYPE_VOID)
			continue;
		for (i = 0;
		     cur_item->items &&
		     cur_item->items[i] != RTE_FLOW_ITEM_TYPE_END;
		     ++i) {
			if (cur_item->items[i] == items->type) {
				token = &tap_flow_items[items->type];
				break;
			}
		}
		if (!token)
			goto exit_item_not_supported;
		cur_item = token;
		err = tap_flow_item_validate(
			items, cur_item->mask_sz,
			(const uint8_t *)cur_item->mask,
			(const uint8_t *)cur_item->default_mask);
		if (err)
			goto exit_item_not_supported;
		if (flow && cur_item->convert) {
			err = cur_item->convert(items, &data);
			if (err)
				goto exit_item_not_supported;
		}
	}
	if (flow) {
		if (data.vlan) {
			tap_nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
					 htons(ETH_P_8021Q));
			tap_nlattr_add16(&flow->msg.nh,
					 TCA_FLOWER_KEY_VLAN_ETH_TYPE,
					 data.eth_type ?
					 data.eth_type : htons(ETH_P_ALL));
		} else if (data.eth_type) {
			tap_nlattr_add16(&flow->msg.nh, TCA_FLOWER_KEY_ETH_TYPE,
					 data.eth_type);
		}
	}
	if (mirred && flow) {
		uint16_t if_index = pmd->if_index;

		/*
		 * If attr->egress && mirred, then this is a special
		 * case where the rule must be applied on the tap, to
		 * redirect packets coming from the DPDK App, out
		 * through the remote netdevice.
		 */
		if (attr->egress)
			if_index = pmd->remote_if_index;
		if (add_action_mirred(flow, if_index, mirred) < 0)
			goto exit_action_not_supported;
		else
			goto end;
	}
	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
		int err = 0;

		if (actions->type == RTE_FLOW_ACTION_TYPE_VOID) {
			continue;
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) {
			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (flow)
				err = add_action_gact(flow, TC_ACT_SHOT);
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (flow)
				err = add_action_gact(flow, TC_ACT_UNSPEC);
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
			const struct rte_flow_action_queue *queue =
				(const struct rte_flow_action_queue *)
				actions->conf;

			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (!queue ||
			    (queue->index > pmd->dev->data->nb_rx_queues - 1))
				goto exit_action_not_supported;
			if (flow)
				err = add_action_skbedit(flow, queue->index);
		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
			/* Fake RSS support. */
			const struct rte_flow_action_rss *rss =
				(const struct rte_flow_action_rss *)
				actions->conf;

			if (action)
				goto exit_action_not_supported;
			action = 1;
			if (!rss || rss->num < 1 ||
			    (rss->queue[0] > pmd->dev->data->nb_rx_queues - 1))
				goto exit_action_not_supported;
			if (flow)
				err = add_action_skbedit(flow, rss->queue[0]);
		} else {
			goto exit_action_not_supported;
		}
		if (err)
			goto exit_action_not_supported;
	}
end:
	if (flow)
		tap_nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
	return 0;
exit_item_not_supported:
	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
			   items, "item not supported");
	return -rte_errno;
exit_action_not_supported:
	rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
			   actions, "action not supported");
	return -rte_errno;
}
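
/*
 * For reference: after priv_flow_process(), tcm_info holds the TC filter
 * priority in its upper 16 bits (group and offset rte_flow priority) and the
 * protocol in its lower 16 bits (ETH_P_ALL, or ETH_P_8021Q for VLAN flows).
 * tap_flow_implicit_destroy() relies on this layout to look a rule up again.
 */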

/**
 * Validate a flow.
 *
 * @see rte_flow_validate()
 * @see rte_flow_ops
 */
static int
tap_flow_validate(struct rte_eth_dev *dev,
		  const struct rte_flow_attr *attr,
		  const struct rte_flow_item items[],
		  const struct rte_flow_action actions[],
		  struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	return priv_flow_process(pmd, attr, items, actions, error, NULL, 0);
}

/**
 * Set a unique handle in a flow.
 *
 * The kernel supports TC rules with equal priority, as long as they use the
 * same matching fields (e.g.: dst mac and ipv4) with different values (and
 * full mask to ensure no collision is possible).
 * In those rules, the handle (uint32_t) is the part that would identify
 * specifically each rule.
 *
 * On 32-bit architectures, the handle can simply be the flow's pointer
 * address. On 64-bit architectures, we rely on jhash(flow) to find a
 * (sufficiently) unique handle.
 *
 * @param[in, out] flow
 *   The flow that needs its handle set.
 */
static void
tap_flow_set_handle(struct rte_flow *flow)
{
	uint32_t handle = 0;

	if (sizeof(flow) > 4)
		handle = rte_jhash(&flow, sizeof(flow), 1);
	else
		handle = (uintptr_t)flow;
	/* must be at least 1 to avoid letting the kernel choose one for us */
	if (!handle)
		handle = 1;
	flow->msg.t.tcm_handle = handle;
}

/**
 * Create a flow.
 *
 * @see rte_flow_create()
 * @see rte_flow_ops
 */
static struct rte_flow *
tap_flow_create(struct rte_eth_dev *dev,
		const struct rte_flow_attr *attr,
		const struct rte_flow_item items[],
		const struct rte_flow_action actions[],
		struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct rte_flow *remote_flow = NULL;
	struct rte_flow *flow = NULL;
	struct nlmsg *msg = NULL;
	int err;

	if (!pmd->if_index) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL,
				   "can't create rule, ifindex not found");
		goto fail;
	}
	/*
	 * No rules configured through standard rte_flow should be set on the
	 * priorities used by implicit rules.
	 */
	if ((attr->group == MAX_GROUP) &&
	    attr->priority > (MAX_PRIORITY - TAP_REMOTE_MAX_IDX)) {
		rte_flow_error_set(
			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
			NULL, "priority value too big");
		goto fail;
	}
	flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
	if (!flow) {
		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "cannot allocate memory for rte_flow");
		goto fail;
	}
	msg = &flow->msg;
	tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER,
		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
	tap_flow_set_handle(flow);
	if (priv_flow_process(pmd, attr, items, actions, error, flow, 0))
		goto fail;
	err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
	if (err < 0) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "couldn't send request to kernel");
		goto fail;
	}
	err = tap_nl_recv_ack(pmd->nlsk_fd);
	if (err < 0) {
		RTE_LOG(ERR, PMD,
			"Kernel refused TC filter rule creation (%d): %s\n",
			errno, strerror(errno));
		rte_flow_error_set(error, EEXIST, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL,
				   "overlapping rules or Kernel too old for flower support");
		goto fail;
	}
	LIST_INSERT_HEAD(&pmd->flows, flow, next);
	/**
	 * If a remote device is configured, a TC rule with identical items for
	 * matching must be set on that device, with a single action: redirect
	 * to the local pmd->if_index.
	 */
	if (pmd->remote_if_index) {
		remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
		if (!remote_flow) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
				"cannot allocate memory for rte_flow");
			goto fail;
		}
		msg = &remote_flow->msg;
		/* set the rule if_index for the remote netdevice */
		tc_init_msg(
			msg, pmd->remote_if_index, RTM_NEWTFILTER,
			NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
		msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
		tap_flow_set_handle(remote_flow);
		if (priv_flow_process(pmd, attr, items, NULL,
				      error, remote_flow, TCA_EGRESS_REDIR)) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "rte flow rule validation failed");
			goto fail;
		}
		err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
		if (err < 0) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "Failure sending nl request");
			goto fail;
		}
		err = tap_nl_recv_ack(pmd->nlsk_fd);
		if (err < 0) {
			RTE_LOG(ERR, PMD,
				"Kernel refused TC filter rule creation (%d): %s\n",
				errno, strerror(errno));
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL,
				"overlapping rules or Kernel too old for flower support");
			goto fail;
		}
		flow->remote_flow = remote_flow;
	}
	return flow;
fail:
	if (remote_flow)
		rte_free(remote_flow);
	if (flow)
		rte_free(flow);
	return NULL;
}

/**
 * Destroy a flow using pointer to pmd_internal.
 *
 * @param[in, out] pmd
 *   Pointer to private structure.
 * @param[in] flow
 *   Pointer to the flow to destroy.
 * @param[in, out] error
 *   Pointer to the flow error handler.
 *
 * @return 0 if the flow could be destroyed, -1 otherwise.
 */
static int
tap_flow_destroy_pmd(struct pmd_internals *pmd,
		     struct rte_flow *flow,
		     struct rte_flow_error *error)
{
	struct rte_flow *remote_flow = flow->remote_flow;
	int ret = 0;

	LIST_REMOVE(flow, next);
	flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	flow->msg.nh.nlmsg_type = RTM_DELTFILTER;

	ret = tap_nl_send(pmd->nlsk_fd, &flow->msg.nh);
	if (ret < 0) {
		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
				   NULL, "couldn't send request to kernel");
		goto end;
	}
	ret = tap_nl_recv_ack(pmd->nlsk_fd);
	/* If errno is ENOENT, the rule is already no longer in the kernel. */
	if (ret < 0 && errno == ENOENT)
		ret = 0;
	if (ret < 0) {
		RTE_LOG(ERR, PMD,
			"Kernel refused TC filter rule deletion (%d): %s\n",
			errno, strerror(errno));
		rte_flow_error_set(
			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
			"couldn't receive kernel ack to our request");
		goto end;
	}
	if (remote_flow) {
		remote_flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
		remote_flow->msg.nh.nlmsg_type = RTM_DELTFILTER;

		ret = tap_nl_send(pmd->nlsk_fd, &remote_flow->msg.nh);
		if (ret < 0) {
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "Failure sending nl request");
			goto end;
		}
		ret = tap_nl_recv_ack(pmd->nlsk_fd);
		if (ret < 0 && errno == ENOENT)
			ret = 0;
		if (ret < 0) {
			RTE_LOG(ERR, PMD,
				"Kernel refused TC filter rule deletion (%d): %s\n",
				errno, strerror(errno));
			rte_flow_error_set(
				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
				NULL, "Failure trying to receive nl ack");
			goto end;
		}
	}
end:
	if (remote_flow)
		rte_free(remote_flow);
	rte_free(flow);
	return ret;
}

/**
 * Destroy a flow.
 *
 * @see rte_flow_destroy()
 * @see rte_flow_ops
 */
static int
tap_flow_destroy(struct rte_eth_dev *dev,
		 struct rte_flow *flow,
		 struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	return tap_flow_destroy_pmd(pmd, flow, error);
}

/**
 * Enable/disable flow isolation.
 *
 * @see rte_flow_isolate()
 * @see rte_flow_ops
 */
static int
tap_flow_isolate(struct rte_eth_dev *dev,
		 int set,
		 struct rte_flow_error *error __rte_unused)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	if (set)
		pmd->flow_isolate = 1;
	else
		pmd->flow_isolate = 0;
	/*
	 * If netdevice is there, setup appropriate flow rules immediately.
	 * Otherwise it will be set when bringing up the netdevice (tun_alloc).
	 */
	if (!pmd->rxq[0].fd)
		return 0;
	if (set) {
		struct rte_flow *flow;

		while (1) {
			flow = LIST_FIRST(&pmd->implicit_flows);
			if (!flow)
				break;
			/*
			 * Remove all implicit rules on the remote.
			 * Keep the local rule to redirect packets on TX.
			 * Keep also the last implicit local rule: ISOLATE.
			 */
			if (flow->msg.t.tcm_ifindex == pmd->if_index)
				break;
			if (tap_flow_destroy_pmd(pmd, flow, NULL) < 0)
				goto error;
		}
		/* Switch the TC rule according to pmd->flow_isolate */
		if (tap_flow_implicit_create(pmd, TAP_ISOLATE) == -1)
			goto error;
	} else {
		/* Switch the TC rule according to pmd->flow_isolate */
		if (tap_flow_implicit_create(pmd, TAP_ISOLATE) == -1)
			goto error;
		if (!pmd->remote_if_index)
			return 0;
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_TX) < 0)
			goto error;
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0)
			goto error;
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCAST) < 0)
			goto error;
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCASTV6) < 0)
			goto error;
		if (dev->data->promiscuous &&
		    tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC) < 0)
			goto error;
		if (dev->data->all_multicast &&
		    tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI) < 0)
			goto error;
	}
	return 0;
error:
	pmd->flow_isolate = 0;
	return rte_flow_error_set(
		error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
		"TC rule creation failed");
}

/**
 * Destroy all flows.
 *
 * @see rte_flow_flush()
 * @see rte_flow_ops
 */
int
tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct rte_flow *flow;

	while (!LIST_EMPTY(&pmd->flows)) {
		flow = LIST_FIRST(&pmd->flows);
		if (tap_flow_destroy(dev, flow, error) < 0)
			return -1;
	}
	return 0;
}

/**
 * Add an implicit flow rule on the remote device to make sure traffic gets to
 * the tap netdevice from there.
 *
 * @param pmd
 *   Pointer to private structure.
 * @param[in] idx
 *   The idx in the implicit_rte_flows array specifying which rule to apply.
 *
 * @return -1 if the rule couldn't be applied, 0 otherwise.
 */
int tap_flow_implicit_create(struct pmd_internals *pmd,
			     enum implicit_rule_index idx)
{
	uint16_t flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE;
	struct rte_flow_action *actions = implicit_rte_flows[idx].actions;
	struct rte_flow_action isolate_actions[2] = {
		[1] = {
			.type = RTE_FLOW_ACTION_TYPE_END,
		},
	};
	struct rte_flow_item *items = implicit_rte_flows[idx].items;
	struct rte_flow_attr *attr = &implicit_rte_flows[idx].attr;
	struct rte_flow_item_eth eth_local = { .type = 0 };
	uint16_t if_index = pmd->remote_if_index;
	struct rte_flow *remote_flow = NULL;
	struct nlmsg *msg = NULL;
	int err = 0;
	struct rte_flow_item items_local[2] = {
		[0] = {
			.type = items[0].type,
			.spec = &eth_local,
			.mask = items[0].mask,
		},
		[1] = {
			.type = items[1].type,
		}
	};

	remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
	if (!remote_flow) {
		RTE_LOG(ERR, PMD, "Cannot allocate memory for rte_flow\n");
		goto fail;
	}
	msg = &remote_flow->msg;
	if (idx == TAP_REMOTE_TX) {
		if_index = pmd->if_index;
	} else if (idx == TAP_ISOLATE) {
		if_index = pmd->if_index;
		/* Don't be exclusive for this rule, it can be changed later. */
		flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE;
		isolate_actions[0].type = pmd->flow_isolate ?
			RTE_FLOW_ACTION_TYPE_DROP :
			RTE_FLOW_ACTION_TYPE_PASSTHRU;
		actions = isolate_actions;
	} else if (idx == TAP_REMOTE_LOCAL_MAC) {
		/*
		 * eth addr couldn't be set in implicit_rte_flows[] as it is not
		 * known at compile time.
		 */
		memcpy(&eth_local.dst, &pmd->eth_addr, sizeof(pmd->eth_addr));
		items = items_local;
	}
	tc_init_msg(msg, if_index, RTM_NEWTFILTER, flags);
	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
	/*
	 * The ISOLATE rule is always present and must have a static handle, as
	 * the action is changed whether the feature is enabled (DROP) or
	 * disabled (PASSTHRU).
	 */
	if (idx == TAP_ISOLATE)
		remote_flow->msg.t.tcm_handle = ISOLATE_HANDLE;
	else
		tap_flow_set_handle(remote_flow);
	if (priv_flow_process(pmd, attr, items, actions, NULL,
			      remote_flow, implicit_rte_flows[idx].mirred)) {
		RTE_LOG(ERR, PMD, "rte flow rule validation failed\n");
		goto fail;
	}
	err = tap_nl_send(pmd->nlsk_fd, &msg->nh);
	if (err < 0) {
		RTE_LOG(ERR, PMD, "Failure sending nl request\n");
		goto fail;
	}
	err = tap_nl_recv_ack(pmd->nlsk_fd);
	if (err < 0) {
		RTE_LOG(ERR, PMD,
			"Kernel refused TC filter rule creation (%d): %s\n",
			errno, strerror(errno));
		goto fail;
	}
	LIST_INSERT_HEAD(&pmd->implicit_flows, remote_flow, next);
	return 0;
fail:
	if (remote_flow)
		rte_free(remote_flow);
	return -1;
}

/**
 * Remove a specific implicit flow rule on the remote device.
 *
 * @param[in, out] pmd
 *   Pointer to private structure.
 * @param[in] idx
 *   The idx in the implicit_rte_flows array specifying which rule to remove.
 *
 * @return -1 if the implicit rule couldn't be removed, 0 otherwise.
 */
int tap_flow_implicit_destroy(struct pmd_internals *pmd,
			      enum implicit_rule_index idx)
{
	struct rte_flow *remote_flow;
	int cur_prio = -1;
	int idx_prio = implicit_rte_flows[idx].attr.priority + PRIORITY_OFFSET;

	for (remote_flow = LIST_FIRST(&pmd->implicit_flows);
	     remote_flow;
	     remote_flow = LIST_NEXT(remote_flow, next)) {
		cur_prio = (remote_flow->msg.t.tcm_info >> 16) & PRIORITY_MASK;
		if (cur_prio != idx_prio)
			continue;
		return tap_flow_destroy_pmd(pmd, remote_flow, NULL);
	}
	return 0;
}

/**
 * Destroy all implicit flows.
 *
 * @see rte_flow_flush()
 */
int
tap_flow_implicit_flush(struct pmd_internals *pmd, struct rte_flow_error *error)
{
	struct rte_flow *remote_flow;

	while (!LIST_EMPTY(&pmd->implicit_flows)) {
		remote_flow = LIST_FIRST(&pmd->implicit_flows);
		if (tap_flow_destroy_pmd(pmd, remote_flow, error) < 0)
			return -1;
	}
	return 0;
}

/**
 * Manage filter operations.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param filter_type
 *   Filter type.
 * @param filter_op
 *   Operation to perform.
 * @param arg
 *   Pointer to operation-specific structure.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
int
tap_dev_filter_ctrl(struct rte_eth_dev *dev,
		    enum rte_filter_type filter_type,
		    enum rte_filter_op filter_op,
		    void *arg)
{
	switch (filter_type) {
	case RTE_ETH_FILTER_GENERIC:
		if (filter_op != RTE_ETH_FILTER_GET)
			return -EINVAL;
		*(const void **)arg = &tap_flow_ops;
		return 0;
	default:
		RTE_LOG(ERR, PMD, "%p: filter type (%d) not supported\n",
			(void *)dev, filter_type);
	}
	return -EINVAL;
}
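
/*
 * Usage sketch (application side, not part of this driver): these callbacks
 * are reached through the generic rte_flow API, e.g.:
 *
 *   struct rte_flow_error err;
 *   struct rte_flow *f;
 *
 *   f = rte_flow_create(port_id, &attr, pattern, actions, &err);
 *
 * rte_flow resolves tap_flow_ops through tap_dev_filter_ctrl() with the
 * RTE_ETH_FILTER_GENERIC filter type.
 */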