1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2012 Chelsio Communications, Inc. 5 * All rights reserved. 6 * Written by: Navdeep Parhar <np@FreeBSD.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 #include "opt_inet.h" 32 #include "opt_inet6.h" 33 34 #ifdef TCP_OFFLOAD 35 #include <sys/param.h> 36 #include <sys/types.h> 37 #include <sys/kernel.h> 38 #include <sys/ktr.h> 39 #include <sys/module.h> 40 #include <sys/protosw.h> 41 #include <sys/refcount.h> 42 #include <sys/domain.h> 43 #include <sys/fnv_hash.h> 44 #include <sys/socket.h> 45 #include <sys/socketvar.h> 46 #include <sys/sysctl.h> 47 #include <net/ethernet.h> 48 #include <net/if.h> 49 #include <net/if_types.h> 50 #include <net/if_vlan_var.h> 51 #include <net/route.h> 52 #include <net/route/nhop.h> 53 #include <netinet/in.h> 54 #include <netinet/in_fib.h> 55 #include <netinet/in_pcb.h> 56 #include <netinet/ip.h> 57 #include <netinet/ip6.h> 58 #include <netinet6/in6_fib.h> 59 #include <netinet6/scope6_var.h> 60 #include <netinet/tcp_timer.h> 61 #define TCPSTATES 62 #include <netinet/tcp_fsm.h> 63 #include <netinet/tcp_var.h> 64 #include <netinet/toecore.h> 65 #include <netinet/cc/cc.h> 66 67 #include "common/common.h" 68 #include "common/t4_msg.h" 69 #include "common/t4_regs.h" 70 #include "t4_clip.h" 71 #include "tom/t4_tom_l2t.h" 72 #include "tom/t4_tom.h" 73 74 /* stid services */ 75 static int alloc_stid(struct adapter *, bool, void *); 76 static struct listen_ctx *lookup_stid(struct adapter *, int); 77 static void free_stid(struct adapter *, int , bool); 78 79 /* lctx services */ 80 static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *, 81 struct vi_info *); 82 static int free_lctx(struct adapter *, struct listen_ctx *); 83 static void hold_lctx(struct listen_ctx *); 84 static void listen_hash_add(struct adapter *, struct listen_ctx *); 85 static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *); 86 static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *); 87 static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *); 88 89 static void send_abort_rpl_synqe(struct toedev *, struct synq_entry *, int); 90 91 static int create_server6(struct adapter *, struct listen_ctx *); 92 static int create_server(struct adapter *, struct listen_ctx *); 93 94 int 95 alloc_stid_tab(struct adapter *sc) 96 { 97 struct tid_info *t = &sc->tids; 98 99 MPASS(t->nstids > 0); 100 MPASS(t->stid_tab == NULL); 101 102 t->stid_tab = malloc(t->nstids * sizeof(*t->stid_tab), M_CXGBE, 103 M_ZERO | M_NOWAIT); 104 if (t->stid_tab == NULL) 105 return (ENOMEM); 106 t->stid_bitmap = bit_alloc(t->nstids, M_CXGBE, M_NOWAIT); 107 if (t->stid_bitmap == NULL) { 108 free(t->stid_tab, M_CXGBE); 109 t->stid_tab = NULL; 110 return (ENOMEM); 111 } 112 mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF); 113 t->stids_in_use = 0; 114 115 return (0); 116 } 117 118 void 119 free_stid_tab(struct adapter *sc) 120 { 121 struct tid_info *t = &sc->tids; 122 123 KASSERT(t->stids_in_use == 0, 124 ("%s: %d tids still in use.", __func__, t->stids_in_use)); 125 126 if (mtx_initialized(&t->stid_lock)) 127 mtx_destroy(&t->stid_lock); 128 free(t->stid_tab, M_CXGBE); 129 t->stid_tab = NULL; 130 free(t->stid_bitmap, M_CXGBE); 131 t->stid_bitmap = NULL; 132 } 133 134 void 135 stop_stid_tab(struct adapter *sc) 136 { 137 struct tid_info *t = &sc->tids; 138 struct tom_data *td = sc->tom_softc; 139 struct listen_ctx *lctx; 140 struct synq_entry *synqe; 141 int i, ntids; 142 143 mtx_lock(&t->stid_lock); 144 t->stid_tab_stopped = true; 145 mtx_unlock(&t->stid_lock); 146 147 mtx_lock(&td->lctx_hash_lock); 148 for (i = 0; i <= td->listen_mask; i++) { 149 LIST_FOREACH(lctx, &td->listen_hash[i], link) 150 lctx->flags &= ~(LCTX_RPL_PENDING | LCTX_SETUP_IN_HW); 151 } 152 mtx_unlock(&td->lctx_hash_lock); 153 154 mtx_lock(&td->toep_list_lock); 155 TAILQ_FOREACH(synqe, &td->synqe_list, link) { 156 MPASS(sc->incarnation == synqe->incarnation); 157 MPASS(synqe->tid >= 0); 158 MPASS(synqe == lookup_tid(sc, synqe->tid)); 159 /* Remove tid from the lookup table immediately. */ 160 CTR(KTR_CXGBE, "%s: tid %d@%d STRANDED, removed from table", 161 __func__, synqe->tid, synqe->incarnation); 162 ntids = synqe->lctx->inp->inp_vflag & INP_IPV6 ? 2 : 1; 163 remove_tid(sc, synqe->tid, ntids); 164 #if 0 165 /* synqe->tid is stale now but left alone for debug. */ 166 synqe->tid = -1; 167 #endif 168 } 169 MPASS(TAILQ_EMPTY(&td->stranded_synqe)); 170 TAILQ_CONCAT(&td->stranded_synqe, &td->synqe_list, link); 171 MPASS(TAILQ_EMPTY(&td->synqe_list)); 172 mtx_unlock(&td->toep_list_lock); 173 } 174 175 void 176 restart_stid_tab(struct adapter *sc) 177 { 178 struct tid_info *t = &sc->tids; 179 struct tom_data *td = sc->tom_softc; 180 struct listen_ctx *lctx; 181 int i; 182 183 mtx_lock(&td->lctx_hash_lock); 184 for (i = 0; i <= td->listen_mask; i++) { 185 LIST_FOREACH(lctx, &td->listen_hash[i], link) { 186 MPASS((lctx->flags & (LCTX_RPL_PENDING | LCTX_SETUP_IN_HW)) == 0); 187 lctx->flags |= LCTX_RPL_PENDING; 188 if (lctx->inp->inp_vflag & INP_IPV6) 189 create_server6(sc, lctx); 190 else 191 create_server(sc, lctx); 192 } 193 } 194 mtx_unlock(&td->lctx_hash_lock); 195 196 mtx_lock(&t->stid_lock); 197 t->stid_tab_stopped = false; 198 mtx_unlock(&t->stid_lock); 199 200 } 201 202 static int 203 alloc_stid(struct adapter *sc, bool isipv6, void *ctx) 204 { 205 struct tid_info *t = &sc->tids; 206 const u_int n = isipv6 ? 2 : 1; 207 int stid, pair_stid; 208 u_int i; 209 ssize_t val; 210 211 mtx_lock(&t->stid_lock); 212 MPASS(t->stids_in_use <= t->nstids); 213 if (n > t->nstids - t->stids_in_use || t->stid_tab_stopped) { 214 mtx_unlock(&t->stid_lock); 215 return (-1); 216 } 217 218 stid = -1; 219 if (isipv6) { 220 /* 221 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 222 * cells) in the TCAM. We know that the start of the stid 223 * region is properly aligned already (the chip requires each 224 * region to be 128-cell aligned). 225 */ 226 for (i = 0; i + 1 < t->nstids; i = roundup2(val + 1, 2)) { 227 bit_ffc_area_at(t->stid_bitmap, i, t->nstids, 2, &val); 228 if (val == -1) 229 break; 230 if ((val & 1) == 0) { 231 stid = val; 232 break; 233 } 234 } 235 } else { 236 /* 237 * An IPv4 server needs one stid without any alignment 238 * requirements. But we try extra hard to find an available 239 * stid adjacent to a used stid so that free "stid-pairs" are 240 * left intact for IPv6. 241 */ 242 bit_ffc_at(t->stid_bitmap, 0, t->nstids, &val); 243 while (val != -1) { 244 if (stid == -1) { 245 /* 246 * First usable stid. Look no further if it's 247 * an ideal fit. 248 */ 249 stid = val; 250 if (val & 1 || bit_test(t->stid_bitmap, val + 1)) 251 break; 252 } else { 253 /* 254 * We have an unused stid already but are now 255 * looking for in-use stids because we'd prefer 256 * to grab an unused stid adjacent to one that's 257 * in use. 258 * 259 * Odd stids pair with the previous stid and 260 * even ones pair with the next stid. 261 */ 262 pair_stid = val & 1 ? val - 1 : val + 1; 263 if (bit_test(t->stid_bitmap, pair_stid) == 0) { 264 stid = pair_stid; 265 break; 266 } 267 } 268 val = roundup2(val + 1, 2); 269 if (val >= t->nstids) 270 break; 271 bit_ffs_at(t->stid_bitmap, val, t->nstids, &val); 272 } 273 } 274 275 if (stid >= 0) { 276 MPASS(stid + n - 1 < t->nstids); 277 MPASS(bit_ntest(t->stid_bitmap, stid, stid + n - 1, 0)); 278 bit_nset(t->stid_bitmap, stid, stid + n - 1); 279 t->stids_in_use += n; 280 t->stid_tab[stid] = ctx; 281 #ifdef INVARIANTS 282 if (n == 2) { 283 MPASS((stid & 1) == 0); 284 t->stid_tab[stid + 1] = NULL; 285 } 286 #endif 287 stid += t->stid_base; 288 } 289 mtx_unlock(&t->stid_lock); 290 return (stid); 291 } 292 293 static struct listen_ctx * 294 lookup_stid(struct adapter *sc, int stid) 295 { 296 struct tid_info *t = &sc->tids; 297 298 return (t->stid_tab[stid - t->stid_base]); 299 } 300 301 static void 302 free_stid(struct adapter *sc, int stid, bool isipv6) 303 { 304 struct tid_info *t = &sc->tids; 305 const u_int n = isipv6 ? 2 : 1; 306 307 mtx_lock(&t->stid_lock); 308 MPASS(stid >= t->stid_base); 309 stid -= t->stid_base; 310 MPASS(stid + n - 1 < t->nstids); 311 MPASS(t->stids_in_use <= t->nstids); 312 MPASS(t->stids_in_use >= n); 313 MPASS(t->stid_tab[stid] != NULL); 314 #ifdef INVARIANTS 315 if (n == 2) { 316 MPASS((stid & 1) == 0); 317 MPASS(t->stid_tab[stid + 1] == NULL); 318 } 319 #endif 320 MPASS(bit_ntest(t->stid_bitmap, stid, stid + n - 1, 1)); 321 bit_nclear(t->stid_bitmap, stid, stid + n - 1); 322 t->stid_tab[stid] = NULL; 323 t->stids_in_use -= n; 324 mtx_unlock(&t->stid_lock); 325 } 326 327 static struct listen_ctx * 328 alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi) 329 { 330 struct listen_ctx *lctx; 331 332 INP_WLOCK_ASSERT(inp); 333 334 lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO); 335 if (lctx == NULL) 336 return (NULL); 337 338 lctx->isipv6 = inp->inp_vflag & INP_IPV6; 339 lctx->stid = alloc_stid(sc, lctx->isipv6, lctx); 340 if (lctx->stid < 0) { 341 free(lctx, M_CXGBE); 342 return (NULL); 343 } 344 345 if (lctx->isipv6 && 346 !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) { 347 lctx->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true); 348 if (lctx->ce == NULL) { 349 free(lctx, M_CXGBE); 350 return (NULL); 351 } 352 } 353 354 lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id]; 355 lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq]; 356 refcount_init(&lctx->refcount, 1); 357 358 lctx->inp = inp; 359 lctx->vnet = inp->inp_socket->so_vnet; 360 in_pcbref(inp); 361 362 return (lctx); 363 } 364 365 /* Don't call this directly, use release_lctx instead */ 366 static int 367 free_lctx(struct adapter *sc, struct listen_ctx *lctx) 368 { 369 struct inpcb *inp = lctx->inp; 370 371 INP_WLOCK_ASSERT(inp); 372 KASSERT(lctx->refcount == 0, 373 ("%s: refcount %d", __func__, lctx->refcount)); 374 KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid)); 375 376 CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p", 377 __func__, lctx->stid, lctx, lctx->inp); 378 379 if (lctx->ce) 380 t4_release_clip_entry(sc, lctx->ce); 381 free_stid(sc, lctx->stid, lctx->isipv6); 382 free(lctx, M_CXGBE); 383 384 return (in_pcbrele_wlocked(inp)); 385 } 386 387 static void 388 hold_lctx(struct listen_ctx *lctx) 389 { 390 391 refcount_acquire(&lctx->refcount); 392 } 393 394 static inline uint32_t 395 listen_hashfn(void *key, u_long mask) 396 { 397 398 return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask); 399 } 400 401 /* 402 * Add a listen_ctx entry to the listen hash table. 403 */ 404 static void 405 listen_hash_add(struct adapter *sc, struct listen_ctx *lctx) 406 { 407 struct tom_data *td = sc->tom_softc; 408 int bucket = listen_hashfn(lctx->inp, td->listen_mask); 409 410 mtx_lock(&td->lctx_hash_lock); 411 LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link); 412 td->lctx_count++; 413 mtx_unlock(&td->lctx_hash_lock); 414 } 415 416 /* 417 * Look for the listening socket's context entry in the hash and return it. 418 */ 419 static struct listen_ctx * 420 listen_hash_find(struct adapter *sc, struct inpcb *inp) 421 { 422 struct tom_data *td = sc->tom_softc; 423 int bucket = listen_hashfn(inp, td->listen_mask); 424 struct listen_ctx *lctx; 425 426 mtx_lock(&td->lctx_hash_lock); 427 LIST_FOREACH(lctx, &td->listen_hash[bucket], link) { 428 if (lctx->inp == inp) 429 break; 430 } 431 mtx_unlock(&td->lctx_hash_lock); 432 433 return (lctx); 434 } 435 436 /* 437 * Removes the listen_ctx structure for inp from the hash and returns it. 438 */ 439 static struct listen_ctx * 440 listen_hash_del(struct adapter *sc, struct inpcb *inp) 441 { 442 struct tom_data *td = sc->tom_softc; 443 int bucket = listen_hashfn(inp, td->listen_mask); 444 struct listen_ctx *lctx, *l; 445 446 mtx_lock(&td->lctx_hash_lock); 447 LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) { 448 if (lctx->inp == inp) { 449 LIST_REMOVE(lctx, link); 450 td->lctx_count--; 451 break; 452 } 453 } 454 mtx_unlock(&td->lctx_hash_lock); 455 456 return (lctx); 457 } 458 459 /* 460 * Releases a hold on the lctx. Must be called with the listening socket's inp 461 * locked. The inp may be freed by this function and it returns NULL to 462 * indicate this. 463 */ 464 static struct inpcb * 465 release_lctx(struct adapter *sc, struct listen_ctx *lctx) 466 { 467 struct inpcb *inp = lctx->inp; 468 int inp_freed = 0; 469 470 INP_WLOCK_ASSERT(inp); 471 if (refcount_release(&lctx->refcount)) 472 inp_freed = free_lctx(sc, lctx); 473 474 return (inp_freed ? NULL : inp); 475 } 476 477 static void 478 send_flowc_wr_synqe(struct adapter *sc, struct synq_entry *synqe) 479 { 480 struct mbuf *m = synqe->syn; 481 if_t ifp = m->m_pkthdr.rcvif; 482 struct vi_info *vi = if_getsoftc(ifp); 483 struct port_info *pi = vi->pi; 484 struct wrqe *wr; 485 struct fw_flowc_wr *flowc; 486 struct sge_ofld_txq *ofld_txq; 487 struct sge_ofld_rxq *ofld_rxq; 488 const int nparams = 6; 489 const int flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); 490 const u_int pfvf = sc->pf << S_FW_VIID_PFN; 491 492 INP_WLOCK_ASSERT(synqe->lctx->inp); 493 MPASS((synqe->flags & TPF_FLOWC_WR_SENT) == 0); 494 495 ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx]; 496 ofld_rxq = &sc->sge.ofld_rxq[synqe->params.rxq_idx]; 497 498 wr = alloc_wrqe(roundup2(flowclen, 16), &ofld_txq->wrq); 499 if (wr == NULL) { 500 /* XXX */ 501 panic("%s: allocation failure.", __func__); 502 } 503 flowc = wrtod(wr); 504 memset(flowc, 0, wr->wr_len); 505 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 506 V_FW_FLOWC_WR_NPARAMS(nparams)); 507 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | 508 V_FW_WR_FLOWID(synqe->tid)); 509 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; 510 flowc->mnemval[0].val = htobe32(pfvf); 511 flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; 512 flowc->mnemval[1].val = htobe32(pi->tx_chan); 513 flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; 514 flowc->mnemval[2].val = htobe32(pi->tx_chan); 515 flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; 516 flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id); 517 flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF; 518 flowc->mnemval[4].val = htobe32(512); 519 flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS; 520 flowc->mnemval[5].val = htobe32(512); 521 522 synqe->flags |= TPF_FLOWC_WR_SENT; 523 t4_wrq_tx(sc, wr); 524 } 525 526 static void 527 send_abort_rpl_synqe(struct toedev *tod, struct synq_entry *synqe, 528 int rst_status) 529 { 530 struct adapter *sc = tod->tod_softc; 531 struct wrqe *wr; 532 struct cpl_abort_req *req; 533 534 INP_WLOCK_ASSERT(synqe->lctx->inp); 535 536 CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s", 537 __func__, synqe, synqe->flags, synqe->tid, 538 synqe->flags & TPF_ABORT_SHUTDOWN ? 539 " (abort already in progress)" : ""); 540 if (synqe->flags & TPF_ABORT_SHUTDOWN) 541 return; /* abort already in progress */ 542 synqe->flags |= TPF_ABORT_SHUTDOWN; 543 544 if (!(synqe->flags & TPF_FLOWC_WR_SENT)) 545 send_flowc_wr_synqe(sc, synqe); 546 547 wr = alloc_wrqe(sizeof(*req), 548 &sc->sge.ofld_txq[synqe->params.txq_idx].wrq); 549 if (wr == NULL) { 550 /* XXX */ 551 panic("%s: allocation failure.", __func__); 552 } 553 req = wrtod(wr); 554 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid); 555 req->rsvd0 = 0; /* don't have a snd_nxt */ 556 req->rsvd1 = 1; /* no data sent yet */ 557 req->cmd = rst_status; 558 559 t4_l2t_send(sc, wr, &sc->l2t->l2tab[synqe->params.l2t_idx]); 560 } 561 562 static int 563 create_server(struct adapter *sc, struct listen_ctx *lctx) 564 { 565 struct wrqe *wr; 566 struct cpl_pass_open_req *req; 567 struct inpcb *inp = lctx->inp; 568 569 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 570 if (wr == NULL) { 571 log(LOG_ERR, "%s: allocation failure", __func__); 572 return (ENOMEM); 573 } 574 req = wrtod(wr); 575 576 INIT_TP_WR(req, 0); 577 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid)); 578 req->local_port = inp->inp_lport; 579 req->peer_port = 0; 580 req->local_ip = inp->inp_laddr.s_addr; 581 req->peer_ip = 0; 582 req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); 583 req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | 584 F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); 585 586 t4_wrq_tx(sc, wr); 587 return (0); 588 } 589 590 static int 591 create_server6(struct adapter *sc, struct listen_ctx *lctx) 592 { 593 struct wrqe *wr; 594 struct cpl_pass_open_req6 *req; 595 struct inpcb *inp = lctx->inp; 596 597 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 598 if (wr == NULL) { 599 log(LOG_ERR, "%s: allocation failure", __func__); 600 return (ENOMEM); 601 } 602 req = wrtod(wr); 603 604 INIT_TP_WR(req, 0); 605 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid)); 606 req->local_port = inp->inp_lport; 607 req->peer_port = 0; 608 req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; 609 req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; 610 req->peer_ip_hi = 0; 611 req->peer_ip_lo = 0; 612 req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); 613 req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | 614 F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); 615 616 t4_wrq_tx(sc, wr); 617 return (0); 618 } 619 620 static int 621 destroy_server(struct adapter *sc, struct listen_ctx *lctx) 622 { 623 struct wrqe *wr; 624 struct cpl_close_listsvr_req *req; 625 626 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 627 if (wr == NULL) { 628 /* XXX */ 629 panic("%s: allocation failure.", __func__); 630 } 631 req = wrtod(wr); 632 633 INIT_TP_WR(req, 0); 634 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, 635 lctx->stid)); 636 req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id); 637 req->rsvd = htobe16(0); 638 639 t4_wrq_tx(sc, wr); 640 return (0); 641 } 642 643 /* 644 * Start a listening server by sending a passive open request to HW. 645 * 646 * Can't take adapter lock here and access to sc->flags, 647 * sc->offload_map, if_capenable are all race prone. 648 */ 649 int 650 t4_listen_start(struct toedev *tod, struct tcpcb *tp) 651 { 652 struct adapter *sc = tod->tod_softc; 653 struct vi_info *vi; 654 struct port_info *pi; 655 struct inpcb *inp = tptoinpcb(tp); 656 struct listen_ctx *lctx; 657 int i, rc, v; 658 struct offload_settings settings; 659 660 INP_WLOCK_ASSERT(inp); 661 662 rw_rlock(&sc->policy_lock); 663 settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL, 664 EVL_MAKETAG(0xfff, 0, 0), inp); 665 rw_runlock(&sc->policy_lock); 666 if (!settings.offload) 667 return (0); 668 669 /* Don't start a hardware listener for any loopback address. */ 670 if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr)) 671 return (0); 672 if (!(inp->inp_vflag & INP_IPV6) && 673 IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr))) 674 return (0); 675 if (sc->flags & KERN_TLS_ON) 676 return (0); 677 #if 0 678 ADAPTER_LOCK(sc); 679 if (IS_BUSY(sc)) { 680 log(LOG_ERR, "%s: listen request ignored, %s is busy", 681 __func__, device_get_nameunit(sc->dev)); 682 goto done; 683 } 684 685 KASSERT(uld_active(sc, ULD_TOM), 686 ("%s: TOM not initialized", __func__)); 687 #endif 688 689 /* 690 * Find an initialized VI with IFCAP_TOE (4 or 6). We'll use the first 691 * such VI's queues to send the passive open and receive the reply to 692 * it. 693 * 694 * XXX: need a way to mark a port in use by offload. if_cxgbe should 695 * then reject any attempt to bring down such a port (and maybe reject 696 * attempts to disable IFCAP_TOE on that port too?). 697 */ 698 for_each_port(sc, i) { 699 pi = sc->port[i]; 700 for_each_vi(pi, v, vi) { 701 if (vi->flags & VI_INIT_DONE && 702 if_getcapenable(vi->ifp) & IFCAP_TOE) 703 goto found; 704 } 705 } 706 goto done; /* no port that's UP with IFCAP_TOE enabled */ 707 found: 708 709 if (listen_hash_find(sc, inp) != NULL) 710 goto done; /* already setup */ 711 712 lctx = alloc_lctx(sc, inp, vi); 713 if (lctx == NULL) { 714 log(LOG_ERR, 715 "%s: listen request ignored, %s couldn't allocate lctx\n", 716 __func__, device_get_nameunit(sc->dev)); 717 goto done; 718 } 719 listen_hash_add(sc, lctx); 720 721 CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x", 722 __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp, 723 inp->inp_vflag); 724 725 if (inp->inp_vflag & INP_IPV6) 726 rc = create_server6(sc, lctx); 727 else 728 rc = create_server(sc, lctx); 729 if (rc != 0) { 730 log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n", 731 __func__, device_get_nameunit(sc->dev), rc); 732 (void) listen_hash_del(sc, inp); 733 inp = release_lctx(sc, lctx); 734 /* can't be freed, host stack has a reference */ 735 KASSERT(inp != NULL, ("%s: inp freed", __func__)); 736 goto done; 737 } 738 lctx->flags |= LCTX_RPL_PENDING; 739 done: 740 #if 0 741 ADAPTER_UNLOCK(sc); 742 #endif 743 return (0); 744 } 745 746 int 747 t4_listen_stop(struct toedev *tod, struct tcpcb *tp) 748 { 749 struct listen_ctx *lctx; 750 struct adapter *sc = tod->tod_softc; 751 struct inpcb *inp = tptoinpcb(tp); 752 753 INP_WLOCK_ASSERT(inp); 754 755 lctx = listen_hash_del(sc, inp); 756 if (lctx == NULL) 757 return (ENOENT); /* no hardware listener for this inp */ 758 759 CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid, 760 lctx, lctx->flags); 761 762 /* 763 * If the reply to the PASS_OPEN is still pending we'll wait for it to 764 * arrive and clean up when it does. 765 */ 766 if (lctx->flags & LCTX_RPL_PENDING) { 767 return (EINPROGRESS); 768 } 769 770 if (lctx->flags & LCTX_SETUP_IN_HW) 771 destroy_server(sc, lctx); 772 else 773 inp = release_lctx(sc, lctx); 774 return (0); 775 } 776 777 static inline struct synq_entry * 778 alloc_synqe(struct adapter *sc, struct listen_ctx *lctx, int flags) 779 { 780 struct synq_entry *synqe; 781 782 INP_RLOCK_ASSERT(lctx->inp); 783 MPASS(flags == M_WAITOK || flags == M_NOWAIT); 784 785 synqe = malloc(sizeof(*synqe), M_CXGBE, flags); 786 if (__predict_true(synqe != NULL)) { 787 synqe->flags = TPF_SYNQE; 788 synqe->incarnation = sc->incarnation; 789 refcount_init(&synqe->refcnt, 1); 790 synqe->lctx = lctx; 791 hold_lctx(lctx); /* Every synqe has a ref on its lctx. */ 792 synqe->syn = NULL; 793 } 794 795 return (synqe); 796 } 797 798 static inline void 799 hold_synqe(struct synq_entry *synqe) 800 { 801 802 refcount_acquire(&synqe->refcnt); 803 } 804 805 static inline struct inpcb * 806 release_synqe(struct adapter *sc, struct synq_entry *synqe) 807 { 808 struct inpcb *inp; 809 810 MPASS(synqe->flags & TPF_SYNQE); 811 MPASS(synqe->lctx != NULL); 812 813 inp = synqe->lctx->inp; 814 MPASS(inp != NULL); 815 INP_WLOCK_ASSERT(inp); 816 817 if (refcount_release(&synqe->refcnt)) { 818 inp = release_lctx(sc, synqe->lctx); 819 m_freem(synqe->syn); 820 free(synqe, M_CXGBE); 821 } 822 823 return (inp); 824 } 825 826 void 827 t4_syncache_added(struct toedev *tod __unused, void *arg) 828 { 829 struct synq_entry *synqe = arg; 830 831 hold_synqe(synqe); 832 } 833 834 void 835 t4_syncache_removed(struct toedev *tod, void *arg) 836 { 837 struct adapter *sc = tod->tod_softc; 838 struct synq_entry *synqe = arg; 839 struct inpcb *inp = synqe->lctx->inp; 840 841 /* 842 * XXX: this is a LOR but harmless when running from the softclock. 843 */ 844 INP_WLOCK(inp); 845 inp = release_synqe(sc, synqe); 846 if (inp != NULL) 847 INP_WUNLOCK(inp); 848 } 849 850 int 851 t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m) 852 { 853 struct synq_entry *synqe = arg; 854 855 if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) { 856 struct tcpopt to; 857 struct ip *ip = mtod(m, struct ip *); 858 struct tcphdr *th; 859 860 if (ip->ip_v == IPVERSION) 861 th = (void *)(ip + 1); 862 else 863 th = (void *)((struct ip6_hdr *)ip + 1); 864 bzero(&to, sizeof(to)); 865 tcp_dooptions(&to, (void *)(th + 1), 866 (th->th_off << 2) - sizeof(*th), TO_SYN); 867 868 /* save these for later */ 869 synqe->iss = be32toh(th->th_seq); 870 synqe->irs = be32toh(th->th_ack) - 1; 871 synqe->ts = to.to_tsval; 872 } 873 874 m_freem(m); /* don't need this any more */ 875 return (0); 876 } 877 878 static int 879 do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss, 880 struct mbuf *m) 881 { 882 struct adapter *sc = iq->adapter; 883 const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1); 884 int stid = GET_TID(cpl); 885 unsigned int status = cpl->status; 886 struct listen_ctx *lctx = lookup_stid(sc, stid); 887 struct inpcb *inp = lctx->inp; 888 #ifdef INVARIANTS 889 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 890 #endif 891 892 KASSERT(opcode == CPL_PASS_OPEN_RPL, 893 ("%s: unexpected opcode 0x%x", __func__, opcode)); 894 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 895 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 896 897 INP_WLOCK(inp); 898 899 CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x", 900 __func__, stid, status, lctx->flags); 901 902 lctx->flags &= ~LCTX_RPL_PENDING; 903 if (status == CPL_ERR_NONE) 904 lctx->flags |= LCTX_SETUP_IN_HW; 905 else 906 log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status); 907 908 #ifdef INVARIANTS 909 /* 910 * If the inp has been dropped (listening socket closed) then 911 * listen_stop must have run and taken the inp out of the hash. 912 */ 913 if (inp->inp_flags & INP_DROPPED) { 914 KASSERT(listen_hash_del(sc, inp) == NULL, 915 ("%s: inp %p still in listen hash", __func__, inp)); 916 } 917 #endif 918 919 if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) { 920 if (release_lctx(sc, lctx) != NULL) 921 INP_WUNLOCK(inp); 922 return (status); 923 } 924 925 /* 926 * Listening socket stopped listening earlier and now the chip tells us 927 * it has started the hardware listener. Stop it; the lctx will be 928 * released in do_close_server_rpl. 929 */ 930 if (inp->inp_flags & INP_DROPPED) { 931 destroy_server(sc, lctx); 932 INP_WUNLOCK(inp); 933 return (status); 934 } 935 936 /* 937 * Failed to start hardware listener. Take inp out of the hash and 938 * release our reference on it. An error message has been logged 939 * already. 940 */ 941 if (status != CPL_ERR_NONE) { 942 listen_hash_del(sc, inp); 943 if (release_lctx(sc, lctx) != NULL) 944 INP_WUNLOCK(inp); 945 return (status); 946 } 947 948 /* hardware listener open for business */ 949 950 INP_WUNLOCK(inp); 951 return (status); 952 } 953 954 static int 955 do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss, 956 struct mbuf *m) 957 { 958 struct adapter *sc = iq->adapter; 959 const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1); 960 int stid = GET_TID(cpl); 961 unsigned int status = cpl->status; 962 struct listen_ctx *lctx = lookup_stid(sc, stid); 963 struct inpcb *inp = lctx->inp; 964 #ifdef INVARIANTS 965 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 966 #endif 967 968 KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL, 969 ("%s: unexpected opcode 0x%x", __func__, opcode)); 970 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 971 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 972 973 CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status); 974 975 if (status != CPL_ERR_NONE) { 976 log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n", 977 __func__, status, stid); 978 return (status); 979 } 980 981 INP_WLOCK(inp); 982 inp = release_lctx(sc, lctx); 983 if (inp != NULL) 984 INP_WUNLOCK(inp); 985 986 return (status); 987 } 988 989 static void 990 done_with_synqe(struct adapter *sc, struct synq_entry *synqe) 991 { 992 struct tom_data *td = sc->tom_softc; 993 struct listen_ctx *lctx = synqe->lctx; 994 struct inpcb *inp = lctx->inp; 995 struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx]; 996 int ntids; 997 998 INP_WLOCK_ASSERT(inp); 999 1000 if (synqe->tid != -1) { 1001 ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1; 1002 remove_tid(sc, synqe->tid, ntids); 1003 mtx_lock(&td->toep_list_lock); 1004 TAILQ_REMOVE(&td->synqe_list, synqe, link); 1005 mtx_unlock(&td->toep_list_lock); 1006 release_tid(sc, synqe->tid, lctx->ctrlq); 1007 } 1008 t4_l2t_release(e); 1009 inp = release_synqe(sc, synqe); 1010 if (inp) 1011 INP_WUNLOCK(inp); 1012 } 1013 1014 void 1015 synack_failure_cleanup(struct adapter *sc, struct synq_entry *synqe) 1016 { 1017 INP_WLOCK(synqe->lctx->inp); 1018 done_with_synqe(sc, synqe); 1019 } 1020 1021 int 1022 do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss, 1023 struct mbuf *m) 1024 { 1025 struct adapter *sc = iq->adapter; 1026 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 1027 unsigned int tid = GET_TID(cpl); 1028 struct synq_entry *synqe = lookup_tid(sc, tid); 1029 struct listen_ctx *lctx = synqe->lctx; 1030 struct inpcb *inp = lctx->inp; 1031 struct sge_ofld_txq *ofld_txq; 1032 #ifdef INVARIANTS 1033 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1034 #endif 1035 1036 KASSERT(opcode == CPL_ABORT_REQ_RSS, 1037 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1038 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1039 KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); 1040 1041 CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", 1042 __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); 1043 1044 if (negative_advice(cpl->status)) 1045 return (0); /* Ignore negative advice */ 1046 1047 INP_WLOCK(inp); 1048 1049 ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx]; 1050 1051 if (!(synqe->flags & TPF_FLOWC_WR_SENT)) 1052 send_flowc_wr_synqe(sc, synqe); 1053 1054 /* 1055 * If we'd initiated an abort earlier the reply to it is responsible for 1056 * cleaning up resources. Otherwise we tear everything down right here 1057 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 1058 */ 1059 if (synqe->flags & TPF_ABORT_SHUTDOWN) { 1060 INP_WUNLOCK(inp); 1061 goto done; 1062 } 1063 1064 done_with_synqe(sc, synqe); 1065 /* inp lock released by done_with_synqe */ 1066 done: 1067 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 1068 return (0); 1069 } 1070 1071 int 1072 do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss, 1073 struct mbuf *m) 1074 { 1075 struct adapter *sc = iq->adapter; 1076 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 1077 unsigned int tid = GET_TID(cpl); 1078 struct synq_entry *synqe = lookup_tid(sc, tid); 1079 struct listen_ctx *lctx = synqe->lctx; 1080 struct inpcb *inp = lctx->inp; 1081 #ifdef INVARIANTS 1082 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1083 #endif 1084 1085 KASSERT(opcode == CPL_ABORT_RPL_RSS, 1086 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1087 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1088 KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); 1089 1090 CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", 1091 __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); 1092 1093 INP_WLOCK(inp); 1094 KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, 1095 ("%s: wasn't expecting abort reply for synqe %p (0x%x)", 1096 __func__, synqe, synqe->flags)); 1097 1098 done_with_synqe(sc, synqe); 1099 /* inp lock released by done_with_synqe */ 1100 1101 return (0); 1102 } 1103 1104 void 1105 t4_offload_socket(struct toedev *tod, void *arg, struct socket *so) 1106 { 1107 struct adapter *sc = tod->tod_softc; 1108 struct tom_data *td = sc->tom_softc; 1109 struct synq_entry *synqe = arg; 1110 struct inpcb *inp = sotoinpcb(so); 1111 struct toepcb *toep = synqe->toep; 1112 1113 NET_EPOCH_ASSERT(); /* prevents bad race with accept() */ 1114 INP_WLOCK_ASSERT(inp); 1115 KASSERT(synqe->flags & TPF_SYNQE, 1116 ("%s: %p not a synq_entry?", __func__, arg)); 1117 MPASS(toep->tid == synqe->tid); 1118 1119 offload_socket(so, toep); 1120 make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt); 1121 toep->flags |= TPF_CPL_PENDING; 1122 update_tid(sc, synqe->tid, toep); 1123 synqe->flags |= TPF_SYNQE_EXPANDED; 1124 mtx_lock(&td->toep_list_lock); 1125 /* Remove synqe from its list and add the TOE PCB to the active list. */ 1126 TAILQ_REMOVE(&td->synqe_list, synqe, link); 1127 TAILQ_INSERT_TAIL(&td->toep_list, toep, link); 1128 toep->flags |= TPF_IN_TOEP_LIST; 1129 mtx_unlock(&td->toep_list_lock); 1130 inp->inp_flowtype = (inp->inp_vflag & INP_IPV6) ? 1131 M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4; 1132 inp->inp_flowid = synqe->rss_hash; 1133 } 1134 1135 static void 1136 t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to) 1137 { 1138 bzero(to, sizeof(*to)); 1139 1140 if (t4opt->mss) { 1141 to->to_flags |= TOF_MSS; 1142 to->to_mss = be16toh(t4opt->mss); 1143 } 1144 1145 if (t4opt->wsf > 0 && t4opt->wsf < 15) { 1146 to->to_flags |= TOF_SCALE; 1147 to->to_wscale = t4opt->wsf; 1148 } 1149 1150 if (t4opt->tstamp) 1151 to->to_flags |= TOF_TS; 1152 1153 if (t4opt->sack) 1154 to->to_flags |= TOF_SACKPERM; 1155 } 1156 1157 static bool 1158 encapsulated_syn(struct adapter *sc, const struct cpl_pass_accept_req *cpl) 1159 { 1160 u_int hlen = be32toh(cpl->hdr_len); 1161 1162 if (chip_id(sc) >= CHELSIO_T6) 1163 return (G_T6_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header)); 1164 else 1165 return (G_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header)); 1166 } 1167 1168 static void 1169 pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m, 1170 struct in_conninfo *inc, struct tcphdr *th, uint8_t *iptos) 1171 { 1172 const struct cpl_pass_accept_req *cpl = mtod(m, const void *); 1173 const struct ether_header *eh; 1174 unsigned int hlen = be32toh(cpl->hdr_len); 1175 uintptr_t l3hdr; 1176 const struct tcphdr *tcp; 1177 1178 eh = (const void *)(cpl + 1); 1179 if (chip_id(sc) >= CHELSIO_T6) { 1180 l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen)); 1181 tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen)); 1182 } else { 1183 l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen)); 1184 tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen)); 1185 } 1186 1187 /* extract TOS (DiffServ + ECN) byte for AccECN */ 1188 if (iptos) { 1189 if (((struct ip *)l3hdr)->ip_v == IPVERSION) { 1190 const struct ip *ip = (const void *)l3hdr; 1191 *iptos = ip->ip_tos; 1192 } 1193 #ifdef INET6 1194 else 1195 if (((struct ip *)l3hdr)->ip_v == (IPV6_VERSION >> 4)) { 1196 const struct ip6_hdr *ip6 = (const void *)l3hdr; 1197 *iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; 1198 } 1199 #endif /* INET */ 1200 } 1201 1202 if (inc) { 1203 bzero(inc, sizeof(*inc)); 1204 inc->inc_fport = tcp->th_sport; 1205 inc->inc_lport = tcp->th_dport; 1206 if (((struct ip *)l3hdr)->ip_v == IPVERSION) { 1207 const struct ip *ip = (const void *)l3hdr; 1208 1209 inc->inc_faddr = ip->ip_src; 1210 inc->inc_laddr = ip->ip_dst; 1211 } else { 1212 const struct ip6_hdr *ip6 = (const void *)l3hdr; 1213 1214 inc->inc_flags |= INC_ISIPV6; 1215 inc->inc6_faddr = ip6->ip6_src; 1216 inc->inc6_laddr = ip6->ip6_dst; 1217 } 1218 } 1219 1220 if (th) { 1221 bcopy(tcp, th, sizeof(*th)); 1222 tcp_fields_to_host(th); /* just like tcp_input */ 1223 } 1224 } 1225 1226 static struct l2t_entry * 1227 get_l2te_for_nexthop(struct port_info *pi, if_t ifp, 1228 struct in_conninfo *inc) 1229 { 1230 struct l2t_entry *e; 1231 struct sockaddr_in6 sin6; 1232 struct sockaddr *dst = (void *)&sin6; 1233 struct nhop_object *nh; 1234 1235 if (inc->inc_flags & INC_ISIPV6) { 1236 bzero(dst, sizeof(struct sockaddr_in6)); 1237 dst->sa_len = sizeof(struct sockaddr_in6); 1238 dst->sa_family = AF_INET6; 1239 1240 if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) { 1241 /* no need for route lookup */ 1242 e = t4_l2t_get(pi, ifp, dst); 1243 return (e); 1244 } 1245 1246 nh = fib6_lookup(RT_DEFAULT_FIB, &inc->inc6_faddr, 0, NHR_NONE, 0); 1247 if (nh == NULL) 1248 return (NULL); 1249 if (nh->nh_ifp != ifp) 1250 return (NULL); 1251 if (nh->nh_flags & NHF_GATEWAY) 1252 ((struct sockaddr_in6 *)dst)->sin6_addr = nh->gw6_sa.sin6_addr; 1253 else 1254 ((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr; 1255 } else { 1256 dst->sa_len = sizeof(struct sockaddr_in); 1257 dst->sa_family = AF_INET; 1258 1259 nh = fib4_lookup(RT_DEFAULT_FIB, inc->inc_faddr, 0, NHR_NONE, 0); 1260 if (nh == NULL) 1261 return (NULL); 1262 if (nh->nh_ifp != ifp) 1263 return (NULL); 1264 if (nh->nh_flags & NHF_GATEWAY) 1265 if (nh->gw_sa.sa_family == AF_INET) 1266 ((struct sockaddr_in *)dst)->sin_addr = nh->gw4_sa.sin_addr; 1267 else 1268 *((struct sockaddr_in6 *)dst) = nh->gw6_sa; 1269 else 1270 ((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr; 1271 } 1272 1273 e = t4_l2t_get(pi, ifp, dst); 1274 return (e); 1275 } 1276 1277 static int 1278 send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0, 1279 uint32_t opt2, int tid) 1280 { 1281 struct wrqe *wr; 1282 struct cpl_pass_accept_rpl *rpl; 1283 struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx]; 1284 1285 wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) : 1286 sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]); 1287 if (wr == NULL) 1288 return (ENOMEM); 1289 rpl = wrtod(wr); 1290 1291 if (is_t4(sc)) 1292 INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid); 1293 else { 1294 struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl; 1295 1296 INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid); 1297 rpl5->iss = htobe32(synqe->iss); 1298 } 1299 rpl->opt0 = opt0; 1300 rpl->opt2 = opt2; 1301 1302 return (t4_l2t_send(sc, wr, e)); 1303 } 1304 1305 #define REJECT_PASS_ACCEPT_REQ(tunnel) do { \ 1306 if (!tunnel) { \ 1307 m_freem(m); \ 1308 m = NULL; \ 1309 } \ 1310 reject_reason = __LINE__; \ 1311 goto reject; \ 1312 } while (0) 1313 1314 /* 1315 * The context associated with a tid entry via insert_tid could be a synq_entry 1316 * or a toepcb. The only way CPL handlers can tell is via a bit in these flags. 1317 */ 1318 CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags)); 1319 1320 /* 1321 * Incoming SYN on a listening socket. 1322 * 1323 * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe, 1324 * etc. 1325 */ 1326 static int 1327 do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, 1328 struct mbuf *m) 1329 { 1330 struct adapter *sc = iq->adapter; 1331 struct tom_data *td = sc->tom_softc; 1332 struct toedev *tod; 1333 const struct cpl_pass_accept_req *cpl = mtod(m, const void *); 1334 unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); 1335 unsigned int tid = GET_TID(cpl); 1336 struct listen_ctx *lctx = lookup_stid(sc, stid); 1337 struct inpcb *inp; 1338 struct socket *so; 1339 struct in_conninfo inc; 1340 struct tcphdr th; 1341 struct tcpopt to; 1342 struct port_info *pi; 1343 struct vi_info *vi; 1344 if_t hw_ifp, ifp; 1345 struct l2t_entry *e = NULL; 1346 struct synq_entry *synqe = NULL; 1347 int reject_reason, v, ntids; 1348 uint16_t vid, l2info; 1349 struct epoch_tracker et; 1350 #ifdef INVARIANTS 1351 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1352 #endif 1353 struct offload_settings settings; 1354 uint8_t iptos; 1355 1356 KASSERT(opcode == CPL_PASS_ACCEPT_REQ, 1357 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1358 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 1359 1360 CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid, 1361 lctx); 1362 1363 /* 1364 * Figure out the port the SYN arrived on. We'll look for an exact VI 1365 * match in a bit but in case we don't find any we'll use the main VI as 1366 * the incoming ifnet. 1367 */ 1368 l2info = be16toh(cpl->l2info); 1369 pi = sc->port[G_SYN_INTF(l2info)]; 1370 hw_ifp = pi->vi[0].ifp; 1371 m->m_pkthdr.rcvif = hw_ifp; 1372 1373 CURVNET_SET(lctx->vnet); /* before any potential REJECT */ 1374 1375 /* 1376 * If VXLAN/NVGRE parsing is enabled then SYNs in the inner traffic will 1377 * also hit the listener. We don't want to offload those. 1378 */ 1379 if (encapsulated_syn(sc, cpl)) { 1380 REJECT_PASS_ACCEPT_REQ(true); 1381 } 1382 1383 /* 1384 * Use the MAC index to lookup the associated VI. If this SYN didn't 1385 * match a perfect MAC filter, punt. 1386 */ 1387 if (!(l2info & F_SYN_XACT_MATCH)) { 1388 REJECT_PASS_ACCEPT_REQ(true); 1389 } 1390 for_each_vi(pi, v, vi) { 1391 if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info)) 1392 goto found; 1393 } 1394 REJECT_PASS_ACCEPT_REQ(true); 1395 found: 1396 hw_ifp = vi->ifp; /* the cxgbe ifnet */ 1397 m->m_pkthdr.rcvif = hw_ifp; 1398 tod = TOEDEV(hw_ifp); 1399 1400 /* 1401 * Don't offload if the peer requested a TCP option that's not known to 1402 * the silicon. Send the SYN to the kernel instead. 1403 */ 1404 if (__predict_false(cpl->tcpopt.unknown)) 1405 REJECT_PASS_ACCEPT_REQ(true); 1406 1407 /* 1408 * Figure out if there is a pseudo interface (vlan, lagg, etc.) 1409 * involved. Don't offload if the SYN had a VLAN tag and the vid 1410 * doesn't match anything on this interface. 1411 * 1412 * XXX: lagg support, lagg + vlan support. 1413 */ 1414 vid = EVL_VLANOFTAG(be16toh(cpl->vlan)); 1415 if (vid != 0xfff && vid != 0) { 1416 ifp = VLAN_DEVAT(hw_ifp, vid); 1417 if (ifp == NULL) 1418 REJECT_PASS_ACCEPT_REQ(true); 1419 } else 1420 ifp = hw_ifp; 1421 1422 /* 1423 * Don't offload if the ifnet that the SYN came in on is not in the same 1424 * vnet as the listening socket. 1425 */ 1426 if (lctx->vnet != if_getvnet(ifp)) 1427 REJECT_PASS_ACCEPT_REQ(true); 1428 1429 pass_accept_req_to_protohdrs(sc, m, &inc, &th, &iptos); 1430 if (inc.inc_flags & INC_ISIPV6) { 1431 1432 /* Don't offload if the ifcap isn't enabled */ 1433 if ((if_getcapenable(ifp) & IFCAP_TOE6) == 0) 1434 REJECT_PASS_ACCEPT_REQ(true); 1435 1436 /* 1437 * SYN must be directed to an IP6 address on this ifnet. This 1438 * is more restrictive than in6_localip. 1439 */ 1440 NET_EPOCH_ENTER(et); 1441 if (!in6_ifhasaddr(ifp, &inc.inc6_laddr)) { 1442 NET_EPOCH_EXIT(et); 1443 REJECT_PASS_ACCEPT_REQ(true); 1444 } 1445 1446 ntids = 2; 1447 } else { 1448 1449 /* Don't offload if the ifcap isn't enabled */ 1450 if ((if_getcapenable(ifp) & IFCAP_TOE4) == 0) 1451 REJECT_PASS_ACCEPT_REQ(true); 1452 1453 /* 1454 * SYN must be directed to an IP address on this ifnet. This 1455 * is more restrictive than in_localip. 1456 */ 1457 NET_EPOCH_ENTER(et); 1458 if (!in_ifhasaddr(ifp, inc.inc_laddr)) { 1459 NET_EPOCH_EXIT(et); 1460 REJECT_PASS_ACCEPT_REQ(true); 1461 } 1462 1463 ntids = 1; 1464 } 1465 1466 e = get_l2te_for_nexthop(pi, ifp, &inc); 1467 if (e == NULL) { 1468 NET_EPOCH_EXIT(et); 1469 REJECT_PASS_ACCEPT_REQ(true); 1470 } 1471 1472 /* Don't offload if the 4-tuple is already in use */ 1473 if (toe_4tuple_check(&inc, &th, ifp) != 0) { 1474 NET_EPOCH_EXIT(et); 1475 REJECT_PASS_ACCEPT_REQ(false); 1476 } 1477 1478 inp = lctx->inp; /* listening socket, not owned by TOE */ 1479 INP_RLOCK(inp); 1480 1481 /* Don't offload if the listening socket has closed */ 1482 if (__predict_false(inp->inp_flags & INP_DROPPED)) { 1483 INP_RUNLOCK(inp); 1484 NET_EPOCH_EXIT(et); 1485 REJECT_PASS_ACCEPT_REQ(false); 1486 } 1487 so = inp->inp_socket; 1488 rw_rlock(&sc->policy_lock); 1489 settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m, 1490 EVL_MAKETAG(0xfff, 0, 0), inp); 1491 rw_runlock(&sc->policy_lock); 1492 if (!settings.offload) { 1493 INP_RUNLOCK(inp); 1494 NET_EPOCH_EXIT(et); 1495 REJECT_PASS_ACCEPT_REQ(true); /* Rejected by COP. */ 1496 } 1497 1498 synqe = alloc_synqe(sc, lctx, M_NOWAIT); 1499 if (synqe == NULL) { 1500 INP_RUNLOCK(inp); 1501 NET_EPOCH_EXIT(et); 1502 REJECT_PASS_ACCEPT_REQ(true); 1503 } 1504 MPASS(rss->hash_type == RSS_HASH_TCP); 1505 synqe->rss_hash = be32toh(rss->hash_val); 1506 atomic_store_int(&synqe->ok_to_respond, 0); 1507 1508 init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx, 1509 &synqe->params); 1510 1511 /* 1512 * If all goes well t4_syncache_respond will get called during 1513 * syncache_add. Note that syncache_add releases the pcb lock. 1514 */ 1515 t4opt_to_tcpopt(&cpl->tcpopt, &to); 1516 toe_syncache_add(&inc, &to, &th, inp, tod, synqe, iptos); 1517 1518 if (atomic_load_int(&synqe->ok_to_respond) > 0) { 1519 uint64_t opt0; 1520 uint32_t opt2; 1521 1522 opt0 = calc_options0(vi, &synqe->params); 1523 opt2 = calc_options2(vi, &synqe->params); 1524 1525 insert_tid(sc, tid, synqe, ntids); 1526 synqe->tid = tid; 1527 synqe->syn = m; 1528 m = NULL; 1529 mtx_lock(&td->toep_list_lock); 1530 TAILQ_INSERT_TAIL(&td->synqe_list, synqe, link); 1531 mtx_unlock(&td->toep_list_lock); 1532 1533 if (send_synack(sc, synqe, opt0, opt2, tid) != 0) { 1534 remove_tid(sc, tid, ntids); 1535 m = synqe->syn; 1536 synqe->syn = NULL; 1537 mtx_lock(&td->toep_list_lock); 1538 TAILQ_REMOVE(&td->synqe_list, synqe, link); 1539 mtx_unlock(&td->toep_list_lock); 1540 NET_EPOCH_EXIT(et); 1541 REJECT_PASS_ACCEPT_REQ(true); 1542 } 1543 CTR6(KTR_CXGBE, 1544 "%s: stid %u, tid %u, synqe %p, opt0 %#016lx, opt2 %#08x", 1545 __func__, stid, tid, synqe, be64toh(opt0), be32toh(opt2)); 1546 } else { 1547 NET_EPOCH_EXIT(et); 1548 REJECT_PASS_ACCEPT_REQ(false); 1549 } 1550 1551 NET_EPOCH_EXIT(et); 1552 CURVNET_RESTORE(); 1553 return (0); 1554 reject: 1555 CURVNET_RESTORE(); 1556 CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid, 1557 reject_reason); 1558 1559 if (e) 1560 t4_l2t_release(e); 1561 release_tid(sc, tid, lctx->ctrlq); 1562 if (synqe) { 1563 inp = synqe->lctx->inp; 1564 INP_WLOCK(inp); 1565 inp = release_synqe(sc, synqe); 1566 if (inp) 1567 INP_WUNLOCK(inp); 1568 } 1569 1570 if (m) { 1571 /* 1572 * The connection request hit a TOE listener but is being passed 1573 * on to the kernel sw stack instead of getting offloaded. 1574 */ 1575 m_adj(m, sizeof(*cpl)); 1576 m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | 1577 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 1578 m->m_pkthdr.csum_data = 0xffff; 1579 if_input(hw_ifp, m); 1580 } 1581 1582 return (reject_reason); 1583 } 1584 1585 static void 1586 synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe, 1587 const struct cpl_pass_establish *cpl, struct in_conninfo *inc, 1588 struct tcphdr *th, struct tcpopt *to) 1589 { 1590 uint16_t tcp_opt = be16toh(cpl->tcp_opt); 1591 uint8_t iptos; 1592 1593 /* start off with the original SYN */ 1594 pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th, &iptos); 1595 1596 /* modify parts to make it look like the ACK to our SYN|ACK */ 1597 tcp_set_flags(th, TH_ACK); 1598 th->th_ack = synqe->iss + 1; 1599 th->th_seq = be32toh(cpl->rcv_isn); 1600 bzero(to, sizeof(*to)); 1601 if (G_TCPOPT_TSTAMP(tcp_opt)) { 1602 to->to_flags |= TOF_TS; 1603 to->to_tsecr = synqe->ts; 1604 } 1605 } 1606 1607 static int 1608 do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, 1609 struct mbuf *m) 1610 { 1611 struct adapter *sc = iq->adapter; 1612 struct vi_info *vi; 1613 if_t ifp; 1614 const struct cpl_pass_establish *cpl = (const void *)(rss + 1); 1615 #if defined(KTR) || defined(INVARIANTS) 1616 unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); 1617 #endif 1618 unsigned int tid = GET_TID(cpl); 1619 struct synq_entry *synqe = lookup_tid(sc, tid); 1620 struct listen_ctx *lctx = synqe->lctx; 1621 struct inpcb *inp = lctx->inp, *new_inp; 1622 struct socket *so; 1623 struct tcphdr th; 1624 struct tcpopt to; 1625 struct in_conninfo inc; 1626 struct toepcb *toep; 1627 struct epoch_tracker et; 1628 int rstreason; 1629 #ifdef INVARIANTS 1630 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1631 #endif 1632 1633 KASSERT(opcode == CPL_PASS_ESTABLISH, 1634 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1635 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1636 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 1637 KASSERT(synqe->flags & TPF_SYNQE, 1638 ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe)); 1639 1640 CURVNET_SET(lctx->vnet); 1641 NET_EPOCH_ENTER(et); /* for syncache_expand */ 1642 INP_WLOCK(inp); 1643 1644 CTR6(KTR_CXGBE, 1645 "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x", 1646 __func__, stid, tid, synqe, synqe->flags, inp->inp_flags); 1647 1648 ifp = synqe->syn->m_pkthdr.rcvif; 1649 vi = if_getsoftc(ifp); 1650 KASSERT(vi->adapter == sc, 1651 ("%s: vi %p, sc %p mismatch", __func__, vi, sc)); 1652 1653 if (__predict_false(inp->inp_flags & INP_DROPPED)) { 1654 reset: 1655 send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_SEND_RST); 1656 INP_WUNLOCK(inp); 1657 NET_EPOCH_EXIT(et); 1658 CURVNET_RESTORE(); 1659 return (0); 1660 } 1661 1662 KASSERT(synqe->params.rxq_idx == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0], 1663 ("%s: CPL arrived on unexpected rxq. %d %d", __func__, 1664 synqe->params.rxq_idx, 1665 (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0]))); 1666 1667 toep = alloc_toepcb(vi, M_NOWAIT); 1668 if (toep == NULL) 1669 goto reset; 1670 toep->tid = tid; 1671 toep->l2te = &sc->l2t->l2tab[synqe->params.l2t_idx]; 1672 toep->vnet = lctx->vnet; 1673 bcopy(&synqe->params, &toep->params, sizeof(toep->params)); 1674 init_toepcb(vi, toep); 1675 1676 MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss); 1677 MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs); 1678 synqe->tcp_opt = cpl->tcp_opt; 1679 synqe->toep = toep; 1680 1681 /* Come up with something that syncache_expand should be ok with. */ 1682 synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to); 1683 if (inc.inc_flags & INC_ISIPV6) { 1684 if (lctx->ce == NULL) { 1685 toep->ce = t4_get_clip_entry(sc, &inc.inc6_laddr, true); 1686 if (toep->ce == NULL) { 1687 free_toepcb(toep); 1688 goto reset; /* RST without a CLIP entry? */ 1689 } 1690 } else { 1691 t4_hold_clip_entry(sc, lctx->ce); 1692 toep->ce = lctx->ce; 1693 } 1694 } 1695 so = inp->inp_socket; 1696 KASSERT(so != NULL, ("%s: socket is NULL", __func__)); 1697 1698 rstreason = toe_syncache_expand(&inc, &to, &th, &so); 1699 if (rstreason < 0) { 1700 free_toepcb(toep); 1701 send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_NO_RST); 1702 INP_WUNLOCK(inp); 1703 NET_EPOCH_EXIT(et); 1704 CURVNET_RESTORE(); 1705 return (0); 1706 } else if (rstreason == 0 || so == NULL) { 1707 free_toepcb(toep); 1708 goto reset; 1709 } 1710 1711 /* New connection inpcb is already locked by syncache_expand(). */ 1712 new_inp = sotoinpcb(so); 1713 INP_WLOCK_ASSERT(new_inp); 1714 MPASS(so->so_vnet == lctx->vnet); 1715 1716 /* 1717 * This is for expansion from syncookies. 1718 * 1719 * XXX: we've held the tcbinfo lock throughout so there's no risk of 1720 * anyone accept'ing a connection before we've installed our hooks, but 1721 * this somewhat defeats the purpose of having a tod_offload_socket :-( 1722 */ 1723 if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) { 1724 tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0); 1725 t4_offload_socket(TOEDEV(ifp), synqe, so); 1726 } 1727 1728 INP_WUNLOCK(new_inp); 1729 1730 /* Done with the synqe */ 1731 inp = release_synqe(sc, synqe); 1732 if (inp != NULL) 1733 INP_WUNLOCK(inp); 1734 NET_EPOCH_EXIT(et); 1735 CURVNET_RESTORE(); 1736 1737 return (0); 1738 } 1739 1740 void 1741 t4_init_listen_cpl_handlers(void) 1742 { 1743 1744 t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl); 1745 t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl); 1746 t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); 1747 t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); 1748 } 1749 1750 void 1751 t4_uninit_listen_cpl_handlers(void) 1752 { 1753 1754 t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL); 1755 t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL); 1756 t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL); 1757 t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL); 1758 } 1759 #endif 1760