10Sstevel@tonic-gate /* 20Sstevel@tonic-gate * CDDL HEADER START 30Sstevel@tonic-gate * 40Sstevel@tonic-gate * The contents of this file are subject to the terms of the 55766Sgg161487 * Common Development and Distribution License (the "License"). 65766Sgg161487 * You may not use this file except in compliance with the License. 70Sstevel@tonic-gate * 80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 100Sstevel@tonic-gate * See the License for the specific language governing permissions 110Sstevel@tonic-gate * and limitations under the License. 120Sstevel@tonic-gate * 130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 180Sstevel@tonic-gate * 190Sstevel@tonic-gate * CDDL HEADER END 200Sstevel@tonic-gate */ 218917SVenkatakrishnan.Rajagopalan@Sun.COM 220Sstevel@tonic-gate /* 2312163SRamaswamy.Tummala@Sun.COM * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 240Sstevel@tonic-gate */ 250Sstevel@tonic-gate 260Sstevel@tonic-gate #ifndef _SYS_IB_CLIENTS_IBD_H 270Sstevel@tonic-gate #define _SYS_IB_CLIENTS_IBD_H 280Sstevel@tonic-gate 290Sstevel@tonic-gate #ifdef __cplusplus 300Sstevel@tonic-gate extern "C" { 310Sstevel@tonic-gate #endif 320Sstevel@tonic-gate 3311534SKevin.Ge@Sun.COM /* The following macros are used in both ibd.c and ibd_cm.c */ 3411534SKevin.Ge@Sun.COM 3511534SKevin.Ge@Sun.COM /* 3611534SKevin.Ge@Sun.COM * Completion queue polling control 3711534SKevin.Ge@Sun.COM */ 3811534SKevin.Ge@Sun.COM #define IBD_CQ_POLLING 0x1 3911534SKevin.Ge@Sun.COM #define IBD_REDO_CQ_POLLING 0x2 4011534SKevin.Ge@Sun.COM 4111534SKevin.Ge@Sun.COM /* 4211534SKevin.Ge@Sun.COM * Maximum length for returning chained mps back to crossbow. 4311534SKevin.Ge@Sun.COM * Also used as the maximum number of rx wc's polled at a time. 4411534SKevin.Ge@Sun.COM */ 4511534SKevin.Ge@Sun.COM #define IBD_MAX_RX_MP_LEN 16 4611534SKevin.Ge@Sun.COM 4711534SKevin.Ge@Sun.COM /* 4811534SKevin.Ge@Sun.COM * When doing multiple-send-wr, this value determines how many to do at 4911534SKevin.Ge@Sun.COM * a time (in a single ibt_post_send). 5011534SKevin.Ge@Sun.COM */ 5111534SKevin.Ge@Sun.COM #define IBD_MAX_TX_POST_MULTIPLE 4 5211534SKevin.Ge@Sun.COM 5311534SKevin.Ge@Sun.COM /* 5411534SKevin.Ge@Sun.COM * Flag bits for resources to reap 5511534SKevin.Ge@Sun.COM */ 5611534SKevin.Ge@Sun.COM #define IBD_RSRC_SWQE 0x1 5711534SKevin.Ge@Sun.COM #define IBD_RSRC_LSOBUF 0x2 5811534SKevin.Ge@Sun.COM #define IBD_RSRC_RC_SWQE 0x4 5911534SKevin.Ge@Sun.COM #define IBD_RSRC_RC_TX_LARGEBUF 0x8 6011534SKevin.Ge@Sun.COM 6111534SKevin.Ge@Sun.COM /* 6211534SKevin.Ge@Sun.COM * Async operation types 6311534SKevin.Ge@Sun.COM */ 6411534SKevin.Ge@Sun.COM #define IBD_ASYNC_GETAH 1 6511534SKevin.Ge@Sun.COM #define IBD_ASYNC_JOIN 2 6611534SKevin.Ge@Sun.COM #define IBD_ASYNC_LEAVE 3 6711534SKevin.Ge@Sun.COM #define IBD_ASYNC_PROMON 4 6811534SKevin.Ge@Sun.COM #define IBD_ASYNC_PROMOFF 5 6911534SKevin.Ge@Sun.COM #define IBD_ASYNC_REAP 6 7011534SKevin.Ge@Sun.COM #define IBD_ASYNC_TRAP 7 7111534SKevin.Ge@Sun.COM #define IBD_ASYNC_SCHED 8 7211534SKevin.Ge@Sun.COM #define IBD_ASYNC_LINK 9 7311534SKevin.Ge@Sun.COM #define IBD_ASYNC_EXIT 10 7411534SKevin.Ge@Sun.COM #define IBD_ASYNC_RC_TOO_BIG 11 75*13030SKevin.Ge@Sun.COM #define IBD_ASYNC_RC_CLOSE_ACT_CHAN 12 76*13030SKevin.Ge@Sun.COM #define IBD_ASYNC_RC_RECYCLE_ACE 13 77*13030SKevin.Ge@Sun.COM #define IBD_ASYNC_RC_CLOSE_PAS_CHAN 14 78*13030SKevin.Ge@Sun.COM 79*13030SKevin.Ge@Sun.COM /* 80*13030SKevin.Ge@Sun.COM * State of IBD driver initialization during attach/m_start 81*13030SKevin.Ge@Sun.COM */ 82*13030SKevin.Ge@Sun.COM #define IBD_DRV_STATE_INITIALIZED 0x000001 83*13030SKevin.Ge@Sun.COM #define IBD_DRV_RXINTR_ADDED 0x000002 84*13030SKevin.Ge@Sun.COM #define IBD_DRV_TXINTR_ADDED 0x000004 85*13030SKevin.Ge@Sun.COM #define IBD_DRV_IBTL_ATTACH_DONE 0x000008 86*13030SKevin.Ge@Sun.COM #define IBD_DRV_HCA_OPENED 0x000010 87*13030SKevin.Ge@Sun.COM #define IBD_DRV_PD_ALLOCD 0x000020 88*13030SKevin.Ge@Sun.COM #define IBD_DRV_MAC_REGISTERED 0x000040 89*13030SKevin.Ge@Sun.COM #define IBD_DRV_PORT_DETAILS_OBTAINED 0x000080 90*13030SKevin.Ge@Sun.COM #define IBD_DRV_BCAST_GROUP_FOUND 0x000100 91*13030SKevin.Ge@Sun.COM #define IBD_DRV_ACACHE_INITIALIZED 0x000200 92*13030SKevin.Ge@Sun.COM #define IBD_DRV_CQS_ALLOCD 0x000400 93*13030SKevin.Ge@Sun.COM #define IBD_DRV_UD_CHANNEL_SETUP 0x000800 94*13030SKevin.Ge@Sun.COM #define IBD_DRV_TXLIST_ALLOCD 0x001000 95*13030SKevin.Ge@Sun.COM #define IBD_DRV_SCQ_NOTIFY_ENABLED 0x002000 96*13030SKevin.Ge@Sun.COM #define IBD_DRV_RXLIST_ALLOCD 0x004000 97*13030SKevin.Ge@Sun.COM #define IBD_DRV_BCAST_GROUP_JOINED 0x008000 98*13030SKevin.Ge@Sun.COM #define IBD_DRV_ASYNC_THR_CREATED 0x010000 99*13030SKevin.Ge@Sun.COM #define IBD_DRV_RCQ_NOTIFY_ENABLED 0x020000 100*13030SKevin.Ge@Sun.COM #define IBD_DRV_SM_NOTICES_REGISTERED 0x040000 101*13030SKevin.Ge@Sun.COM #define IBD_DRV_STARTED 0x080000 102*13030SKevin.Ge@Sun.COM #define IBD_DRV_RC_SRQ_ALLOCD 0x100000 103*13030SKevin.Ge@Sun.COM #define IBD_DRV_RC_LARGEBUF_ALLOCD 0x200000 104*13030SKevin.Ge@Sun.COM #define IBD_DRV_RC_LISTEN 0x400000 105*13030SKevin.Ge@Sun.COM #ifdef DEBUG 106*13030SKevin.Ge@Sun.COM #define IBD_DRV_RC_PRIVATE_STATE 0x800000 107*13030SKevin.Ge@Sun.COM #endif 108*13030SKevin.Ge@Sun.COM #define IBD_DRV_IN_DELETION 0x1000000 109*13030SKevin.Ge@Sun.COM #define IBD_DRV_IN_LATE_HCA_INIT 0x2000000 110*13030SKevin.Ge@Sun.COM #define IBD_DRV_REQ_LIST_INITED 0x4000000 111*13030SKevin.Ge@Sun.COM #define IBD_DRV_RC_TIMEOUT 0x8000000 11211534SKevin.Ge@Sun.COM 11311534SKevin.Ge@Sun.COM /* 11411534SKevin.Ge@Sun.COM * Miscellaneous constants 11511534SKevin.Ge@Sun.COM */ 11611534SKevin.Ge@Sun.COM #define IBD_SEND 0 11711534SKevin.Ge@Sun.COM #define IBD_RECV 1 11811534SKevin.Ge@Sun.COM 11912163SRamaswamy.Tummala@Sun.COM /* Tunables defaults and limits */ 12012163SRamaswamy.Tummala@Sun.COM #define IBD_LINK_MODE_UD 0 12112163SRamaswamy.Tummala@Sun.COM #define IBD_LINK_MODE_RC 1 12212163SRamaswamy.Tummala@Sun.COM 12312163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_LINK_MODE IBD_LINK_MODE_RC 12412163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_LSO_POLICY B_TRUE 12512163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_NUM_LSO_BUFS 1024 12612163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_CREATE_BCAST_GROUP B_TRUE 12712163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_COALESCE_COMPLETIONS B_TRUE 12812163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_UD_RX_COMP_COUNT 4 12912163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_UD_RX_COMP_USEC 10 13012163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_UD_TX_COMP_COUNT 16 13112163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_UD_TX_COMP_USEC 300 13212163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_RC_RX_COMP_COUNT 4 13312163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_RC_RX_COMP_USEC 10 13412163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_RC_TX_COMP_COUNT 10 13512163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_RC_TX_COMP_USEC 300 13612163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_UD_TX_COPY_THRESH 4096 13712163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_RC_RX_COPY_THRESH 4096 13812163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_RC_TX_COPY_THRESH 4096 13912163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_UD_NUM_RWQE 4000 14012163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_UD_NUM_SWQE 4000 14112163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_RC_ENABLE_SRQ B_TRUE 142*13030SKevin.Ge@Sun.COM #if defined(__i386) 143*13030SKevin.Ge@Sun.COM #define IBD_DEF_RC_NUM_RWQE 511 144*13030SKevin.Ge@Sun.COM #define IBD_DEF_RC_NUM_SWQE 255 145*13030SKevin.Ge@Sun.COM #else 14612163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_RC_NUM_RWQE 2047 14712163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_RC_NUM_SWQE 511 148*13030SKevin.Ge@Sun.COM #endif 14912163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_NUM_AH 256 15012163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_HASH_SIZE 32 15112163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_RC_NUM_SRQ (IBD_DEF_RC_NUM_RWQE - 1) 15212163SRamaswamy.Tummala@Sun.COM #define IBD_DEF_RC_RX_RWQE_THRESH (IBD_DEF_RC_NUM_RWQE >> 2) 15312163SRamaswamy.Tummala@Sun.COM 15412163SRamaswamy.Tummala@Sun.COM /* Tunable limits */ 15512163SRamaswamy.Tummala@Sun.COM #define IBD_MIN_NUM_LSO_BUFS 512 15612163SRamaswamy.Tummala@Sun.COM #define IBD_MAX_NUM_LSO_BUFS 4096 15712163SRamaswamy.Tummala@Sun.COM #define IBD_MIN_UD_TX_COPY_THRESH 2048 15812163SRamaswamy.Tummala@Sun.COM #define IBD_MAX_UD_TX_COPY_THRESH 65536 15912163SRamaswamy.Tummala@Sun.COM #define IBD_MIN_UD_NUM_SWQE 512 16012163SRamaswamy.Tummala@Sun.COM #define IBD_MAX_UD_NUM_SWQE 8000 16112163SRamaswamy.Tummala@Sun.COM #define IBD_MIN_UD_NUM_RWQE 512 16212163SRamaswamy.Tummala@Sun.COM #define IBD_MAX_UD_NUM_RWQE 8000 16312163SRamaswamy.Tummala@Sun.COM #define IBD_MIN_NUM_AH 32 16412163SRamaswamy.Tummala@Sun.COM #define IBD_MAX_NUM_AH 8192 16512163SRamaswamy.Tummala@Sun.COM #define IBD_MIN_HASH_SIZE 32 16612163SRamaswamy.Tummala@Sun.COM #define IBD_MAX_HASH_SIZE 1024 16712163SRamaswamy.Tummala@Sun.COM 168*13030SKevin.Ge@Sun.COM #if defined(__i386) 169*13030SKevin.Ge@Sun.COM #define IBD_MIN_RC_NUM_SWQE 255 170*13030SKevin.Ge@Sun.COM #else 17112163SRamaswamy.Tummala@Sun.COM #define IBD_MIN_RC_NUM_SWQE 511 172*13030SKevin.Ge@Sun.COM #endif 17312163SRamaswamy.Tummala@Sun.COM #define IBD_MAX_RC_NUM_SWQE 8000 17412163SRamaswamy.Tummala@Sun.COM #define IBD_MIN_RC_NUM_RWQE 511 17512163SRamaswamy.Tummala@Sun.COM #define IBD_MAX_RC_NUM_RWQE 8000 17612163SRamaswamy.Tummala@Sun.COM #define IBD_MIN_RC_RX_COPY_THRESH 1500 17712163SRamaswamy.Tummala@Sun.COM #define IBD_MAX_RC_RX_COPY_THRESH 65520 17812163SRamaswamy.Tummala@Sun.COM #define IBD_MIN_RC_TX_COPY_THRESH 1500 17912163SRamaswamy.Tummala@Sun.COM #define IBD_MAX_RC_TX_COPY_THRESH 65520 18012163SRamaswamy.Tummala@Sun.COM #define IBD_MIN_RC_NUM_SRQ (IBD_MIN_RC_NUM_RWQE - 1) 18112163SRamaswamy.Tummala@Sun.COM #define IBD_MIN_RC_RX_RWQE_THRESH (IBD_MIN_RC_NUM_RWQE >> 2) 18212163SRamaswamy.Tummala@Sun.COM 18311534SKevin.Ge@Sun.COM /* 18411534SKevin.Ge@Sun.COM * Thresholds 18511534SKevin.Ge@Sun.COM * 18611534SKevin.Ge@Sun.COM * When waiting for resources (swqes or lso buffers) to become available, 18711534SKevin.Ge@Sun.COM * the first two thresholds below determine how long to wait before informing 18811534SKevin.Ge@Sun.COM * the network layer to start sending packets again. The IBD_TX_POLL_THRESH 18911534SKevin.Ge@Sun.COM * determines how low the available swqes should go before we start polling 19011534SKevin.Ge@Sun.COM * the completion queue. 19111534SKevin.Ge@Sun.COM */ 19211534SKevin.Ge@Sun.COM #define IBD_FREE_LSOS_THRESH 8 19311534SKevin.Ge@Sun.COM #define IBD_FREE_SWQES_THRESH 20 19411534SKevin.Ge@Sun.COM #define IBD_TX_POLL_THRESH 80 19511534SKevin.Ge@Sun.COM 19611534SKevin.Ge@Sun.COM #ifdef DEBUG 19711534SKevin.Ge@Sun.COM void debug_print(int l, char *fmt, ...); 19811534SKevin.Ge@Sun.COM #define DPRINT debug_print 19911534SKevin.Ge@Sun.COM #else 20011534SKevin.Ge@Sun.COM #define DPRINT 0 && 20111534SKevin.Ge@Sun.COM #endif 20211534SKevin.Ge@Sun.COM 20311534SKevin.Ge@Sun.COM /* 20411534SKevin.Ge@Sun.COM * AH and MCE active list manipulation: 20511534SKevin.Ge@Sun.COM * 20611534SKevin.Ge@Sun.COM * Multicast disable requests and MCG delete traps are two cases 20711534SKevin.Ge@Sun.COM * where the active AH entry for the mcg (if any unreferenced one exists) 20811534SKevin.Ge@Sun.COM * will be moved to the free list (to force the next Tx to the mcg to 20911534SKevin.Ge@Sun.COM * join the MCG in SendOnly mode). Port up handling will also move AHs 21011534SKevin.Ge@Sun.COM * from active to free list. 21111534SKevin.Ge@Sun.COM * 21211534SKevin.Ge@Sun.COM * In the case when some transmits are still pending on an entry 21311534SKevin.Ge@Sun.COM * for an mcg, but a multicast disable has already been issued on the 21411534SKevin.Ge@Sun.COM * mcg, there are some options to consider to preserve the join state 21511534SKevin.Ge@Sun.COM * to ensure the emitted packet is properly routed on the IBA fabric. 21611534SKevin.Ge@Sun.COM * For the AH, we can 21711534SKevin.Ge@Sun.COM * 1. take out of active list at multicast disable time. 21811534SKevin.Ge@Sun.COM * 2. take out of active list only when last pending Tx completes. 21911534SKevin.Ge@Sun.COM * For the MCE, we can 22011534SKevin.Ge@Sun.COM * 3. take out of active list at multicast disable time. 22111534SKevin.Ge@Sun.COM * 4. take out of active list only when last pending Tx completes. 22211534SKevin.Ge@Sun.COM * 5. move from active list to stale list at multicast disable time. 22311534SKevin.Ge@Sun.COM * We choose to use 2,4. We use option 4 so that if a multicast enable 22411534SKevin.Ge@Sun.COM * is tried before the pending Tx completes, the enable code finds the 22511534SKevin.Ge@Sun.COM * mce in the active list and just has to make sure it will not be reaped 22611534SKevin.Ge@Sun.COM * (ie the mcg leave done) when the pending Tx does complete. Alternatively, 22711534SKevin.Ge@Sun.COM * a stale list (#5) that would be checked in the enable code would need 22811534SKevin.Ge@Sun.COM * to be implemented. Option 2 is used, because otherwise, a Tx attempt 22911534SKevin.Ge@Sun.COM * after the multicast disable would try to put an AH in the active list, 23011534SKevin.Ge@Sun.COM * and associate the mce it finds in the active list to this new AH, 23111534SKevin.Ge@Sun.COM * whereas the mce is already associated with the previous AH (taken off 23211534SKevin.Ge@Sun.COM * the active list), and will be removed once the pending Tx's complete 23311534SKevin.Ge@Sun.COM * (unless a reference count on mce's is implemented). One implication of 23411534SKevin.Ge@Sun.COM * using 2,4 is that new Tx's posted before the pending Tx's complete will 23511534SKevin.Ge@Sun.COM * grab new references on the AH, further delaying the leave. 23611534SKevin.Ge@Sun.COM * 23711534SKevin.Ge@Sun.COM * In the case of mcg delete (or create) trap when the port is sendonly 23811534SKevin.Ge@Sun.COM * joined, the AH and MCE handling is different: the AH and MCE has to be 23911534SKevin.Ge@Sun.COM * immediately taken off the active lists (forcing a join and path lookup 24011534SKevin.Ge@Sun.COM * at the next Tx is the only guaranteed means of ensuring a proper Tx 24111534SKevin.Ge@Sun.COM * to an mcg as it is repeatedly created and deleted and goes thru 24211534SKevin.Ge@Sun.COM * reincarnations). 24311534SKevin.Ge@Sun.COM * 24411534SKevin.Ge@Sun.COM * When a port is already sendonly joined, and a multicast enable is 24511534SKevin.Ge@Sun.COM * attempted, the same mce structure is promoted; this ensures only a 24611534SKevin.Ge@Sun.COM * single mce on the active list tracks the most powerful join state. 24711534SKevin.Ge@Sun.COM * 24811534SKevin.Ge@Sun.COM * In the case of port up event handling, the MCE for sendonly membership 24911534SKevin.Ge@Sun.COM * is freed up, and the ACE is put into the free list as soon as possible 25011534SKevin.Ge@Sun.COM * (depending on whether posted Tx's have completed). For fullmembership 25111534SKevin.Ge@Sun.COM * MCE's though, the ACE is similarly handled; but the MCE is kept around 25211534SKevin.Ge@Sun.COM * (a re-JOIN is attempted) only if the DLPI leave has not already been 25311534SKevin.Ge@Sun.COM * done; else the mce is deconstructed (mc_fullreap case). 25411534SKevin.Ge@Sun.COM * 25511534SKevin.Ge@Sun.COM * MCG creation and deletion trap handling: 25611534SKevin.Ge@Sun.COM * 25711534SKevin.Ge@Sun.COM * These traps are unreliable (meaning sometimes the trap might never 25811534SKevin.Ge@Sun.COM * be delivered to the subscribed nodes) and may arrive out-of-order 25911534SKevin.Ge@Sun.COM * since they use UD transport. An alternative to relying on these 26011534SKevin.Ge@Sun.COM * unreliable traps is to poll for mcg presence every so often, but 26111534SKevin.Ge@Sun.COM * instead of doing that, we try to be as conservative as possible 26211534SKevin.Ge@Sun.COM * while handling the traps, and hope that the traps do arrive at 26311534SKevin.Ge@Sun.COM * the subscribed nodes soon. Note that if a node is fullmember 26411534SKevin.Ge@Sun.COM * joined to an mcg, it can not possibly receive a mcg create/delete 26511534SKevin.Ge@Sun.COM * trap for that mcg (by fullmember definition); if it does, it is 26611534SKevin.Ge@Sun.COM * an old trap from a previous incarnation of the mcg. 26711534SKevin.Ge@Sun.COM * 26811534SKevin.Ge@Sun.COM * Whenever a trap is received, the driver cleans up its sendonly 26911534SKevin.Ge@Sun.COM * membership to the group; we choose to do a sendonly leave even 27011534SKevin.Ge@Sun.COM * on a creation trap to handle the case of a prior deletion of the mcg 27111534SKevin.Ge@Sun.COM * having gone unnoticed. Consider an example scenario: 27211534SKevin.Ge@Sun.COM * T1: MCG M is deleted, and fires off deletion trap D1. 27311534SKevin.Ge@Sun.COM * T2: MCG M is recreated, fires off creation trap C1, which is lost. 27411534SKevin.Ge@Sun.COM * T3: Node N tries to transmit to M, joining in sendonly mode. 27511534SKevin.Ge@Sun.COM * T4: MCG M is deleted, and fires off deletion trap D2. 27611534SKevin.Ge@Sun.COM * T5: N receives a deletion trap, but can not distinguish D1 from D2. 27711534SKevin.Ge@Sun.COM * If the trap is D2, then a LEAVE is not required, since the mcg 27811534SKevin.Ge@Sun.COM * is already deleted; but if it is D1, a LEAVE is required. A safe 27911534SKevin.Ge@Sun.COM * approach is to always LEAVE, but the SM may be confused if it 28011534SKevin.Ge@Sun.COM * receives a LEAVE without a prior JOIN. 28111534SKevin.Ge@Sun.COM * 28211534SKevin.Ge@Sun.COM * Management of the non-membership to an mcg is similar to the above, 28311534SKevin.Ge@Sun.COM * except that if the interface is in promiscuous mode, it is required 28411534SKevin.Ge@Sun.COM * to attempt to re-join the mcg after receiving a trap. Unfortunately, 28511534SKevin.Ge@Sun.COM * if the re-join attempt fails (in which case a warning message needs 28611534SKevin.Ge@Sun.COM * to be printed), it is not clear whether it failed due to the mcg not 28711534SKevin.Ge@Sun.COM * existing, or some fabric/hca issues, due to the delayed nature of 28811534SKevin.Ge@Sun.COM * trap delivery. Querying the SA to establish presence/absence of the 28911534SKevin.Ge@Sun.COM * mcg is also racy at best. Thus, the driver just prints a warning 29011534SKevin.Ge@Sun.COM * message when it can not rejoin after receiving a create trap, although 29111534SKevin.Ge@Sun.COM * this might be (on rare occasions) a mis-warning if the create trap is 29211534SKevin.Ge@Sun.COM * received after the mcg was deleted. 29311534SKevin.Ge@Sun.COM */ 29411534SKevin.Ge@Sun.COM 29511534SKevin.Ge@Sun.COM /* 29611534SKevin.Ge@Sun.COM * Implementation of atomic "recycle" bits and reference count 29711534SKevin.Ge@Sun.COM * on address handles. This utilizes the fact that max reference 29811534SKevin.Ge@Sun.COM * count on any handle is limited by number of send wqes, thus 29911534SKevin.Ge@Sun.COM * high bits in the ac_ref field can be used as the recycle bits, 30011534SKevin.Ge@Sun.COM * and only the low bits hold the number of pending Tx requests. 30111534SKevin.Ge@Sun.COM * This atomic AH reference counting allows the Tx completion 30211534SKevin.Ge@Sun.COM * handler not to acquire the id_ac_mutex to process every completion, 30311534SKevin.Ge@Sun.COM * thus reducing lock contention problems between completion and 30411534SKevin.Ge@Sun.COM * the Tx path. 30511534SKevin.Ge@Sun.COM */ 30611534SKevin.Ge@Sun.COM #define CYCLEVAL 0x80000 30711534SKevin.Ge@Sun.COM #define CLEAR_REFCYCLE(ace) (ace)->ac_ref = 0 30811534SKevin.Ge@Sun.COM #define CYCLE_SET(ace) (((ace)->ac_ref & CYCLEVAL) == CYCLEVAL) 30911534SKevin.Ge@Sun.COM #define GET_REF(ace) ((ace)->ac_ref) 31011534SKevin.Ge@Sun.COM #define GET_REF_CYCLE(ace) ( \ 31111534SKevin.Ge@Sun.COM /* \ 31211534SKevin.Ge@Sun.COM * Make sure "cycle" bit is set. \ 31311534SKevin.Ge@Sun.COM */ \ 31411534SKevin.Ge@Sun.COM ASSERT(CYCLE_SET(ace)), \ 31511534SKevin.Ge@Sun.COM ((ace)->ac_ref & ~(CYCLEVAL)) \ 31611534SKevin.Ge@Sun.COM ) 31711534SKevin.Ge@Sun.COM #define INC_REF(ace, num) { \ 31811534SKevin.Ge@Sun.COM atomic_add_32(&(ace)->ac_ref, num); \ 31911534SKevin.Ge@Sun.COM } 32011534SKevin.Ge@Sun.COM #define SET_CYCLE_IF_REF(ace) ( \ 32111534SKevin.Ge@Sun.COM CYCLE_SET(ace) ? B_TRUE : \ 32211534SKevin.Ge@Sun.COM atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) == \ 32311534SKevin.Ge@Sun.COM CYCLEVAL ? \ 32411534SKevin.Ge@Sun.COM /* \ 32511534SKevin.Ge@Sun.COM * Clear the "cycle" bit we just set; \ 32611534SKevin.Ge@Sun.COM * ref count known to be 0 from above. \ 32711534SKevin.Ge@Sun.COM */ \ 32811534SKevin.Ge@Sun.COM CLEAR_REFCYCLE(ace), B_FALSE : \ 32911534SKevin.Ge@Sun.COM /* \ 33011534SKevin.Ge@Sun.COM * We set "cycle" bit; let caller know. \ 33111534SKevin.Ge@Sun.COM */ \ 33211534SKevin.Ge@Sun.COM B_TRUE \ 33311534SKevin.Ge@Sun.COM ) 33411534SKevin.Ge@Sun.COM #define DEC_REF_DO_CYCLE(ace) ( \ 33511534SKevin.Ge@Sun.COM atomic_dec_32_nv(&ace->ac_ref) == CYCLEVAL ? \ 33611534SKevin.Ge@Sun.COM /* \ 33711534SKevin.Ge@Sun.COM * Ref count known to be 0 from above. \ 33811534SKevin.Ge@Sun.COM */ \ 33911534SKevin.Ge@Sun.COM B_TRUE : \ 34011534SKevin.Ge@Sun.COM B_FALSE \ 34111534SKevin.Ge@Sun.COM ) 34211534SKevin.Ge@Sun.COM 34311534SKevin.Ge@Sun.COM /* 34411534SKevin.Ge@Sun.COM * Address handle entries maintained by the driver are kept in the 34511534SKevin.Ge@Sun.COM * free and active lists. Each entry starts out in the free list; 34611534SKevin.Ge@Sun.COM * it migrates to the active list when primed using ibt_get_paths() 34711534SKevin.Ge@Sun.COM * and ibt_modify_ud_dest() for transmission to a specific destination. 34811534SKevin.Ge@Sun.COM * In the active list, the entry has a reference count indicating the 34911534SKevin.Ge@Sun.COM * number of ongoing/uncompleted transmits that reference it. The 35011534SKevin.Ge@Sun.COM * entry is left in the active list even after the reference count 35111534SKevin.Ge@Sun.COM * goes to 0, since successive transmits can find it there and do 35211534SKevin.Ge@Sun.COM * not need to set up another entry (ie the path information is 35311534SKevin.Ge@Sun.COM * cached using the active list). Entries on the active list are 35411534SKevin.Ge@Sun.COM * also hashed using the destination link address as a key for faster 35511534SKevin.Ge@Sun.COM * lookups during transmits. 35611534SKevin.Ge@Sun.COM * 35711534SKevin.Ge@Sun.COM * For any destination address (unicast or multicast, whatever the 35811534SKevin.Ge@Sun.COM * join states), there will be at most one entry in the active list. 35911534SKevin.Ge@Sun.COM * Entries with a 0 reference count on the active list can be reused 36011534SKevin.Ge@Sun.COM * for a transmit to a new destination, if the free list is empty. 36111534SKevin.Ge@Sun.COM * 36211534SKevin.Ge@Sun.COM * The AH free list insertion/deletion is protected with the id_ac_mutex, 36311534SKevin.Ge@Sun.COM * since the async thread and Tx callback handlers insert/delete. The 36411534SKevin.Ge@Sun.COM * active list does not need a lock (all operations are done by the 36511534SKevin.Ge@Sun.COM * async thread) but updates to the reference count are atomically 36611534SKevin.Ge@Sun.COM * done (increments done by Tx path, decrements by the Tx callback handler). 36711534SKevin.Ge@Sun.COM */ 36811534SKevin.Ge@Sun.COM #define IBD_ACACHE_INSERT_FREE(state, ce) \ 36911534SKevin.Ge@Sun.COM list_insert_head(&state->id_ah_free, ce) 37011534SKevin.Ge@Sun.COM #define IBD_ACACHE_GET_FREE(state) \ 37111534SKevin.Ge@Sun.COM list_get_head(&state->id_ah_free) 37211534SKevin.Ge@Sun.COM #define IBD_ACACHE_INSERT_ACTIVE(state, ce) { \ 37311534SKevin.Ge@Sun.COM int _ret_; \ 37411534SKevin.Ge@Sun.COM list_insert_head(&state->id_ah_active, ce); \ 37511534SKevin.Ge@Sun.COM _ret_ = mod_hash_insert(state->id_ah_active_hash, \ 37611534SKevin.Ge@Sun.COM (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 37711534SKevin.Ge@Sun.COM ASSERT(_ret_ == 0); \ 37811534SKevin.Ge@Sun.COM state->id_ac_hot_ace = ce; \ 37911534SKevin.Ge@Sun.COM } 38011534SKevin.Ge@Sun.COM #define IBD_ACACHE_PULLOUT_ACTIVE(state, ce) { \ 38111534SKevin.Ge@Sun.COM list_remove(&state->id_ah_active, ce); \ 38211534SKevin.Ge@Sun.COM if (state->id_ac_hot_ace == ce) \ 38311534SKevin.Ge@Sun.COM state->id_ac_hot_ace = NULL; \ 38411534SKevin.Ge@Sun.COM (void) mod_hash_remove(state->id_ah_active_hash, \ 38511534SKevin.Ge@Sun.COM (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 38611534SKevin.Ge@Sun.COM } 38711534SKevin.Ge@Sun.COM #define IBD_ACACHE_GET_ACTIVE(state) \ 38811534SKevin.Ge@Sun.COM list_get_head(&state->id_ah_active) 38911534SKevin.Ge@Sun.COM 39011534SKevin.Ge@Sun.COM /* 39111534SKevin.Ge@Sun.COM * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at 39211534SKevin.Ge@Sun.COM * front of optional src/tgt link layer address. Right now Solaris inserts 39311534SKevin.Ge@Sun.COM * padding by default at the end. The routine which is doing is nce_xmit() 39411534SKevin.Ge@Sun.COM * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when 39511534SKevin.Ge@Sun.COM * the packet comes down from IP layer to the IBD driver, it is in the 39611534SKevin.Ge@Sun.COM * following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T] 39711534SKevin.Ge@Sun.COM * This size is 2 bytes followed by [22 bytes of ipoib_machdr]. As a result 39811534SKevin.Ge@Sun.COM * machdr is not 4 byte aligned and had 2 bytes of padding at the end. 39911534SKevin.Ge@Sun.COM * 40011534SKevin.Ge@Sun.COM * The send routine at IBD driver changes this packet as follows: 40111534SKevin.Ge@Sun.COM * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding] 40211534SKevin.Ge@Sun.COM * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte 40311534SKevin.Ge@Sun.COM * aligned. 40411534SKevin.Ge@Sun.COM * 40511534SKevin.Ge@Sun.COM * At the receiving side again ibd_process_rx takes the above packet and 40611534SKevin.Ge@Sun.COM * removes the two bytes of front padding and inserts it at the end. This 40711534SKevin.Ge@Sun.COM * is since the IP layer does not understand padding at the front. 40811534SKevin.Ge@Sun.COM */ 40911534SKevin.Ge@Sun.COM #define IBD_PAD_NSNA(ip6h, len, type) { \ 41011534SKevin.Ge@Sun.COM uchar_t *nd_lla_ptr; \ 41111534SKevin.Ge@Sun.COM icmp6_t *icmp6; \ 41211534SKevin.Ge@Sun.COM nd_opt_hdr_t *opt; \ 41311534SKevin.Ge@Sun.COM int i; \ 41411534SKevin.Ge@Sun.COM \ 41511534SKevin.Ge@Sun.COM icmp6 = (icmp6_t *)&ip6h[1]; \ 41611534SKevin.Ge@Sun.COM len -= sizeof (nd_neighbor_advert_t); \ 41711534SKevin.Ge@Sun.COM if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) || \ 41811534SKevin.Ge@Sun.COM (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) && \ 41911534SKevin.Ge@Sun.COM (len != 0)) { \ 42011534SKevin.Ge@Sun.COM opt = (nd_opt_hdr_t *)((uint8_t *)ip6h \ 42111534SKevin.Ge@Sun.COM + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \ 42211534SKevin.Ge@Sun.COM ASSERT(opt != NULL); \ 42311534SKevin.Ge@Sun.COM nd_lla_ptr = (uchar_t *)&opt[1]; \ 42411534SKevin.Ge@Sun.COM if (type == IBD_SEND) { \ 42511534SKevin.Ge@Sun.COM for (i = IPOIB_ADDRL; i > 0; i--) \ 42611534SKevin.Ge@Sun.COM *(nd_lla_ptr + i + 1) = \ 42711534SKevin.Ge@Sun.COM *(nd_lla_ptr + i - 1); \ 42811534SKevin.Ge@Sun.COM } else { \ 42911534SKevin.Ge@Sun.COM for (i = 0; i < IPOIB_ADDRL; i++) \ 43011534SKevin.Ge@Sun.COM *(nd_lla_ptr + i) = \ 43111534SKevin.Ge@Sun.COM *(nd_lla_ptr + i + 2); \ 43211534SKevin.Ge@Sun.COM } \ 43311534SKevin.Ge@Sun.COM *(nd_lla_ptr + i) = 0; \ 43411534SKevin.Ge@Sun.COM *(nd_lla_ptr + i + 1) = 0; \ 43511534SKevin.Ge@Sun.COM } \ 43611534SKevin.Ge@Sun.COM } 43711534SKevin.Ge@Sun.COM 43811534SKevin.Ge@Sun.COM 4390Sstevel@tonic-gate /* 4400Sstevel@tonic-gate * IETF defined IPoIB encapsulation header, with 2b of ethertype 4410Sstevel@tonic-gate * followed by 2 reserved bytes. This is at the start of the 4420Sstevel@tonic-gate * datagram sent to and received over the wire by the driver. 4430Sstevel@tonic-gate */ 4440Sstevel@tonic-gate typedef struct ipoib_header { 4450Sstevel@tonic-gate ushort_t ipoib_type; 4460Sstevel@tonic-gate ushort_t ipoib_mbz; 4470Sstevel@tonic-gate } ipoib_hdr_t; 4480Sstevel@tonic-gate 4490Sstevel@tonic-gate #define IPOIB_HDRSIZE sizeof (struct ipoib_header) 4500Sstevel@tonic-gate 4510Sstevel@tonic-gate /* 4520Sstevel@tonic-gate * IETF defined IPoIB link address; IBA QPN, followed by GID, 4530Sstevel@tonic-gate * which has a prefix and suffix, as reported via ARP. 4540Sstevel@tonic-gate */ 4550Sstevel@tonic-gate typedef struct ipoib_mac { 4560Sstevel@tonic-gate uint32_t ipoib_qpn; 4570Sstevel@tonic-gate uint32_t ipoib_gidpref[2]; 4580Sstevel@tonic-gate uint32_t ipoib_gidsuff[2]; 4590Sstevel@tonic-gate } ipoib_mac_t; 4600Sstevel@tonic-gate 4610Sstevel@tonic-gate #define IPOIB_ADDRL sizeof (struct ipoib_mac) 4620Sstevel@tonic-gate 4630Sstevel@tonic-gate /* 4640Sstevel@tonic-gate * Pseudo header prepended to datagram in DLIOCRAW transmit path 4650Sstevel@tonic-gate * and when GLD hands the datagram to the gldm_send entry point. 4660Sstevel@tonic-gate */ 4670Sstevel@tonic-gate typedef struct ipoib_ptxhdr { 4680Sstevel@tonic-gate ipoib_mac_t ipoib_dest; 4690Sstevel@tonic-gate ipoib_hdr_t ipoib_rhdr; 4700Sstevel@tonic-gate } ipoib_ptxhdr_t; 4710Sstevel@tonic-gate 4720Sstevel@tonic-gate #define IPOIBDLSAP(p, offset) ((ipoib_ptxhdr_t *)((caddr_t)(p)+offset)) 4730Sstevel@tonic-gate 4740Sstevel@tonic-gate /* 4750Sstevel@tonic-gate * The pseudo-GRH structure that sits before the data in the 4760Sstevel@tonic-gate * receive buffer, and is overlaid on top of the real GRH. 4770Sstevel@tonic-gate * The driver sets the ipoib_vertcflow to 0 if the pseudo-GRH 4780Sstevel@tonic-gate * does not hold valid information. If it is indicated valid, 4790Sstevel@tonic-gate * the driver must additionally provide the sender's qpn in 4800Sstevel@tonic-gate * network byte order in ipoib_sqpn, and not touch the 4810Sstevel@tonic-gate * remaining parts which were DMA'ed in by the IBA hardware. 4820Sstevel@tonic-gate */ 4830Sstevel@tonic-gate typedef struct ipoib_pgrh { 4840Sstevel@tonic-gate uint32_t ipoib_vertcflow; 4850Sstevel@tonic-gate uint32_t ipoib_sqpn; 4860Sstevel@tonic-gate uint32_t ipoib_sgid_pref[2]; 4870Sstevel@tonic-gate uint32_t ipoib_sgid_suff[2]; 4880Sstevel@tonic-gate uint32_t ipoib_dgid_pref[2]; 4890Sstevel@tonic-gate uint32_t ipoib_dgid_suff[2]; 4900Sstevel@tonic-gate } ipoib_pgrh_t; 4910Sstevel@tonic-gate 4920Sstevel@tonic-gate /* 4930Sstevel@tonic-gate * The GRH is also dma'ed into recv buffers, thus space needs 4940Sstevel@tonic-gate * to be allocated for them. 4950Sstevel@tonic-gate */ 4960Sstevel@tonic-gate #define IPOIB_GRH_SIZE sizeof (ipoib_pgrh_t) 4970Sstevel@tonic-gate 49811534SKevin.Ge@Sun.COM /* support the RC (reliable connected) mode */ 49911534SKevin.Ge@Sun.COM #define IBD_MAC_ADDR_RC 0x80000000 50011534SKevin.Ge@Sun.COM /* support the UC (unreliable connected) mode */ 50111534SKevin.Ge@Sun.COM #define IBD_MAC_ADDR_UC 0x40000000 50211534SKevin.Ge@Sun.COM 50311534SKevin.Ge@Sun.COM #define IBD_RC_SERVICE_ID 0x100000000000000ULL 50411534SKevin.Ge@Sun.COM 50511534SKevin.Ge@Sun.COM /* 50611534SKevin.Ge@Sun.COM * Legacy OFED had used a wrong service ID (one additional zero digit) for 50711534SKevin.Ge@Sun.COM * many years. To interop with legacy OFED, we support this wrong service ID 50811534SKevin.Ge@Sun.COM * here. 50911534SKevin.Ge@Sun.COM */ 51011534SKevin.Ge@Sun.COM #define IBD_RC_SERVICE_ID_OFED_INTEROP 0x1000000000000000ULL 51111534SKevin.Ge@Sun.COM 51211534SKevin.Ge@Sun.COM #define IBD_RC_MIN_CQ_SIZE 0x7f 51311534SKevin.Ge@Sun.COM 51411534SKevin.Ge@Sun.COM /* Number of ibt_wc_t provided for each RC channel */ 51511534SKevin.Ge@Sun.COM #define IBD_RC_MAX_CQ_WC 0x3f 51611534SKevin.Ge@Sun.COM 5170Sstevel@tonic-gate #if defined(_KERNEL) && !defined(_BOOT) 5180Sstevel@tonic-gate 5190Sstevel@tonic-gate #include <sys/ib/ibtl/ibti.h> 5200Sstevel@tonic-gate #include <sys/ib/ib_pkt_hdrs.h> 5210Sstevel@tonic-gate #include <sys/list.h> 5228275SEric Cheng #include <sys/mac_provider.h> 5235766Sgg161487 #include <sys/mac_ib.h> 5240Sstevel@tonic-gate #include <sys/modhash.h> 5250Sstevel@tonic-gate 52611534SKevin.Ge@Sun.COM /* State of a reliable connected channel (ibd_rc_chan_t->chan_state) */ 52711534SKevin.Ge@Sun.COM typedef enum { 52811534SKevin.Ge@Sun.COM IBD_RC_STATE_INIT = 0, 52911534SKevin.Ge@Sun.COM 53011534SKevin.Ge@Sun.COM /* Active side */ 53111534SKevin.Ge@Sun.COM IBD_RC_STATE_ACT_REP_RECV, /* reply received */ 53211534SKevin.Ge@Sun.COM IBD_RC_STATE_ACT_ESTAB, /* established, ready to send */ 53311534SKevin.Ge@Sun.COM IBD_RC_STATE_ACT_REJECT, /* rejected */ 53411534SKevin.Ge@Sun.COM /* Someone else is closing this channel, please don't re-close it */ 53511534SKevin.Ge@Sun.COM IBD_RC_STATE_ACT_CLOSING, 53611534SKevin.Ge@Sun.COM IBD_RC_STATE_ACT_CLOSED, 53711534SKevin.Ge@Sun.COM IBD_RC_STATE_ACT_ERROR, 53811534SKevin.Ge@Sun.COM 53911534SKevin.Ge@Sun.COM /* Passive side */ 54011534SKevin.Ge@Sun.COM IBD_RC_STATE_PAS_REQ_RECV, /* request received */ 54111534SKevin.Ge@Sun.COM IBD_RC_STATE_PAS_ESTAB, /* established, ready to receive */ 54211534SKevin.Ge@Sun.COM IBD_RC_STATE_PAS_REJECT, /* rejected */ 54311534SKevin.Ge@Sun.COM 54411534SKevin.Ge@Sun.COM IBD_RC_STATE_PAS_CLOSED 54511534SKevin.Ge@Sun.COM } ibd_rc_chan_state_t; 54611534SKevin.Ge@Sun.COM 5470Sstevel@tonic-gate /* 5480Sstevel@tonic-gate * Structure to encapsulate various types of async requests. 5490Sstevel@tonic-gate */ 5500Sstevel@tonic-gate typedef struct ibd_acache_rq { 5510Sstevel@tonic-gate struct list_node rq_list; /* list of pending work */ 5520Sstevel@tonic-gate int rq_op; /* what operation */ 5530Sstevel@tonic-gate ipoib_mac_t rq_mac; 5540Sstevel@tonic-gate ib_gid_t rq_gid; 5550Sstevel@tonic-gate void *rq_ptr; 55611534SKevin.Ge@Sun.COM void *rq_ptr2; 5570Sstevel@tonic-gate } ibd_req_t; 5580Sstevel@tonic-gate 5590Sstevel@tonic-gate typedef struct ibd_mcache { 5600Sstevel@tonic-gate struct list_node mc_list; /* full/non list */ 5610Sstevel@tonic-gate uint8_t mc_jstate; 5620Sstevel@tonic-gate boolean_t mc_fullreap; 5630Sstevel@tonic-gate ibt_mcg_info_t mc_info; 5640Sstevel@tonic-gate ibd_req_t mc_req; /* to queue LEAVE req */ 5650Sstevel@tonic-gate } ibd_mce_t; 5660Sstevel@tonic-gate 5670Sstevel@tonic-gate typedef struct ibd_acache_s { 5680Sstevel@tonic-gate struct list_node ac_list; /* free/active list */ 5690Sstevel@tonic-gate ibt_ud_dest_hdl_t ac_dest; 5700Sstevel@tonic-gate ipoib_mac_t ac_mac; 5710Sstevel@tonic-gate uint32_t ac_ref; 5720Sstevel@tonic-gate ibd_mce_t *ac_mce; /* for MCG AHs */ 57311534SKevin.Ge@Sun.COM 57411534SKevin.Ge@Sun.COM /* For Reliable Connected mode */ 57511534SKevin.Ge@Sun.COM struct ibd_rc_chan_s *ac_chan; 57611534SKevin.Ge@Sun.COM /* protect tx_too_big_ongoing */ 57711534SKevin.Ge@Sun.COM kmutex_t tx_too_big_mutex; 57811534SKevin.Ge@Sun.COM /* Deal with too big packet */ 57911534SKevin.Ge@Sun.COM boolean_t tx_too_big_ongoing; 5800Sstevel@tonic-gate } ibd_ace_t; 5810Sstevel@tonic-gate 5828917SVenkatakrishnan.Rajagopalan@Sun.COM #define IBD_MAX_SQSEG 59 5838917SVenkatakrishnan.Rajagopalan@Sun.COM #define IBD_MAX_RQSEG 1 5848917SVenkatakrishnan.Rajagopalan@Sun.COM 5858917SVenkatakrishnan.Rajagopalan@Sun.COM typedef enum { 5868917SVenkatakrishnan.Rajagopalan@Sun.COM IBD_WQE_SEND, 5878917SVenkatakrishnan.Rajagopalan@Sun.COM IBD_WQE_RECV 5888917SVenkatakrishnan.Rajagopalan@Sun.COM } ibd_wqe_type_t; 5898917SVenkatakrishnan.Rajagopalan@Sun.COM 5908917SVenkatakrishnan.Rajagopalan@Sun.COM typedef enum { 5918917SVenkatakrishnan.Rajagopalan@Sun.COM IBD_WQE_TXBUF = 1, 5928917SVenkatakrishnan.Rajagopalan@Sun.COM IBD_WQE_LSOBUF = 2, 59311534SKevin.Ge@Sun.COM IBD_WQE_MAPPED = 3, 59411534SKevin.Ge@Sun.COM IBD_WQE_RC_COPYBUF = 4 5958917SVenkatakrishnan.Rajagopalan@Sun.COM } ibd_wqe_buftype_t; 5968917SVenkatakrishnan.Rajagopalan@Sun.COM 59711534SKevin.Ge@Sun.COM #ifdef DEBUG 59811534SKevin.Ge@Sun.COM typedef struct ibd_rc_stat_s { 59911534SKevin.Ge@Sun.COM kstat_named_t rc_rcv_trans_byte; 60011534SKevin.Ge@Sun.COM kstat_named_t rc_rcv_trans_pkt; 60111534SKevin.Ge@Sun.COM kstat_named_t rc_rcv_copy_byte; 60211534SKevin.Ge@Sun.COM kstat_named_t rc_rcv_copy_pkt; 60311534SKevin.Ge@Sun.COM kstat_named_t rc_rcv_alloc_fail; 60411534SKevin.Ge@Sun.COM 60511534SKevin.Ge@Sun.COM kstat_named_t rc_rcq_err; /* fail in rcq handler */ 60611534SKevin.Ge@Sun.COM 60711534SKevin.Ge@Sun.COM kstat_named_t rc_rwqe_short; /* short rwqe */ 60811534SKevin.Ge@Sun.COM 60911534SKevin.Ge@Sun.COM kstat_named_t rc_xmt_bytes; 61012163SRamaswamy.Tummala@Sun.COM /* pkt size <= state->id_rc_tx_copy_thresh */ 61111534SKevin.Ge@Sun.COM kstat_named_t rc_xmt_small_pkt; 61211534SKevin.Ge@Sun.COM kstat_named_t rc_xmt_fragmented_pkt; 61311534SKevin.Ge@Sun.COM /* fail in ibt_map_mem_iov() */ 61411534SKevin.Ge@Sun.COM kstat_named_t rc_xmt_map_fail_pkt; 61511534SKevin.Ge@Sun.COM /* succ in ibt_map_mem_iov() */ 61611534SKevin.Ge@Sun.COM kstat_named_t rc_xmt_map_succ_pkt; 61711534SKevin.Ge@Sun.COM 61811534SKevin.Ge@Sun.COM kstat_named_t rc_ace_not_found; /* ace not found */ 61911534SKevin.Ge@Sun.COM /* no swqe even after recycle */ 62011534SKevin.Ge@Sun.COM kstat_named_t rc_scq_no_swqe; 62111534SKevin.Ge@Sun.COM /* no tx large buf even after recycle */ 62211534SKevin.Ge@Sun.COM kstat_named_t rc_scq_no_largebuf; 62311534SKevin.Ge@Sun.COM 62411534SKevin.Ge@Sun.COM /* short swqe in ibd_send() */ 62511534SKevin.Ge@Sun.COM kstat_named_t rc_swqe_short; 62611534SKevin.Ge@Sun.COM /* call mac_tx_update() when there is enough swqe */ 62711534SKevin.Ge@Sun.COM kstat_named_t rc_swqe_mac_update; 62811534SKevin.Ge@Sun.COM /* short large buf in ibd_send() */ 62911534SKevin.Ge@Sun.COM kstat_named_t rc_xmt_buf_short; 63011534SKevin.Ge@Sun.COM /* call mac_tx_update() when there is enough Tx large buffers */ 63111534SKevin.Ge@Sun.COM kstat_named_t rc_xmt_buf_mac_update; 63211534SKevin.Ge@Sun.COM 63311534SKevin.Ge@Sun.COM kstat_named_t rc_conn_succ; /* # of success connect */ 63411534SKevin.Ge@Sun.COM kstat_named_t rc_conn_fail; /* # of fail connect */ 63511534SKevin.Ge@Sun.COM /* ace->ac_chan == NULL for unicast packet */ 63611534SKevin.Ge@Sun.COM kstat_named_t rc_null_conn; 63711534SKevin.Ge@Sun.COM /* not in active established state */ 63811534SKevin.Ge@Sun.COM kstat_named_t rc_no_estab_conn; 63911534SKevin.Ge@Sun.COM 64011534SKevin.Ge@Sun.COM kstat_named_t rc_act_close; /* call ibd_rc_act_close() */ 64111534SKevin.Ge@Sun.COM kstat_named_t rc_pas_close; /* call ibd_rc_pas_close() */ 64211534SKevin.Ge@Sun.COM kstat_named_t rc_delay_ace_recycle; 64311534SKevin.Ge@Sun.COM kstat_named_t rc_act_close_simultaneous; 64411534SKevin.Ge@Sun.COM 64511534SKevin.Ge@Sun.COM kstat_named_t rc_reset_cnt; /* # of Reset RC channel */ 646*13030SKevin.Ge@Sun.COM kstat_named_t rc_timeout_act; 647*13030SKevin.Ge@Sun.COM kstat_named_t rc_timeout_pas; 64811534SKevin.Ge@Sun.COM } ibd_rc_stat_t; 64911534SKevin.Ge@Sun.COM #endif 65011534SKevin.Ge@Sun.COM 65111534SKevin.Ge@Sun.COM typedef struct ibd_rc_chan_list_s { 65211534SKevin.Ge@Sun.COM /* This mutex protects chan_list and ibd_rc_chan_t.next */ 65311534SKevin.Ge@Sun.COM kmutex_t chan_list_mutex; 65411534SKevin.Ge@Sun.COM struct ibd_rc_chan_s *chan_list; 65511534SKevin.Ge@Sun.COM } ibd_rc_chan_list_t; 65611534SKevin.Ge@Sun.COM 65711534SKevin.Ge@Sun.COM typedef struct ibd_rc_tx_largebuf_s { 65811534SKevin.Ge@Sun.COM struct ibd_rc_tx_largebuf_s *lb_next; 65911534SKevin.Ge@Sun.COM uint8_t *lb_buf; 66011534SKevin.Ge@Sun.COM } ibd_rc_tx_largebuf_t; 66111534SKevin.Ge@Sun.COM 6628917SVenkatakrishnan.Rajagopalan@Sun.COM /* 6638917SVenkatakrishnan.Rajagopalan@Sun.COM * Pre-registered copybuf used for send and receive 6648917SVenkatakrishnan.Rajagopalan@Sun.COM */ 6658917SVenkatakrishnan.Rajagopalan@Sun.COM typedef struct ibd_copybuf_s { 6668917SVenkatakrishnan.Rajagopalan@Sun.COM ibt_wr_ds_t ic_sgl; 6678917SVenkatakrishnan.Rajagopalan@Sun.COM uint8_t *ic_bufaddr; 6688917SVenkatakrishnan.Rajagopalan@Sun.COM } ibd_copybuf_t; 6690Sstevel@tonic-gate 6700Sstevel@tonic-gate typedef struct ibd_wqe_s { 6710Sstevel@tonic-gate struct ibd_wqe_s *w_next; 6720Sstevel@tonic-gate ibd_copybuf_t w_copybuf; 6730Sstevel@tonic-gate mblk_t *im_mblk; 6740Sstevel@tonic-gate } ibd_wqe_t; 6750Sstevel@tonic-gate 6768917SVenkatakrishnan.Rajagopalan@Sun.COM /* 6778917SVenkatakrishnan.Rajagopalan@Sun.COM * Send WQE 6788917SVenkatakrishnan.Rajagopalan@Sun.COM */ 6790Sstevel@tonic-gate typedef struct ibd_swqe_s { 6800Sstevel@tonic-gate ibd_wqe_t w_ibd_swqe; 6818917SVenkatakrishnan.Rajagopalan@Sun.COM ibd_wqe_buftype_t w_buftype; 6820Sstevel@tonic-gate ibt_send_wr_t w_swr; 6830Sstevel@tonic-gate ibd_ace_t *w_ahandle; 6848917SVenkatakrishnan.Rajagopalan@Sun.COM ibt_mi_hdl_t w_mi_hdl; 6858917SVenkatakrishnan.Rajagopalan@Sun.COM ibt_wr_ds_t w_sgl[IBD_MAX_SQSEG]; 68611534SKevin.Ge@Sun.COM ibd_rc_tx_largebuf_t *w_rc_tx_largebuf; 6870Sstevel@tonic-gate } ibd_swqe_t; 6880Sstevel@tonic-gate 6890Sstevel@tonic-gate #define swqe_next w_ibd_swqe.w_next 6900Sstevel@tonic-gate #define swqe_copybuf w_ibd_swqe.w_copybuf 6910Sstevel@tonic-gate #define swqe_im_mblk w_ibd_swqe.im_mblk 6920Sstevel@tonic-gate #define SWQE_TO_WQE(swqe) (ibd_wqe_t *)&((swqe)->w_ibd_swqe) 6930Sstevel@tonic-gate #define WQE_TO_SWQE(wqe) (ibd_swqe_t *)wqe 6940Sstevel@tonic-gate 6958917SVenkatakrishnan.Rajagopalan@Sun.COM /* 6968917SVenkatakrishnan.Rajagopalan@Sun.COM * Receive WQE 6978917SVenkatakrishnan.Rajagopalan@Sun.COM */ 6980Sstevel@tonic-gate typedef struct ibd_rwqe_s { 6990Sstevel@tonic-gate ibd_wqe_t w_ibd_rwqe; 7000Sstevel@tonic-gate struct ibd_state_s *w_state; 7010Sstevel@tonic-gate ibt_recv_wr_t w_rwr; 7020Sstevel@tonic-gate frtn_t w_freemsg_cb; 70311534SKevin.Ge@Sun.COM boolean_t w_freeing_wqe; 70411534SKevin.Ge@Sun.COM struct ibd_rc_chan_s *w_chan; 7050Sstevel@tonic-gate } ibd_rwqe_t; 7060Sstevel@tonic-gate 7070Sstevel@tonic-gate #define rwqe_next w_ibd_rwqe.w_next 7080Sstevel@tonic-gate #define rwqe_copybuf w_ibd_rwqe.w_copybuf 7090Sstevel@tonic-gate #define rwqe_im_mblk w_ibd_rwqe.im_mblk 7100Sstevel@tonic-gate #define RWQE_TO_WQE(rwqe) (ibd_wqe_t *)&((rwqe)->w_ibd_rwqe) 7110Sstevel@tonic-gate #define WQE_TO_RWQE(wqe) (ibd_rwqe_t *)wqe 7120Sstevel@tonic-gate 7130Sstevel@tonic-gate typedef struct ibd_list_s { 71410852SBill.Taylor@Sun.COM kmutex_t dl_mutex; 7150Sstevel@tonic-gate ibd_wqe_t *dl_head; 7160Sstevel@tonic-gate union { 7170Sstevel@tonic-gate boolean_t pending_sends; 7180Sstevel@tonic-gate uint32_t bufs_outstanding; 7190Sstevel@tonic-gate } ustat; 7200Sstevel@tonic-gate uint32_t dl_cnt; 7210Sstevel@tonic-gate } ibd_list_t; 7220Sstevel@tonic-gate 7230Sstevel@tonic-gate #define dl_pending_sends ustat.pending_sends 7240Sstevel@tonic-gate #define dl_bufs_outstanding ustat.bufs_outstanding 7250Sstevel@tonic-gate 7260Sstevel@tonic-gate /* 7278917SVenkatakrishnan.Rajagopalan@Sun.COM * LSO buffers 7288917SVenkatakrishnan.Rajagopalan@Sun.COM * 7298917SVenkatakrishnan.Rajagopalan@Sun.COM * Under normal circumstances we should never need to use any buffer 7308917SVenkatakrishnan.Rajagopalan@Sun.COM * that's larger than MTU. Unfortunately, IB HCA has limitations 7318917SVenkatakrishnan.Rajagopalan@Sun.COM * on the length of SGL that are much smaller than those for regular 7328917SVenkatakrishnan.Rajagopalan@Sun.COM * ethernet NICs. Since the network layer doesn't care to limit the 7338917SVenkatakrishnan.Rajagopalan@Sun.COM * number of mblk fragments in any send mp chain, we end up having to 7348917SVenkatakrishnan.Rajagopalan@Sun.COM * use these larger-than-MTU sized (larger than id_tx_buf_sz actually) 7358917SVenkatakrishnan.Rajagopalan@Sun.COM * buffers occasionally. 7368917SVenkatakrishnan.Rajagopalan@Sun.COM */ 7378917SVenkatakrishnan.Rajagopalan@Sun.COM typedef struct ibd_lsobuf_s { 7388917SVenkatakrishnan.Rajagopalan@Sun.COM struct ibd_lsobuf_s *lb_next; 7398917SVenkatakrishnan.Rajagopalan@Sun.COM uint8_t *lb_buf; 7408917SVenkatakrishnan.Rajagopalan@Sun.COM int lb_isfree; 7418917SVenkatakrishnan.Rajagopalan@Sun.COM } ibd_lsobuf_t; 7428917SVenkatakrishnan.Rajagopalan@Sun.COM 7438917SVenkatakrishnan.Rajagopalan@Sun.COM typedef struct ibd_lsobkt_s { 7448917SVenkatakrishnan.Rajagopalan@Sun.COM uint8_t *bkt_mem; 7458917SVenkatakrishnan.Rajagopalan@Sun.COM ibd_lsobuf_t *bkt_bufl; 7468917SVenkatakrishnan.Rajagopalan@Sun.COM ibd_lsobuf_t *bkt_free_head; 7478917SVenkatakrishnan.Rajagopalan@Sun.COM ibt_mr_hdl_t bkt_mr_hdl; 7488917SVenkatakrishnan.Rajagopalan@Sun.COM ibt_mr_desc_t bkt_mr_desc; 7498917SVenkatakrishnan.Rajagopalan@Sun.COM uint_t bkt_nelem; 7508917SVenkatakrishnan.Rajagopalan@Sun.COM uint_t bkt_nfree; 7518917SVenkatakrishnan.Rajagopalan@Sun.COM } ibd_lsobkt_t; 7528917SVenkatakrishnan.Rajagopalan@Sun.COM 75312163SRamaswamy.Tummala@Sun.COM #define IBD_PORT_DRIVER 0x1 75412163SRamaswamy.Tummala@Sun.COM #define IBD_PARTITION_OBJ 0x2 75512163SRamaswamy.Tummala@Sun.COM 7568917SVenkatakrishnan.Rajagopalan@Sun.COM /* 75710852SBill.Taylor@Sun.COM * Posting to a single software rx post queue is contentious, 75810852SBill.Taylor@Sun.COM * so break it out to (multiple) an array of queues. 75910852SBill.Taylor@Sun.COM * 76010852SBill.Taylor@Sun.COM * Try to ensure rx_queue structs fall in different cache lines using a filler. 76110852SBill.Taylor@Sun.COM * Note: the RX_QUEUE_CACHE_LINE needs to change if the struct changes. 76210852SBill.Taylor@Sun.COM */ 76310852SBill.Taylor@Sun.COM #define RX_QUEUE_CACHE_LINE \ 76411421SBill.Taylor@Sun.COM (64 - (sizeof (kmutex_t) + sizeof (ibd_wqe_t *) + sizeof (uint_t))) 76510852SBill.Taylor@Sun.COM typedef struct ibd_rx_queue_s { 76610852SBill.Taylor@Sun.COM kmutex_t rx_post_lock; 76710852SBill.Taylor@Sun.COM ibd_wqe_t *rx_head; 76811421SBill.Taylor@Sun.COM uint_t rx_cnt; 76911421SBill.Taylor@Sun.COM uint8_t rx_pad[RX_QUEUE_CACHE_LINE]; 77010852SBill.Taylor@Sun.COM } ibd_rx_queue_t; 77110852SBill.Taylor@Sun.COM 77210852SBill.Taylor@Sun.COM /* 7730Sstevel@tonic-gate * This structure maintains information per port per HCA 7740Sstevel@tonic-gate * (per network interface). 7750Sstevel@tonic-gate */ 7760Sstevel@tonic-gate typedef struct ibd_state_s { 77712163SRamaswamy.Tummala@Sun.COM uint_t id_type; 7780Sstevel@tonic-gate dev_info_t *id_dip; 7790Sstevel@tonic-gate ibt_clnt_hdl_t id_ibt_hdl; 7800Sstevel@tonic-gate ibt_hca_hdl_t id_hca_hdl; 7810Sstevel@tonic-gate ibt_pd_hdl_t id_pd_hdl; 7825766Sgg161487 kmem_cache_t *id_req_kmc; 7830Sstevel@tonic-gate 78410852SBill.Taylor@Sun.COM ibd_list_t id_tx_rel_list; 78510852SBill.Taylor@Sun.COM 78611421SBill.Taylor@Sun.COM uint32_t id_running; 78711421SBill.Taylor@Sun.COM 7880Sstevel@tonic-gate uint32_t id_max_sqseg; 78910852SBill.Taylor@Sun.COM uint32_t id_max_sqseg_hiwm; 7900Sstevel@tonic-gate ibd_list_t id_tx_list; 7915766Sgg161487 ddi_softintr_t id_tx; 7920Sstevel@tonic-gate uint32_t id_tx_sends; 7938917SVenkatakrishnan.Rajagopalan@Sun.COM 79410852SBill.Taylor@Sun.COM kmutex_t id_txpost_lock; 79510852SBill.Taylor@Sun.COM ibd_swqe_t *id_tx_head; 79610852SBill.Taylor@Sun.COM ibd_swqe_t *id_tx_tail; 79710852SBill.Taylor@Sun.COM int id_tx_busy; 79810852SBill.Taylor@Sun.COM 79910852SBill.Taylor@Sun.COM uint_t id_tx_buf_sz; 8008917SVenkatakrishnan.Rajagopalan@Sun.COM uint8_t *id_tx_bufs; 80110852SBill.Taylor@Sun.COM ibd_swqe_t *id_tx_wqes; 8028917SVenkatakrishnan.Rajagopalan@Sun.COM ibt_mr_hdl_t id_tx_mr_hdl; 8038917SVenkatakrishnan.Rajagopalan@Sun.COM ibt_mr_desc_t id_tx_mr_desc; 8048917SVenkatakrishnan.Rajagopalan@Sun.COM 8058917SVenkatakrishnan.Rajagopalan@Sun.COM kmutex_t id_lso_lock; 8068917SVenkatakrishnan.Rajagopalan@Sun.COM ibd_lsobkt_t *id_lso; 8078917SVenkatakrishnan.Rajagopalan@Sun.COM 80810852SBill.Taylor@Sun.COM kmutex_t id_scq_poll_lock; 80910852SBill.Taylor@Sun.COM int id_scq_poll_busy; 8108917SVenkatakrishnan.Rajagopalan@Sun.COM 8110Sstevel@tonic-gate ibt_cq_hdl_t id_scq_hdl; 8120Sstevel@tonic-gate ibt_wc_t *id_txwcs; 8135766Sgg161487 uint32_t id_txwcs_size; 8140Sstevel@tonic-gate 81510852SBill.Taylor@Sun.COM int id_rx_nqueues; 81610852SBill.Taylor@Sun.COM ibd_rx_queue_t *id_rx_queues; 81711421SBill.Taylor@Sun.COM int id_rx_post_queue_index; 81811421SBill.Taylor@Sun.COM uint32_t id_rx_post_active; 8198917SVenkatakrishnan.Rajagopalan@Sun.COM 82010852SBill.Taylor@Sun.COM ibd_rwqe_t *id_rx_wqes; 82110852SBill.Taylor@Sun.COM uint8_t *id_rx_bufs; 82210852SBill.Taylor@Sun.COM ibt_mr_hdl_t id_rx_mr_hdl; 82310852SBill.Taylor@Sun.COM ibt_mr_desc_t id_rx_mr_desc; 82410852SBill.Taylor@Sun.COM uint_t id_rx_buf_sz; 82512163SRamaswamy.Tummala@Sun.COM /* 82612163SRamaswamy.Tummala@Sun.COM * id_ud_num_rwqe 82712163SRamaswamy.Tummala@Sun.COM * Number of "receive WQE" elements that will be allocated and used 82812163SRamaswamy.Tummala@Sun.COM * by ibd. This parameter is limited by the maximum channel size of 82912163SRamaswamy.Tummala@Sun.COM * the HCA. Each buffer in the receive wqe will be of MTU size. 83012163SRamaswamy.Tummala@Sun.COM */ 83112163SRamaswamy.Tummala@Sun.COM uint32_t id_ud_num_rwqe; 8320Sstevel@tonic-gate ibd_list_t id_rx_list; 8335766Sgg161487 ddi_softintr_t id_rx; 83410852SBill.Taylor@Sun.COM uint32_t id_rx_bufs_outstanding_limit; 83510852SBill.Taylor@Sun.COM uint32_t id_rx_allocb; 83610852SBill.Taylor@Sun.COM uint32_t id_rx_allocb_failed; 83710852SBill.Taylor@Sun.COM ibd_list_t id_rx_free_list; 83810852SBill.Taylor@Sun.COM 83910852SBill.Taylor@Sun.COM kmutex_t id_rcq_poll_lock; 84010852SBill.Taylor@Sun.COM int id_rcq_poll_busy; 84110852SBill.Taylor@Sun.COM uint32_t id_rxwcs_size; 8425766Sgg161487 ibt_wc_t *id_rxwcs; 84310852SBill.Taylor@Sun.COM ibt_cq_hdl_t id_rcq_hdl; 8440Sstevel@tonic-gate 8450Sstevel@tonic-gate ibt_channel_hdl_t id_chnl_hdl; 8460Sstevel@tonic-gate ib_pkey_t id_pkey; 8470Sstevel@tonic-gate uint16_t id_pkix; 8480Sstevel@tonic-gate uint8_t id_port; 8490Sstevel@tonic-gate ibt_mcg_info_t *id_mcinfo; 8500Sstevel@tonic-gate 8515766Sgg161487 mac_handle_t id_mh; 8528917SVenkatakrishnan.Rajagopalan@Sun.COM mac_resource_handle_t id_rh; 8530Sstevel@tonic-gate ib_gid_t id_sgid; 8540Sstevel@tonic-gate ib_qpn_t id_qpnum; 8550Sstevel@tonic-gate ipoib_mac_t id_macaddr; 8560Sstevel@tonic-gate ib_gid_t id_mgid; 8570Sstevel@tonic-gate ipoib_mac_t id_bcaddr; 8580Sstevel@tonic-gate 8590Sstevel@tonic-gate int id_mtu; 8600Sstevel@tonic-gate uchar_t id_scope; 8610Sstevel@tonic-gate 8620Sstevel@tonic-gate kmutex_t id_acache_req_lock; 8630Sstevel@tonic-gate kcondvar_t id_acache_req_cv; 8648917SVenkatakrishnan.Rajagopalan@Sun.COM struct list id_req_list; 8650Sstevel@tonic-gate kt_did_t id_async_thrid; 8660Sstevel@tonic-gate 8670Sstevel@tonic-gate kmutex_t id_ac_mutex; 86810852SBill.Taylor@Sun.COM ibd_ace_t *id_ac_hot_ace; 8698917SVenkatakrishnan.Rajagopalan@Sun.COM struct list id_ah_active; 8700Sstevel@tonic-gate struct list id_ah_free; 8710Sstevel@tonic-gate ipoib_mac_t id_ah_addr; 8720Sstevel@tonic-gate ibd_req_t id_ah_req; 8730Sstevel@tonic-gate char id_ah_op; 8748917SVenkatakrishnan.Rajagopalan@Sun.COM uint64_t id_ah_error; 8750Sstevel@tonic-gate ibd_ace_t *id_ac_list; 8768917SVenkatakrishnan.Rajagopalan@Sun.COM mod_hash_t *id_ah_active_hash; 8770Sstevel@tonic-gate 8780Sstevel@tonic-gate kmutex_t id_mc_mutex; 8790Sstevel@tonic-gate struct list id_mc_full; 8800Sstevel@tonic-gate struct list id_mc_non; 8810Sstevel@tonic-gate 8820Sstevel@tonic-gate kmutex_t id_trap_lock; 8830Sstevel@tonic-gate kcondvar_t id_trap_cv; 8840Sstevel@tonic-gate boolean_t id_trap_stop; 8850Sstevel@tonic-gate uint32_t id_trap_inprog; 8860Sstevel@tonic-gate 8875766Sgg161487 char id_prom_op; 8880Sstevel@tonic-gate 8890Sstevel@tonic-gate kmutex_t id_sched_lock; 8908917SVenkatakrishnan.Rajagopalan@Sun.COM int id_sched_needed; 89110852SBill.Taylor@Sun.COM int id_sched_cnt; 89210852SBill.Taylor@Sun.COM int id_sched_lso_cnt; 8930Sstevel@tonic-gate 8940Sstevel@tonic-gate kmutex_t id_link_mutex; 8955766Sgg161487 link_state_t id_link_state; 8960Sstevel@tonic-gate uint64_t id_link_speed; 8970Sstevel@tonic-gate 8980Sstevel@tonic-gate uint64_t id_num_intrs; 8990Sstevel@tonic-gate uint64_t id_tx_short; 90012163SRamaswamy.Tummala@Sun.COM /* 90112163SRamaswamy.Tummala@Sun.COM * id_ud_num_swqe 90212163SRamaswamy.Tummala@Sun.COM * Number of "send WQE" elements that will be allocated and used by 90312163SRamaswamy.Tummala@Sun.COM * ibd. When tuning this parameter, the size of pre-allocated, pre- 90412163SRamaswamy.Tummala@Sun.COM * mapped copy buffer in each of these send wqes must be taken into 90512163SRamaswamy.Tummala@Sun.COM * account. This copy buffer size is determined by the value of 90612163SRamaswamy.Tummala@Sun.COM * IBD_TX_BUF_SZ (this is currently set to the same value of 90712163SRamaswamy.Tummala@Sun.COM * ibd_tx_copy_thresh, but may be changed independently if needed). 90812163SRamaswamy.Tummala@Sun.COM */ 90912163SRamaswamy.Tummala@Sun.COM uint32_t id_ud_num_swqe; 9105766Sgg161487 9115766Sgg161487 uint64_t id_xmt_bytes; 9128917SVenkatakrishnan.Rajagopalan@Sun.COM uint64_t id_rcv_bytes; 9135766Sgg161487 uint64_t id_multi_xmt; 9145766Sgg161487 uint64_t id_brd_xmt; 9155766Sgg161487 uint64_t id_multi_rcv; 9165766Sgg161487 uint64_t id_brd_rcv; 9175766Sgg161487 uint64_t id_xmt_pkt; 9185766Sgg161487 uint64_t id_rcv_pkt; 9198917SVenkatakrishnan.Rajagopalan@Sun.COM 9208917SVenkatakrishnan.Rajagopalan@Sun.COM uint32_t id_hwcksum_capab; 9218917SVenkatakrishnan.Rajagopalan@Sun.COM boolean_t id_lso_policy; 9228917SVenkatakrishnan.Rajagopalan@Sun.COM boolean_t id_lso_capable; 9238917SVenkatakrishnan.Rajagopalan@Sun.COM uint_t id_lso_maxlen; 9248917SVenkatakrishnan.Rajagopalan@Sun.COM int id_hca_res_lkey_capab; 9258917SVenkatakrishnan.Rajagopalan@Sun.COM ibt_lkey_t id_res_lkey; 9269985SVenkatakrishnan.Rajagopalan@Sun.COM 92710300SVenki.Rajagopalan@Sun.COM boolean_t id_bgroup_created; 92810300SVenki.Rajagopalan@Sun.COM kmutex_t id_macst_lock; 92910300SVenki.Rajagopalan@Sun.COM kcondvar_t id_macst_cv; 9309985SVenkatakrishnan.Rajagopalan@Sun.COM uint32_t id_mac_state; 93111534SKevin.Ge@Sun.COM 93211534SKevin.Ge@Sun.COM /* For Reliable Connected Mode */ 93311534SKevin.Ge@Sun.COM boolean_t id_enable_rc; 93411534SKevin.Ge@Sun.COM boolean_t rc_enable_srq; 93511534SKevin.Ge@Sun.COM 93611534SKevin.Ge@Sun.COM int rc_mtu; 93711534SKevin.Ge@Sun.COM uint32_t rc_tx_max_sqseg; 93811768SKevin.Ge@Sun.COM /* 93911768SKevin.Ge@Sun.COM * In IPoIB over Reliable Connected mode, its mac address is added 94011768SKevin.Ge@Sun.COM * an "IBD_MAC_ADDR_RC" prefix. But for loopback filter in function 94111768SKevin.Ge@Sun.COM * ibd_process_rx(), the input mac address should not include the 94211768SKevin.Ge@Sun.COM * "IBD_MAC_ADDR_RC" prefix. 94311768SKevin.Ge@Sun.COM * 94411768SKevin.Ge@Sun.COM * So, we introduce the rc_macaddr_loopback for the loopback filter in 94511768SKevin.Ge@Sun.COM * IPoIB over Reliable Connected mode. 94611768SKevin.Ge@Sun.COM * 94711768SKevin.Ge@Sun.COM * rc_macaddr_loopback = id_macaddr excludes "IBD_MAC_ADDR_RC" prefix. 94811768SKevin.Ge@Sun.COM */ 94911768SKevin.Ge@Sun.COM ipoib_mac_t rc_macaddr_loopback; 95011534SKevin.Ge@Sun.COM 95111534SKevin.Ge@Sun.COM ibt_srv_hdl_t rc_listen_hdl; 95211534SKevin.Ge@Sun.COM ibt_sbind_hdl_t rc_listen_bind; 95311534SKevin.Ge@Sun.COM ibt_srv_hdl_t rc_listen_hdl_OFED_interop; 95411534SKevin.Ge@Sun.COM ibt_sbind_hdl_t rc_listen_bind_OFED_interop; 95511534SKevin.Ge@Sun.COM 95611534SKevin.Ge@Sun.COM ibd_rc_chan_list_t rc_pass_chan_list; 95711534SKevin.Ge@Sun.COM /* obsolete active channel list */ 95811534SKevin.Ge@Sun.COM ibd_rc_chan_list_t rc_obs_act_chan_list; 95911534SKevin.Ge@Sun.COM 96011534SKevin.Ge@Sun.COM kmutex_t rc_ace_recycle_lock; 96111534SKevin.Ge@Sun.COM ibd_ace_t *rc_ace_recycle; 96211534SKevin.Ge@Sun.COM 96311534SKevin.Ge@Sun.COM /* Send */ 96411534SKevin.Ge@Sun.COM /* 96511534SKevin.Ge@Sun.COM * This mutex protects rc_tx_largebuf_free_head, rc_tx_largebuf_nfree 96611534SKevin.Ge@Sun.COM * and ibd_rc_tx_largebuf_t->lb_next 96711534SKevin.Ge@Sun.COM */ 96811534SKevin.Ge@Sun.COM kmutex_t rc_tx_large_bufs_lock; 96911534SKevin.Ge@Sun.COM ibd_rc_tx_largebuf_t *rc_tx_largebuf_free_head; 97011534SKevin.Ge@Sun.COM uint_t rc_tx_largebuf_nfree; 97111534SKevin.Ge@Sun.COM /* The chunk of whole Tx large buffers */ 97211534SKevin.Ge@Sun.COM uint8_t *rc_tx_mr_bufs; 97311534SKevin.Ge@Sun.COM ibt_mr_hdl_t rc_tx_mr_hdl; 97411534SKevin.Ge@Sun.COM ibt_mr_desc_t rc_tx_mr_desc; 97511534SKevin.Ge@Sun.COM ibd_rc_tx_largebuf_t *rc_tx_largebuf_desc_base; /* base addr */ 97611534SKevin.Ge@Sun.COM 97711534SKevin.Ge@Sun.COM boolean_t rc_enable_iov_map; 97811534SKevin.Ge@Sun.COM uint_t rc_max_sqseg_hiwm; 97911534SKevin.Ge@Sun.COM 98011534SKevin.Ge@Sun.COM /* For SRQ */ 98111534SKevin.Ge@Sun.COM uint32_t rc_srq_size; 98211534SKevin.Ge@Sun.COM ibt_srq_hdl_t rc_srq_hdl; 98311534SKevin.Ge@Sun.COM ibd_list_t rc_srq_rwqe_list; 98411534SKevin.Ge@Sun.COM ibd_list_t rc_srq_free_list; 98511534SKevin.Ge@Sun.COM ibd_rwqe_t *rc_srq_rwqes; 98611534SKevin.Ge@Sun.COM uint8_t *rc_srq_rx_bufs; 98711534SKevin.Ge@Sun.COM ibt_mr_hdl_t rc_srq_rx_mr_hdl; 98811534SKevin.Ge@Sun.COM ibt_mr_desc_t rc_srq_rx_mr_desc; 98911534SKevin.Ge@Sun.COM 99011534SKevin.Ge@Sun.COM /* For chained receive */ 99111534SKevin.Ge@Sun.COM kmutex_t rc_rx_lock; 99211534SKevin.Ge@Sun.COM mblk_t *rc_rx_mp; 99311534SKevin.Ge@Sun.COM mblk_t *rc_rx_mp_tail; 99411534SKevin.Ge@Sun.COM uint32_t rc_rx_mp_len; 99511534SKevin.Ge@Sun.COM 996*13030SKevin.Ge@Sun.COM uint32_t rc_num_tx_chan; 997*13030SKevin.Ge@Sun.COM uint32_t rc_num_rx_chan; 998*13030SKevin.Ge@Sun.COM 999*13030SKevin.Ge@Sun.COM /* Protect rc_timeout_start and rc_timeout */ 1000*13030SKevin.Ge@Sun.COM kmutex_t rc_timeout_lock; 1001*13030SKevin.Ge@Sun.COM boolean_t rc_timeout_start; 1002*13030SKevin.Ge@Sun.COM timeout_id_t rc_timeout; 1003*13030SKevin.Ge@Sun.COM 100411534SKevin.Ge@Sun.COM /* Counters for RC mode */ 100511534SKevin.Ge@Sun.COM /* RX */ 100611534SKevin.Ge@Sun.COM /* 100711534SKevin.Ge@Sun.COM * # of Received packets. These packets are directly transferred to GLD 100811534SKevin.Ge@Sun.COM * without copy it 100911534SKevin.Ge@Sun.COM */ 101011534SKevin.Ge@Sun.COM uint64_t rc_rcv_trans_byte; 101111534SKevin.Ge@Sun.COM uint64_t rc_rcv_trans_pkt; 101211534SKevin.Ge@Sun.COM /* 101311534SKevin.Ge@Sun.COM * # of Received packets. We will allocate new buffers for these packet, 101411534SKevin.Ge@Sun.COM * copy their content into new buffers, then transfer to GLD 101511534SKevin.Ge@Sun.COM */ 101611534SKevin.Ge@Sun.COM uint64_t rc_rcv_copy_byte; 101711534SKevin.Ge@Sun.COM uint64_t rc_rcv_copy_pkt; 101811534SKevin.Ge@Sun.COM uint64_t rc_rcv_alloc_fail; 101911534SKevin.Ge@Sun.COM 102011534SKevin.Ge@Sun.COM #ifdef DEBUG 102111534SKevin.Ge@Sun.COM uint64_t rc_rwqe_short; /* short rwqe */ 102211534SKevin.Ge@Sun.COM #endif 102311534SKevin.Ge@Sun.COM 102411534SKevin.Ge@Sun.COM /* wc->wc_status != IBT_WC_SUCCESS */ 102511534SKevin.Ge@Sun.COM uint64_t rc_rcq_err; 102611534SKevin.Ge@Sun.COM 102711534SKevin.Ge@Sun.COM /* Tx */ 102811534SKevin.Ge@Sun.COM uint64_t rc_xmt_bytes; 102911534SKevin.Ge@Sun.COM 103011534SKevin.Ge@Sun.COM /* pkt size <= ibd_rc_tx_copy_thresh */ 103111534SKevin.Ge@Sun.COM uint64_t rc_xmt_small_pkt; 103211534SKevin.Ge@Sun.COM uint64_t rc_xmt_fragmented_pkt; 103311534SKevin.Ge@Sun.COM /* fail in ibt_map_mem_iov() */ 103411534SKevin.Ge@Sun.COM uint64_t rc_xmt_map_fail_pkt; 103511534SKevin.Ge@Sun.COM /* succ in ibt_map_mem_iov() */ 103611534SKevin.Ge@Sun.COM uint64_t rc_xmt_map_succ_pkt; 103711534SKevin.Ge@Sun.COM 103811534SKevin.Ge@Sun.COM uint64_t rc_ace_not_found; 103911534SKevin.Ge@Sun.COM 104011534SKevin.Ge@Sun.COM uint64_t rc_xmt_drop_too_long_pkt; 104111534SKevin.Ge@Sun.COM uint64_t rc_xmt_icmp_too_long_pkt; 104211534SKevin.Ge@Sun.COM uint64_t rc_xmt_reenter_too_long_pkt; 104311534SKevin.Ge@Sun.COM 104411534SKevin.Ge@Sun.COM /* short swqe in ibd_send() */ 104511534SKevin.Ge@Sun.COM uint64_t rc_swqe_short; 104611534SKevin.Ge@Sun.COM /* call mac_tx_update when there is enough swqe */ 104711534SKevin.Ge@Sun.COM uint64_t rc_swqe_mac_update; 104811534SKevin.Ge@Sun.COM /* short tx large copy buf in ibd_send() */ 104911534SKevin.Ge@Sun.COM uint64_t rc_xmt_buf_short; 105011534SKevin.Ge@Sun.COM /* call mac_tx_update when there is enough Tx copy buf */ 105111534SKevin.Ge@Sun.COM uint64_t rc_xmt_buf_mac_update; 105211534SKevin.Ge@Sun.COM 105311534SKevin.Ge@Sun.COM /* No swqe even after call swqe recycle function */ 105411534SKevin.Ge@Sun.COM uint64_t rc_scq_no_swqe; 105511534SKevin.Ge@Sun.COM /* No large Tx buf even after call swqe recycle function */ 105611534SKevin.Ge@Sun.COM uint64_t rc_scq_no_largebuf; 105711534SKevin.Ge@Sun.COM 105811534SKevin.Ge@Sun.COM /* Connection setup and close */ 105911534SKevin.Ge@Sun.COM uint64_t rc_conn_succ; /* time of succ connect */ 106011534SKevin.Ge@Sun.COM uint64_t rc_conn_fail; /* time of fail connect */ 106111534SKevin.Ge@Sun.COM /* ace->ac_chan == NULL for unicast packet */ 106211534SKevin.Ge@Sun.COM uint64_t rc_null_conn; 106311534SKevin.Ge@Sun.COM /* not in active established state */ 106411534SKevin.Ge@Sun.COM uint64_t rc_no_estab_conn; 106511534SKevin.Ge@Sun.COM 106611534SKevin.Ge@Sun.COM uint64_t rc_act_close; /* call ibd_rc_act_close() */ 106711534SKevin.Ge@Sun.COM uint64_t rc_pas_close; /* call ibd_rc_pas_close() */ 106811534SKevin.Ge@Sun.COM uint64_t rc_delay_ace_recycle; 106911534SKevin.Ge@Sun.COM uint64_t rc_act_close_simultaneous; 1070*13030SKevin.Ge@Sun.COM /* Fail to close a channel because someone else is still using it */ 1071*13030SKevin.Ge@Sun.COM uint64_t rc_act_close_not_clean; 1072*13030SKevin.Ge@Sun.COM /* RCQ is being invoked when closing RC channel */ 1073*13030SKevin.Ge@Sun.COM uint64_t rc_pas_close_rcq_invoking; 107411534SKevin.Ge@Sun.COM 107511534SKevin.Ge@Sun.COM /* the counter of reset RC channel */ 107611534SKevin.Ge@Sun.COM uint64_t rc_reset_cnt; 107711534SKevin.Ge@Sun.COM 1078*13030SKevin.Ge@Sun.COM uint64_t rc_timeout_act; 1079*13030SKevin.Ge@Sun.COM uint64_t rc_timeout_pas; 1080*13030SKevin.Ge@Sun.COM 1081*13030SKevin.Ge@Sun.COM /* 1082*13030SKevin.Ge@Sun.COM * Fail to stop this port because this port is connecting to a remote 1083*13030SKevin.Ge@Sun.COM * port 1084*13030SKevin.Ge@Sun.COM */ 1085*13030SKevin.Ge@Sun.COM uint64_t rc_stop_connect; 1086*13030SKevin.Ge@Sun.COM 108711534SKevin.Ge@Sun.COM #ifdef DEBUG 108811534SKevin.Ge@Sun.COM kstat_t *rc_ksp; 108911534SKevin.Ge@Sun.COM #endif 109012163SRamaswamy.Tummala@Sun.COM ib_guid_t id_hca_guid; 109112163SRamaswamy.Tummala@Sun.COM ib_guid_t id_port_guid; 109212163SRamaswamy.Tummala@Sun.COM datalink_id_t id_dlinkid; 109312163SRamaswamy.Tummala@Sun.COM datalink_id_t id_plinkid; 109412163SRamaswamy.Tummala@Sun.COM int id_port_inst; 109512163SRamaswamy.Tummala@Sun.COM struct ibd_state_s *id_next; 109612163SRamaswamy.Tummala@Sun.COM boolean_t id_force_create; 109712163SRamaswamy.Tummala@Sun.COM boolean_t id_bgroup_present; 109812163SRamaswamy.Tummala@Sun.COM uint_t id_hca_max_chan_sz; 109912163SRamaswamy.Tummala@Sun.COM 110012163SRamaswamy.Tummala@Sun.COM /* 110112163SRamaswamy.Tummala@Sun.COM * UD Mode Tunables 110212163SRamaswamy.Tummala@Sun.COM * 110312163SRamaswamy.Tummala@Sun.COM * id_ud_tx_copy_thresh 110412163SRamaswamy.Tummala@Sun.COM * This sets the threshold at which ibd will attempt to do a bcopy 110512163SRamaswamy.Tummala@Sun.COM * of the outgoing data into a pre-mapped buffer. IPoIB driver's 110612163SRamaswamy.Tummala@Sun.COM * send behavior is restricted by various parameters, so setting of 110712163SRamaswamy.Tummala@Sun.COM * this value must be made after careful considerations only. For 110812163SRamaswamy.Tummala@Sun.COM * instance, IB HCAs currently impose a relatively small limit 110912163SRamaswamy.Tummala@Sun.COM * (when compared to ethernet NICs) on the length of the SGL for 111012163SRamaswamy.Tummala@Sun.COM * transmit. On the other hand, the ip stack could send down mp 111112163SRamaswamy.Tummala@Sun.COM * chains that are quite long when LSO is enabled. 111212163SRamaswamy.Tummala@Sun.COM * 111312163SRamaswamy.Tummala@Sun.COM * id_num_lso_bufs 111412163SRamaswamy.Tummala@Sun.COM * Number of "larger-than-MTU" copy buffers to use for cases when the 111512163SRamaswamy.Tummala@Sun.COM * outgoing mblk chain is too fragmented to be used with 111612163SRamaswamy.Tummala@Sun.COM * ibt_map_mem_iov() and too large to be used with regular MTU-sized 111712163SRamaswamy.Tummala@Sun.COM * copy buffers. It is not recommended to tune this variable without 111812163SRamaswamy.Tummala@Sun.COM * understanding the application environment and/or memory resources. 111912163SRamaswamy.Tummala@Sun.COM * The size of each of these lso buffers is determined by the value of 112012163SRamaswamy.Tummala@Sun.COM * IBD_LSO_BUFSZ. 112112163SRamaswamy.Tummala@Sun.COM * 112212163SRamaswamy.Tummala@Sun.COM * id_num_ah 112312163SRamaswamy.Tummala@Sun.COM * Number of AH cache entries to allocate 112412163SRamaswamy.Tummala@Sun.COM * 112512163SRamaswamy.Tummala@Sun.COM * id_hash_size 112612163SRamaswamy.Tummala@Sun.COM * Hash table size for the active AH list 112712163SRamaswamy.Tummala@Sun.COM * 112812163SRamaswamy.Tummala@Sun.COM */ 112912163SRamaswamy.Tummala@Sun.COM uint_t id_ud_tx_copy_thresh; 113012163SRamaswamy.Tummala@Sun.COM uint_t id_num_lso_bufs; 113112163SRamaswamy.Tummala@Sun.COM uint_t id_num_ah; 113212163SRamaswamy.Tummala@Sun.COM uint_t id_hash_size; 113312163SRamaswamy.Tummala@Sun.COM 113412163SRamaswamy.Tummala@Sun.COM boolean_t id_create_broadcast_group; 113512163SRamaswamy.Tummala@Sun.COM 113612163SRamaswamy.Tummala@Sun.COM boolean_t id_allow_coalesce_comp_tuning; 113712163SRamaswamy.Tummala@Sun.COM uint_t id_ud_rx_comp_count; 113812163SRamaswamy.Tummala@Sun.COM uint_t id_ud_rx_comp_usec; 113912163SRamaswamy.Tummala@Sun.COM uint_t id_ud_tx_comp_count; 114012163SRamaswamy.Tummala@Sun.COM uint_t id_ud_tx_comp_usec; 114112163SRamaswamy.Tummala@Sun.COM 114212163SRamaswamy.Tummala@Sun.COM /* RC Mode Tunables */ 114312163SRamaswamy.Tummala@Sun.COM 114412163SRamaswamy.Tummala@Sun.COM uint_t id_rc_rx_comp_count; 114512163SRamaswamy.Tummala@Sun.COM uint_t id_rc_rx_comp_usec; 114612163SRamaswamy.Tummala@Sun.COM uint_t id_rc_tx_comp_count; 114712163SRamaswamy.Tummala@Sun.COM uint_t id_rc_tx_comp_usec; 114812163SRamaswamy.Tummala@Sun.COM /* 114912163SRamaswamy.Tummala@Sun.COM * id_rc_tx_copy_thresh 115012163SRamaswamy.Tummala@Sun.COM * This sets the threshold at which ibd will attempt to do a bcopy 115112163SRamaswamy.Tummala@Sun.COM * of the outgoing data into a pre-mapped buffer. 115212163SRamaswamy.Tummala@Sun.COM * 115312163SRamaswamy.Tummala@Sun.COM * id_rc_rx_copy_thresh 115412163SRamaswamy.Tummala@Sun.COM * If (the size of incoming buffer <= id_rc_rx_copy_thresh), ibd 115512163SRamaswamy.Tummala@Sun.COM * will attempt to allocate a buffer and do a bcopy of the incoming 115612163SRamaswamy.Tummala@Sun.COM * data into the allocated buffer. 115712163SRamaswamy.Tummala@Sun.COM * 115812163SRamaswamy.Tummala@Sun.COM * id_rc_rx_rwqe_thresh 115912163SRamaswamy.Tummala@Sun.COM * If (the number of available rwqe < ibd_rc_rx_rwqe_thresh), ibd 116012163SRamaswamy.Tummala@Sun.COM * will attempt to allocate a buffer and do a bcopy of the incoming 116112163SRamaswamy.Tummala@Sun.COM * data into the allocated buffer. 116212163SRamaswamy.Tummala@Sun.COM * 116312163SRamaswamy.Tummala@Sun.COM * id_rc_num_swqe 116412163SRamaswamy.Tummala@Sun.COM * 1) Send CQ size = ibd_rc_num_swqe 116512163SRamaswamy.Tummala@Sun.COM * 2) The send queue size = ibd_rc_num_swqe -1 116612163SRamaswamy.Tummala@Sun.COM * 3) Number of pre-allocated Tx buffers for ibt_post_send() = 116712163SRamaswamy.Tummala@Sun.COM * ibd_rc_num_swqe - 1. 116812163SRamaswamy.Tummala@Sun.COM * 116912163SRamaswamy.Tummala@Sun.COM * id_rc_num_rwqe 1170*13030SKevin.Ge@Sun.COM * 1) For non-SRQ, we pre-post id_rc_num_rwqe number of WRs 117112163SRamaswamy.Tummala@Sun.COM * via ibt_post_receive() for receive queue of each RC channel. 1172*13030SKevin.Ge@Sun.COM * 2) For SRQ and non-SRQ, receive CQ size = id_rc_num_rwqe 117312163SRamaswamy.Tummala@Sun.COM * 117412163SRamaswamy.Tummala@Sun.COM * For SRQ 1175*13030SKevin.Ge@Sun.COM * If using SRQ, we allocate id_rc_num_srq number of buffers (the 117612163SRamaswamy.Tummala@Sun.COM * size of each buffer is equal to RC mtu). And post them by 117712163SRamaswamy.Tummala@Sun.COM * ibt_post_srq(). 117812163SRamaswamy.Tummala@Sun.COM * 117912163SRamaswamy.Tummala@Sun.COM * id_rc_num_srq 1180*13030SKevin.Ge@Sun.COM * id_rc_num_srq should not be larger than id_rc_num_rwqe, 118112163SRamaswamy.Tummala@Sun.COM * otherwise it will cause a bug with the following warnings: 118212163SRamaswamy.Tummala@Sun.COM * NOTICE: hermon0: Device Error: EQE cq overrun or protection error 118312163SRamaswamy.Tummala@Sun.COM * NOTICE: hermon0: Device Error: EQE local work queue catastrophic 118412163SRamaswamy.Tummala@Sun.COM * error 118512163SRamaswamy.Tummala@Sun.COM * NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff 118612163SRamaswamy.Tummala@Sun.COM * catastrophic channel error 118712163SRamaswamy.Tummala@Sun.COM * NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff 118812163SRamaswamy.Tummala@Sun.COM * completion queue error 118912163SRamaswamy.Tummala@Sun.COM */ 119012163SRamaswamy.Tummala@Sun.COM uint_t id_rc_tx_copy_thresh; 119112163SRamaswamy.Tummala@Sun.COM uint_t id_rc_rx_copy_thresh; 119212163SRamaswamy.Tummala@Sun.COM uint_t id_rc_rx_rwqe_thresh; 119312163SRamaswamy.Tummala@Sun.COM uint_t id_rc_num_swqe; 119412163SRamaswamy.Tummala@Sun.COM uint_t id_rc_num_rwqe; 119512163SRamaswamy.Tummala@Sun.COM uint_t id_rc_num_srq; 11960Sstevel@tonic-gate } ibd_state_t; 11970Sstevel@tonic-gate 119811642SKevin.Ge@Sun.COM /* 119911642SKevin.Ge@Sun.COM * Structures to track global IBTF data, data that is shared 120011642SKevin.Ge@Sun.COM * among the IBD device instances. This includes the one ibt_hdl 120111642SKevin.Ge@Sun.COM * and the list of service registrations. 120211642SKevin.Ge@Sun.COM */ 120311642SKevin.Ge@Sun.COM typedef struct ibd_service_s { 120411642SKevin.Ge@Sun.COM struct ibd_service_s *is_link; 120511642SKevin.Ge@Sun.COM ibt_srv_hdl_t is_srv_hdl; 120611642SKevin.Ge@Sun.COM ib_svc_id_t is_sid; 120711642SKevin.Ge@Sun.COM uint_t is_ref_cnt; 120811642SKevin.Ge@Sun.COM } ibd_service_t; 120911642SKevin.Ge@Sun.COM 121011642SKevin.Ge@Sun.COM typedef struct ibd_global_state_s { 121111642SKevin.Ge@Sun.COM kmutex_t ig_mutex; 121211642SKevin.Ge@Sun.COM ibt_clnt_hdl_t ig_ibt_hdl; 121311642SKevin.Ge@Sun.COM uint_t ig_ibt_hdl_ref_cnt; 121411642SKevin.Ge@Sun.COM ibd_service_t *ig_service_list; 121511642SKevin.Ge@Sun.COM } ibd_global_state_t; 121611642SKevin.Ge@Sun.COM 121711534SKevin.Ge@Sun.COM typedef struct ibd_rc_msg_hello_s { 121811534SKevin.Ge@Sun.COM uint32_t reserved_qpn; 121911534SKevin.Ge@Sun.COM uint32_t rx_mtu; 122011534SKevin.Ge@Sun.COM } ibd_rc_msg_hello_t; 122111534SKevin.Ge@Sun.COM 122211534SKevin.Ge@Sun.COM typedef struct ibd_rc_chan_s { 122311534SKevin.Ge@Sun.COM struct ibd_rc_chan_s *next; 122411534SKevin.Ge@Sun.COM /* channel hdl that we'll be using for Reliable Connected Mode */ 122511534SKevin.Ge@Sun.COM ibt_channel_hdl_t chan_hdl; 122611534SKevin.Ge@Sun.COM struct ibd_state_s *state; 122711534SKevin.Ge@Sun.COM ibd_ace_t *ace; 122811534SKevin.Ge@Sun.COM ibd_rc_chan_state_t chan_state; 122911534SKevin.Ge@Sun.COM 123011534SKevin.Ge@Sun.COM ibd_list_t tx_wqe_list; /* free wqe list */ 123111534SKevin.Ge@Sun.COM ibd_list_t tx_rel_list; /* for swqe recycle */ 123211534SKevin.Ge@Sun.COM 123311534SKevin.Ge@Sun.COM ibd_swqe_t *tx_wqes; 123411534SKevin.Ge@Sun.COM 123511534SKevin.Ge@Sun.COM /* start address of Tx Buffers */ 123611534SKevin.Ge@Sun.COM uint8_t *tx_mr_bufs; 123711534SKevin.Ge@Sun.COM ibt_mr_hdl_t tx_mr_hdl; 123811534SKevin.Ge@Sun.COM ibt_mr_desc_t tx_mr_desc; 123911534SKevin.Ge@Sun.COM 124011534SKevin.Ge@Sun.COM ibt_cq_hdl_t scq_hdl; /* Tx completion queue */ 124111534SKevin.Ge@Sun.COM ibt_wc_t tx_wc[IBD_RC_MAX_CQ_WC]; 124211534SKevin.Ge@Sun.COM ddi_softintr_t scq_softintr; 124311534SKevin.Ge@Sun.COM 124411534SKevin.Ge@Sun.COM /* For chained send */ 124511534SKevin.Ge@Sun.COM kmutex_t tx_post_lock; 124611534SKevin.Ge@Sun.COM ibd_swqe_t *tx_head; 124711534SKevin.Ge@Sun.COM ibd_swqe_t *tx_tail; 124811534SKevin.Ge@Sun.COM int tx_busy; 124911534SKevin.Ge@Sun.COM 125011534SKevin.Ge@Sun.COM /* For tx buffer recycle */ 125111534SKevin.Ge@Sun.COM kmutex_t tx_poll_lock; 125211534SKevin.Ge@Sun.COM int tx_poll_busy; 125311534SKevin.Ge@Sun.COM 125411534SKevin.Ge@Sun.COM /* Rx */ 125511534SKevin.Ge@Sun.COM ibd_list_t rx_wqe_list; /* used by ibt_post_recv */ 125611534SKevin.Ge@Sun.COM ibd_list_t rx_free_list; /* free rwqe list */ 125711534SKevin.Ge@Sun.COM 125811534SKevin.Ge@Sun.COM ibt_cq_hdl_t rcq_hdl; /* Rx completion queue */ 125911534SKevin.Ge@Sun.COM ibt_wc_t rx_wc[IBD_RC_MAX_CQ_WC]; 126011534SKevin.Ge@Sun.COM 126111534SKevin.Ge@Sun.COM ibd_rwqe_t *rx_rwqes; /* the chuck of whole rwqes */ 126211534SKevin.Ge@Sun.COM uint8_t *rx_bufs; /* the chuck of whole Rx bufs */ 126311534SKevin.Ge@Sun.COM ibt_mr_hdl_t rx_mr_hdl; /* ibt_mr_hdl_t for rx_bufs */ 126411534SKevin.Ge@Sun.COM ibt_mr_desc_t rx_mr_desc; /* ibt_mr_desc_t for rx_bufs */ 126511534SKevin.Ge@Sun.COM 126611534SKevin.Ge@Sun.COM /* For chained receive */ 126711534SKevin.Ge@Sun.COM kmutex_t rx_lock; 126811534SKevin.Ge@Sun.COM mblk_t *rx_mp; 126911534SKevin.Ge@Sun.COM mblk_t *rx_mp_tail; 127011534SKevin.Ge@Sun.COM uint32_t rx_mp_len; 127111534SKevin.Ge@Sun.COM 127211534SKevin.Ge@Sun.COM uint32_t rcq_size; 127311534SKevin.Ge@Sun.COM uint32_t scq_size; 127411534SKevin.Ge@Sun.COM /* 127511534SKevin.Ge@Sun.COM * We need two channels for each connection. 127611534SKevin.Ge@Sun.COM * One channel for Tx; another channel for Rx. 127711534SKevin.Ge@Sun.COM * If "is_tx_chan == B_TRUE", this is a Tx channel. 127811534SKevin.Ge@Sun.COM */ 127911534SKevin.Ge@Sun.COM boolean_t is_tx_chan; 1280*13030SKevin.Ge@Sun.COM 1281*13030SKevin.Ge@Sun.COM /* 1282*13030SKevin.Ge@Sun.COM * For the connection reaper routine ibd_rc_conn_timeout_call(). 1283*13030SKevin.Ge@Sun.COM * "is_used == B_FALSE" indicates this RC channel has not been used for 1284*13030SKevin.Ge@Sun.COM * a long (=ibd_rc_conn_timeout) time. 1285*13030SKevin.Ge@Sun.COM */ 1286*13030SKevin.Ge@Sun.COM boolean_t is_used; 1287*13030SKevin.Ge@Sun.COM /* 1288*13030SKevin.Ge@Sun.COM * When closing this channel, we need to make sure 1289*13030SKevin.Ge@Sun.COM * "chan->rcq_invoking == 0". 1290*13030SKevin.Ge@Sun.COM */ 1291*13030SKevin.Ge@Sun.COM uint32_t rcq_invoking; 129211534SKevin.Ge@Sun.COM } ibd_rc_chan_t; 129311534SKevin.Ge@Sun.COM 129411534SKevin.Ge@Sun.COM /* 129511534SKevin.Ge@Sun.COM * The following functions are defined in "ibd.c". 129611534SKevin.Ge@Sun.COM * They are also used by "ibd_cm.c" 129711534SKevin.Ge@Sun.COM */ 129811534SKevin.Ge@Sun.COM void ibd_print_warn(ibd_state_t *, char *, ...); 129911534SKevin.Ge@Sun.COM void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *); 130011534SKevin.Ge@Sun.COM void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int); 130111534SKevin.Ge@Sun.COM boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t); 130211534SKevin.Ge@Sun.COM void ibd_dec_ref_ace(ibd_state_t *, ibd_ace_t *); 130311534SKevin.Ge@Sun.COM ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int); 130411534SKevin.Ge@Sun.COM 130511534SKevin.Ge@Sun.COM /* 130611534SKevin.Ge@Sun.COM * The following functions are defined in "ibd_cm.c". 130711534SKevin.Ge@Sun.COM * They are also used in "ibd.c". 130811534SKevin.Ge@Sun.COM */ 130911534SKevin.Ge@Sun.COM void ibd_async_rc_process_too_big(ibd_state_t *, ibd_req_t *); 131011534SKevin.Ge@Sun.COM void ibd_async_rc_close_act_chan(ibd_state_t *, ibd_req_t *); 131111534SKevin.Ge@Sun.COM void ibd_async_rc_recycle_ace(ibd_state_t *, ibd_req_t *); 131211534SKevin.Ge@Sun.COM 131311534SKevin.Ge@Sun.COM /* Connection Setup/Close Functions */ 131411534SKevin.Ge@Sun.COM ibt_status_t ibd_rc_listen(ibd_state_t *); 131511534SKevin.Ge@Sun.COM void ibd_rc_stop_listen(ibd_state_t *); 131611534SKevin.Ge@Sun.COM ibt_status_t ibd_rc_connect(ibd_state_t *, ibd_ace_t *, ibt_path_info_t *, 131711534SKevin.Ge@Sun.COM uint64_t); 131811534SKevin.Ge@Sun.COM void ibd_rc_try_connect(ibd_state_t *, ibd_ace_t *, ibt_path_info_t *); 131911534SKevin.Ge@Sun.COM void ibd_rc_signal_act_close(ibd_state_t *, ibd_ace_t *); 132011534SKevin.Ge@Sun.COM void ibd_rc_signal_ace_recycle(ibd_state_t *, ibd_ace_t *); 1321*13030SKevin.Ge@Sun.COM int ibd_rc_pas_close(ibd_rc_chan_t *, boolean_t, boolean_t); 132211642SKevin.Ge@Sun.COM void ibd_rc_close_all_chan(ibd_state_t *); 1323*13030SKevin.Ge@Sun.COM void ibd_rc_conn_timeout_call(void *carg); 132411534SKevin.Ge@Sun.COM 132511534SKevin.Ge@Sun.COM /* Receive Functions */ 132611534SKevin.Ge@Sun.COM int ibd_rc_init_srq_list(ibd_state_t *); 132711534SKevin.Ge@Sun.COM void ibd_rc_fini_srq_list(ibd_state_t *); 132811642SKevin.Ge@Sun.COM int ibd_rc_repost_srq_free_list(ibd_state_t *); 132911534SKevin.Ge@Sun.COM 133011534SKevin.Ge@Sun.COM /* Send Functions */ 133111534SKevin.Ge@Sun.COM int ibd_rc_init_tx_largebuf_list(ibd_state_t *); 133211534SKevin.Ge@Sun.COM void ibd_rc_fini_tx_largebuf_list(ibd_state_t *); 133311534SKevin.Ge@Sun.COM ibd_swqe_t *ibd_rc_acquire_swqes(ibd_rc_chan_t *); 133411534SKevin.Ge@Sun.COM void ibd_rc_post_send(ibd_rc_chan_t *, ibd_swqe_t *); 133511534SKevin.Ge@Sun.COM void ibd_rc_drain_scq(ibd_rc_chan_t *, ibt_cq_hdl_t); 133611534SKevin.Ge@Sun.COM void ibd_rc_tx_cleanup(ibd_swqe_t *); 133711534SKevin.Ge@Sun.COM 133811534SKevin.Ge@Sun.COM /* Others */ 133911534SKevin.Ge@Sun.COM void ibd_rc_get_conf(ibd_state_t *); 134011534SKevin.Ge@Sun.COM int ibd_rc_init_stats(ibd_state_t *); 134111534SKevin.Ge@Sun.COM 13420Sstevel@tonic-gate #endif /* _KERNEL && !_BOOT */ 13430Sstevel@tonic-gate 13440Sstevel@tonic-gate #ifdef __cplusplus 13450Sstevel@tonic-gate } 13460Sstevel@tonic-gate #endif 13470Sstevel@tonic-gate 13480Sstevel@tonic-gate #endif /* _SYS_IB_CLIENTS_IBD_H */ 1349