xref: /minix3/minix/net/lwip/tcpsock.c (revision ef8d499e2d2af900e9b2ab297171d7b088652482)
1*ef8d499eSDavid van Moolenbroek /* LWIP service - tcpsock.c - TCP sockets */
2*ef8d499eSDavid van Moolenbroek /*
3*ef8d499eSDavid van Moolenbroek  * This module implements support for TCP sockets based on lwIP's core TCP PCB
4*ef8d499eSDavid van Moolenbroek  * module, which is largely but not fully cooperative with exactly what we want
5*ef8d499eSDavid van Moolenbroek  * to achieve, with as a result that this module is rather complicated.
6*ef8d499eSDavid van Moolenbroek  *
7*ef8d499eSDavid van Moolenbroek  * Each socket has a send queue and a receive queue.  Both are using lwIP's own
8*ef8d499eSDavid van Moolenbroek  * (pbuf) buffers, which largely come out of the main 512-byte buffer pool.
9*ef8d499eSDavid van Moolenbroek  * The buffers on the send queue are allocated and freed by us--the latter only
10*ef8d499eSDavid van Moolenbroek  * once they are no longer in use by lwIP as well.  A bit counterintuitively,
11*ef8d499eSDavid van Moolenbroek  * we deliberately use a smaller lwIP per-PCB TCP send buffer limit
12*ef8d499eSDavid van Moolenbroek  * (TCP_SND_BUF) in the lwIP send configuration (lwipopts.h) in order to more
13*ef8d499eSDavid van Moolenbroek  * easily trigger conditions where we cannot enqueue data (or the final FIN)
14*ef8d499eSDavid van Moolenbroek  * right away.  This way, we get to test the internal logic of this module a
15*ef8d499eSDavid van Moolenbroek  * lot more easily.  The small lwIP send queue size should not have any impact
16*ef8d499eSDavid van Moolenbroek  * on performance, as our own per-socket send queues can be much larger and we
17*ef8d499eSDavid van Moolenbroek  * enqueue more of that on the lwIP PCB as soon as we can in all cases.
18*ef8d499eSDavid van Moolenbroek  *
19*ef8d499eSDavid van Moolenbroek  * The receive queue consists of whatever buffers were given to us by lwIP, but
20*ef8d499eSDavid van Moolenbroek  * since those may be many buffers with small amounts of data each, we perform
21*ef8d499eSDavid van Moolenbroek  * fairly aggressive merging of consecutive buffers.  The intended result is
22*ef8d499eSDavid van Moolenbroek  * that we waste no more than 50% of memory within the receive queue.  Merging
23*ef8d499eSDavid van Moolenbroek  * requires memory copies, which makes it expensive, but we do not configure
24*ef8d499eSDavid van Moolenbroek  * lwIP with enough buffers to make running out of buffers a non-issue, so this
25*ef8d499eSDavid van Moolenbroek  * trade-off is necessary.  Practical experience and measurements of the merge
26*ef8d499eSDavid van Moolenbroek  * policy will have to show whether and how the current policy may be improved.
27*ef8d499eSDavid van Moolenbroek  *
28*ef8d499eSDavid van Moolenbroek  * As can be expected, the connection close semantics are by far the most
29*ef8d499eSDavid van Moolenbroek  * complicated part of this module.  We attempt to get rid of the lwIP PCB as
30*ef8d499eSDavid van Moolenbroek  * soon as we can, letting lwIP take care of the TIME_WAIT state for example.
31*ef8d499eSDavid van Moolenbroek  * However, there are various conditions that have to be met before we can
32*ef8d499eSDavid van Moolenbroek  * forget about the PCB here--most importantly, that none of our sent data
33*ef8d499eSDavid van Moolenbroek  * blocks are still referenced by lwIP because they have not yet been sent or
34*ef8d499eSDavid van Moolenbroek  * acknowledged.  We can only free the data blocks once lwIP is done with them.
35*ef8d499eSDavid van Moolenbroek  *
36*ef8d499eSDavid van Moolenbroek  * We do consider the TCP state of lwIP's PCB, in order to avoid duplicating
37*ef8d499eSDavid van Moolenbroek  * full state tracking here.  However, we do not look at a socket's TCP state
38*ef8d499eSDavid van Moolenbroek  * while in a lwIP-generated event for that socket, because the state may not
39*ef8d499eSDavid van Moolenbroek  * necessarily reflect the (correct or new) TCP state of the connection, nor
40*ef8d499eSDavid van Moolenbroek  * may the PCB be available--this is the case for error events.  For these
41*ef8d499eSDavid van Moolenbroek  * reasons we use a few internal TCPF_ flags to perform partial state tracking.
42*ef8d499eSDavid van Moolenbroek  *
43*ef8d499eSDavid van Moolenbroek  * More generally, we tend to access lwIP PCB fields directly only when lwIP's
44*ef8d499eSDavid van Moolenbroek  * own BSD API implementation does that too and there is no better alternative.
45*ef8d499eSDavid van Moolenbroek  * One example of this is the check to see if our FIN was acknowledged, for
46*ef8d499eSDavid van Moolenbroek  * SO_LINGER support.  In terms of maintenance, our hope is that if lwIP's API
47*ef8d499eSDavid van Moolenbroek  * changes later, we can change our code to imitate whatever lwIP's BSD API
48*ef8d499eSDavid van Moolenbroek  * implementation does at that point.
49*ef8d499eSDavid van Moolenbroek  */
50*ef8d499eSDavid van Moolenbroek 
51*ef8d499eSDavid van Moolenbroek #include <sys/socketvar.h>
52*ef8d499eSDavid van Moolenbroek #include <netinet/in.h>
53*ef8d499eSDavid van Moolenbroek #include <netinet/tcp.h>
54*ef8d499eSDavid van Moolenbroek #include <netinet/ip_var.h>
55*ef8d499eSDavid van Moolenbroek #include <netinet/tcp_timer.h>
56*ef8d499eSDavid van Moolenbroek #include <netinet/tcp_var.h>
57*ef8d499eSDavid van Moolenbroek #include <netinet/tcp_fsm.h>
58*ef8d499eSDavid van Moolenbroek 
59*ef8d499eSDavid van Moolenbroek /*
60*ef8d499eSDavid van Moolenbroek  * Unfortunately, NetBSD and lwIP have different definitions of a few relevant
61*ef8d499eSDavid van Moolenbroek  * preprocessor variables.  Make sure we do not attempt to use the NetBSD one
62*ef8d499eSDavid van Moolenbroek  * where it matters.  We do need one of the NetBSD definitions though.
63*ef8d499eSDavid van Moolenbroek  */
64*ef8d499eSDavid van Moolenbroek static const unsigned int NETBSD_TF_NODELAY = TF_NODELAY;
65*ef8d499eSDavid van Moolenbroek #undef TF_NODELAY
66*ef8d499eSDavid van Moolenbroek #undef TCP_MSS
67*ef8d499eSDavid van Moolenbroek 
68*ef8d499eSDavid van Moolenbroek #include "lwip.h"
69*ef8d499eSDavid van Moolenbroek #include "tcpisn.h"
70*ef8d499eSDavid van Moolenbroek 
71*ef8d499eSDavid van Moolenbroek #include "lwip/tcp.h"
72*ef8d499eSDavid van Moolenbroek #include "lwip/priv/tcp_priv.h" /* for tcp_pcb_lists */
73*ef8d499eSDavid van Moolenbroek 
74*ef8d499eSDavid van Moolenbroek /*
75*ef8d499eSDavid van Moolenbroek  * The number of TCP sockets (NR_TCPSOCK) is defined in the lwIP configuration.
76*ef8d499eSDavid van Moolenbroek  */
77*ef8d499eSDavid van Moolenbroek 
78*ef8d499eSDavid van Moolenbroek /*
79*ef8d499eSDavid van Moolenbroek  * We fully control the send buffer, so we can let its size be set to whatever
80*ef8d499eSDavid van Moolenbroek  * we want.  The receive buffer is different: if it is smaller than the window
81*ef8d499eSDavid van Moolenbroek  * size, we may have to refuse data that lwIP hands us, at which point more
82*ef8d499eSDavid van Moolenbroek  * incoming data will cause lwIP to abort the TCP connection--even aside from
83*ef8d499eSDavid van Moolenbroek  * performance issues.  Therefore, we must make sure the receive buffer is
84*ef8d499eSDavid van Moolenbroek  * larger than the TCP window at all times.
85*ef8d499eSDavid van Moolenbroek  */
86*ef8d499eSDavid van Moolenbroek #define TCP_SNDBUF_MIN	1		/* minimum TCP send buffer size */
87*ef8d499eSDavid van Moolenbroek #define TCP_SNDBUF_DEF	32768		/* default TCP send buffer size */
88*ef8d499eSDavid van Moolenbroek #define TCP_SNDBUF_MAX	131072		/* maximum TCP send buffer size */
89*ef8d499eSDavid van Moolenbroek #define TCP_RCVBUF_MIN	TCP_WND		/* minimum TCP receive buffer size */
90*ef8d499eSDavid van Moolenbroek #define TCP_RCVBUF_DEF	MAX(TCP_WND, 32768) /* default TCP recv buffer size */
91*ef8d499eSDavid van Moolenbroek #define TCP_RCVBUF_MAX	MAX(TCP_WND, 131072) /* maximum TCP recv buffer size */
92*ef8d499eSDavid van Moolenbroek 
93*ef8d499eSDavid van Moolenbroek /*
 94*ef8d499eSDavid van Moolenbroek  * The total number of buffers that may be in use for TCP socket send queues.
95*ef8d499eSDavid van Moolenbroek  * goal is to allow at least some progress to be made on receiving from TCP
96*ef8d499eSDavid van Moolenbroek  * sockets and on differently-typed sockets, at least as long as the LWIP
97*ef8d499eSDavid van Moolenbroek  * service can manage to allocate the memory it wants.  For the case that it
98*ef8d499eSDavid van Moolenbroek  * does not, we can only reactively kill off TCP sockets and/or free enqueued
99*ef8d499eSDavid van Moolenbroek  * ethernet packets, neither of which is currently implemented (TODO).
100*ef8d499eSDavid van Moolenbroek  */
101*ef8d499eSDavid van Moolenbroek #define TCP_MAX_SENDBUFS	(mempool_max_buffers() * 3 / 4)
102*ef8d499eSDavid van Moolenbroek 
 103*ef8d499eSDavid van Moolenbroek /* Polling intervals, in 500-millisecond units. */
104*ef8d499eSDavid van Moolenbroek #define TCP_POLL_REG_INTERVAL	10	/* interval for reattempting sends */
105*ef8d499eSDavid van Moolenbroek #define TCP_POLL_CLOSE_INTERVAL	1	/* interval while closing connection */
106*ef8d499eSDavid van Moolenbroek 
/*
 * Per-socket state.  One entry per TCP socket, statically allocated in
 * tcp_array below.  A socket is either on the global free list, or on a
 * listening socket's accept queue, or in active use--the tcp_queue union
 * reflects the first two of these states.
 */
static struct tcpsock {
	struct ipsock tcp_ipsock;		/* IP socket, MUST be first */
	struct tcp_pcb *tcp_pcb;		/* lwIP TCP control block */
	union pxfer_tcp_queue {			/* free/accept queue */
		TAILQ_ENTRY(tcpsock) tq_next;	/* next in queue */
		TAILQ_HEAD(, tcpsock) tq_head;	/* head of queue */
	} tcp_queue;
	struct tcpsock *tcp_listener;		/* listener if on accept q. */
	struct {				/* send queue */
		struct pbuf *ts_head;		/* first pbuf w/unacked data */
		struct pbuf *ts_unsent;		/* first pbuf w/unsent data */
		struct pbuf *ts_tail;		/* most recently added data */
		size_t ts_len;			/* total sent + unsent */
		unsigned short ts_head_off;	/* offset into head pbuf */
		unsigned short ts_unsent_off;	/* offset into unsent pbuf */
	} tcp_snd;
	struct {				/* receive queue */
		struct pbuf *tr_head;		/* first pbuf w/unrecvd data */
		struct pbuf **tr_pre_tailp;	/* ptr-ptr to newest pbuf */
		size_t tr_len;			/* bytes on receive queue */
		unsigned short tr_head_off;	/* offset into head pbuf */
		unsigned short tr_unacked;	/* current window reduction */
	} tcp_rcv;
} tcp_array[NR_TCPSOCK];
131*ef8d499eSDavid van Moolenbroek 
static TAILQ_HEAD(, tcpsock) tcp_freelist;	/* list of free TCP sockets */

/* Operations table for TCP sockets; defined at the bottom of this file. */
static const struct sockevent_ops tcpsock_ops;

/*
 * Global buffer accounting, across all TCP sockets.  Used to enforce the
 * TCP_MAX_SENDBUFS limit on the send side.
 */
static unsigned int tcpsock_sendbufs;		/* # send buffers in use */
static unsigned int tcpsock_recvbufs;		/* # receive buffers in use */

/* A bunch of macros that are just for convenience. */
#define tcpsock_get_id(tcp)	(SOCKID_TCP | (sockid_t)((tcp) - tcp_array))
#define tcpsock_get_ipsock(tcp)	(&(tcp)->tcp_ipsock)
#define tcpsock_get_sock(tcp)	(ipsock_get_sock(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_sndbuf(tcp)	(ipsock_get_sndbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_rcvbuf(tcp)	(ipsock_get_rcvbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_ipv6(tcp)	(ipsock_is_ipv6(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_shutdown(tcp,fl) \
	(sockevent_is_shutdown(tcpsock_get_sock(tcp), fl))
#define tcpsock_is_listening(tcp) \
	(sockevent_is_listening(tcpsock_get_sock(tcp)))
#define tcpsock_get_flags(tcp)	(ipsock_get_flags(tcpsock_get_ipsock(tcp)))
#define tcpsock_set_flag(tcp,fl) \
	(ipsock_set_flag(tcpsock_get_ipsock(tcp), fl))
#define tcpsock_clear_flag(tcp,fl) \
	(ipsock_clear_flag(tcpsock_get_ipsock(tcp), fl))

static ssize_t tcpsock_pcblist(struct rmib_call *, struct rmib_node *,
	struct rmib_oldp *, struct rmib_newp *);

/* The CTL_NET {PF_INET,PF_INET6} IPPROTO_TCP subtree. */
/* TODO: add many more and make some of them writable.. */
static struct rmib_node net_inet_tcp_table[] = {
/* 2*/	[TCPCTL_SENDSPACE]	= RMIB_INT(RMIB_RO, TCP_SNDBUF_DEF,
				    "sendspace",
				    "Default TCP send buffer size"),
/* 3*/	[TCPCTL_RECVSPACE]	= RMIB_INT(RMIB_RO, TCP_RCVBUF_DEF,
				    "recvspace",
				    "Default TCP receive buffer size"),
/*29*/	[TCPCTL_LOOPBACKCKSUM]	= RMIB_FUNC(RMIB_RW | CTLTYPE_INT, sizeof(int),
				    loopif_cksum, "do_loopback_cksum",
				    "Perform TCP checksum on loopback"),
/*+0*/	[TCPCTL_MAXID]		= RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0,
				    tcpsock_pcblist, "pcblist",
				    "TCP protocol control block list"),
/*+1*/	[TCPCTL_MAXID + 1]	= RMIB_FUNC(RMIB_RW | CTLFLAG_PRIVATE |
				    CTLFLAG_HIDDEN | CTLTYPE_STRING,
				    TCPISN_SECRET_HEX_LENGTH, tcpisn_secret,
				    "isn_secret",
				    "TCP ISN secret (MINIX 3 specific)")
};

/* Both the IPv4 and the IPv6 subtree are served by the same table. */
static struct rmib_node net_inet_tcp_node =
    RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp", "TCP related settings");
static struct rmib_node net_inet6_tcp6_node =
    RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp6", "TCP related settings");
186*ef8d499eSDavid van Moolenbroek /*
187*ef8d499eSDavid van Moolenbroek  * Initialize the TCP sockets module.
188*ef8d499eSDavid van Moolenbroek  */
189*ef8d499eSDavid van Moolenbroek void
tcpsock_init(void)190*ef8d499eSDavid van Moolenbroek tcpsock_init(void)
191*ef8d499eSDavid van Moolenbroek {
192*ef8d499eSDavid van Moolenbroek 	unsigned int slot;
193*ef8d499eSDavid van Moolenbroek 
194*ef8d499eSDavid van Moolenbroek 	/* Initialize the list of free TCP sockets. */
195*ef8d499eSDavid van Moolenbroek 	TAILQ_INIT(&tcp_freelist);
196*ef8d499eSDavid van Moolenbroek 
197*ef8d499eSDavid van Moolenbroek 	for (slot = 0; slot < __arraycount(tcp_array); slot++)
198*ef8d499eSDavid van Moolenbroek 		TAILQ_INSERT_TAIL(&tcp_freelist, &tcp_array[slot],
199*ef8d499eSDavid van Moolenbroek 		    tcp_queue.tq_next);
200*ef8d499eSDavid van Moolenbroek 
201*ef8d499eSDavid van Moolenbroek 	/* Initialize other variables. */
202*ef8d499eSDavid van Moolenbroek 	tcpsock_sendbufs = 0;
203*ef8d499eSDavid van Moolenbroek 
204*ef8d499eSDavid van Moolenbroek 	/* Register the net.inet.tcp and net.inet6.tcp6 RMIB subtrees. */
205*ef8d499eSDavid van Moolenbroek 	mibtree_register_inet(PF_INET, IPPROTO_TCP, &net_inet_tcp_node);
206*ef8d499eSDavid van Moolenbroek 	mibtree_register_inet(PF_INET6, IPPROTO_TCP, &net_inet6_tcp6_node);
207*ef8d499eSDavid van Moolenbroek }
208*ef8d499eSDavid van Moolenbroek 
209*ef8d499eSDavid van Moolenbroek /*
210*ef8d499eSDavid van Moolenbroek  * Initialize the state of a TCP socket's send queue.
211*ef8d499eSDavid van Moolenbroek  */
212*ef8d499eSDavid van Moolenbroek static void
tcpsock_reset_send(struct tcpsock * tcp)213*ef8d499eSDavid van Moolenbroek tcpsock_reset_send(struct tcpsock * tcp)
214*ef8d499eSDavid van Moolenbroek {
215*ef8d499eSDavid van Moolenbroek 
216*ef8d499eSDavid van Moolenbroek 	tcp->tcp_snd.ts_tail = NULL;
217*ef8d499eSDavid van Moolenbroek 	tcp->tcp_snd.ts_unsent = NULL;
218*ef8d499eSDavid van Moolenbroek 	tcp->tcp_snd.ts_head = NULL;
219*ef8d499eSDavid van Moolenbroek 	tcp->tcp_snd.ts_len = 0;
220*ef8d499eSDavid van Moolenbroek 	tcp->tcp_snd.ts_unsent_off = 0;
221*ef8d499eSDavid van Moolenbroek 	tcp->tcp_snd.ts_head_off = 0;
222*ef8d499eSDavid van Moolenbroek }
223*ef8d499eSDavid van Moolenbroek 
224*ef8d499eSDavid van Moolenbroek /*
225*ef8d499eSDavid van Moolenbroek  * Initialize the state of a TCP socket's receive queue.
226*ef8d499eSDavid van Moolenbroek  */
227*ef8d499eSDavid van Moolenbroek static void
tcpsock_reset_recv(struct tcpsock * tcp)228*ef8d499eSDavid van Moolenbroek tcpsock_reset_recv(struct tcpsock * tcp)
229*ef8d499eSDavid van Moolenbroek {
230*ef8d499eSDavid van Moolenbroek 
231*ef8d499eSDavid van Moolenbroek 	tcp->tcp_rcv.tr_pre_tailp = NULL;
232*ef8d499eSDavid van Moolenbroek 	tcp->tcp_rcv.tr_head = NULL;
233*ef8d499eSDavid van Moolenbroek 	tcp->tcp_rcv.tr_len = 0;
234*ef8d499eSDavid van Moolenbroek 	tcp->tcp_rcv.tr_head_off = 0;
235*ef8d499eSDavid van Moolenbroek 	tcp->tcp_rcv.tr_unacked = 0;
236*ef8d499eSDavid van Moolenbroek }
237*ef8d499eSDavid van Moolenbroek 
238*ef8d499eSDavid van Moolenbroek /*
239*ef8d499eSDavid van Moolenbroek  * Create a TCP socket.
240*ef8d499eSDavid van Moolenbroek  */
241*ef8d499eSDavid van Moolenbroek sockid_t
tcpsock_socket(int domain,int protocol,struct sock ** sockp,const struct sockevent_ops ** ops)242*ef8d499eSDavid van Moolenbroek tcpsock_socket(int domain, int protocol, struct sock ** sockp,
243*ef8d499eSDavid van Moolenbroek 	const struct sockevent_ops ** ops)
244*ef8d499eSDavid van Moolenbroek {
245*ef8d499eSDavid van Moolenbroek 	struct tcpsock *tcp;
246*ef8d499eSDavid van Moolenbroek 	uint8_t ip_type;
247*ef8d499eSDavid van Moolenbroek 
248*ef8d499eSDavid van Moolenbroek 	switch (protocol) {
249*ef8d499eSDavid van Moolenbroek 	case 0:
250*ef8d499eSDavid van Moolenbroek 	case IPPROTO_TCP:
251*ef8d499eSDavid van Moolenbroek 		break;
252*ef8d499eSDavid van Moolenbroek 
253*ef8d499eSDavid van Moolenbroek 	default:
254*ef8d499eSDavid van Moolenbroek 		return EPROTONOSUPPORT;
255*ef8d499eSDavid van Moolenbroek 	}
256*ef8d499eSDavid van Moolenbroek 
257*ef8d499eSDavid van Moolenbroek 	if (TAILQ_EMPTY(&tcp_freelist))
258*ef8d499eSDavid van Moolenbroek 		return ENOBUFS;
259*ef8d499eSDavid van Moolenbroek 
260*ef8d499eSDavid van Moolenbroek 	tcp = TAILQ_FIRST(&tcp_freelist);
261*ef8d499eSDavid van Moolenbroek 
262*ef8d499eSDavid van Moolenbroek 	/*
263*ef8d499eSDavid van Moolenbroek 	 * Initialize the structure.  Do not memset it to zero, as it is still
264*ef8d499eSDavid van Moolenbroek 	 * part of the linked free list.  Initialization may still fail.  When
265*ef8d499eSDavid van Moolenbroek 	 * adding new fields, make sure to change tcpsock_clone() accordingly.
266*ef8d499eSDavid van Moolenbroek 	 */
267*ef8d499eSDavid van Moolenbroek 
268*ef8d499eSDavid van Moolenbroek 	ip_type = ipsock_socket(tcpsock_get_ipsock(tcp), domain,
269*ef8d499eSDavid van Moolenbroek 	    TCP_SNDBUF_DEF, TCP_RCVBUF_DEF, sockp);
270*ef8d499eSDavid van Moolenbroek 
271*ef8d499eSDavid van Moolenbroek 	if ((tcp->tcp_pcb = tcp_new_ip_type(ip_type)) == NULL)
272*ef8d499eSDavid van Moolenbroek 		return ENOBUFS;
273*ef8d499eSDavid van Moolenbroek 	tcp_arg(tcp->tcp_pcb, tcp);
274*ef8d499eSDavid van Moolenbroek 
275*ef8d499eSDavid van Moolenbroek 	tcp->tcp_listener = NULL;
276*ef8d499eSDavid van Moolenbroek 
277*ef8d499eSDavid van Moolenbroek 	tcpsock_reset_send(tcp);
278*ef8d499eSDavid van Moolenbroek 	tcpsock_reset_recv(tcp);
279*ef8d499eSDavid van Moolenbroek 
280*ef8d499eSDavid van Moolenbroek 	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);
281*ef8d499eSDavid van Moolenbroek 
282*ef8d499eSDavid van Moolenbroek 	*ops = &tcpsock_ops;
283*ef8d499eSDavid van Moolenbroek 	return tcpsock_get_id(tcp);
284*ef8d499eSDavid van Moolenbroek }
285*ef8d499eSDavid van Moolenbroek 
286*ef8d499eSDavid van Moolenbroek /*
287*ef8d499eSDavid van Moolenbroek  * Create a TCP socket for the TCP PCB 'pcb' which identifies a new connection
288*ef8d499eSDavid van Moolenbroek  * incoming on listening socket 'listener'.  The new socket is essentially a
289*ef8d499eSDavid van Moolenbroek  * "clone" of the listening TCP socket, in that it should inherit any settings
290*ef8d499eSDavid van Moolenbroek  * from the listening socket.  The socket has not yet been accepted by userland
291*ef8d499eSDavid van Moolenbroek  * so add it to the queue of connetions pending for the listening socket.  On
292*ef8d499eSDavid van Moolenbroek  * success, return OK.  On failure, return a negative error code.
293*ef8d499eSDavid van Moolenbroek  */
294*ef8d499eSDavid van Moolenbroek static int
tcpsock_clone(struct tcpsock * listener,struct tcp_pcb * pcb)295*ef8d499eSDavid van Moolenbroek tcpsock_clone(struct tcpsock * listener, struct tcp_pcb * pcb)
296*ef8d499eSDavid van Moolenbroek {
297*ef8d499eSDavid van Moolenbroek 	struct tcpsock *tcp;
298*ef8d499eSDavid van Moolenbroek 
299*ef8d499eSDavid van Moolenbroek 	if (TAILQ_EMPTY(&tcp_freelist))
300*ef8d499eSDavid van Moolenbroek 		return ENOBUFS;
301*ef8d499eSDavid van Moolenbroek 
302*ef8d499eSDavid van Moolenbroek 	tcp = TAILQ_FIRST(&tcp_freelist);
303*ef8d499eSDavid van Moolenbroek 
304*ef8d499eSDavid van Moolenbroek 	/*
305*ef8d499eSDavid van Moolenbroek 	 * Initialize the structure.  Do not memset it to zero, as it is still
306*ef8d499eSDavid van Moolenbroek 	 * part of the linked free list.  Initialization may still fail.  Most
307*ef8d499eSDavid van Moolenbroek 	 * settings should be inherited from the listening socket here, rather
308*ef8d499eSDavid van Moolenbroek 	 * than being initialized to their default state.
309*ef8d499eSDavid van Moolenbroek 	 */
310*ef8d499eSDavid van Moolenbroek 
311*ef8d499eSDavid van Moolenbroek 	ipsock_clone(tcpsock_get_ipsock(listener), tcpsock_get_ipsock(tcp),
312*ef8d499eSDavid van Moolenbroek 	    tcpsock_get_id(tcp));
313*ef8d499eSDavid van Moolenbroek 
314*ef8d499eSDavid van Moolenbroek 	tcp->tcp_pcb = pcb;
315*ef8d499eSDavid van Moolenbroek 	tcp_arg(pcb, tcp);
316*ef8d499eSDavid van Moolenbroek 
317*ef8d499eSDavid van Moolenbroek 	tcpsock_reset_send(tcp);
318*ef8d499eSDavid van Moolenbroek 	tcpsock_reset_recv(tcp);
319*ef8d499eSDavid van Moolenbroek 
320*ef8d499eSDavid van Moolenbroek 	/*
321*ef8d499eSDavid van Moolenbroek 	 * Remove the new socket from the free list, and add it to the queue of
322*ef8d499eSDavid van Moolenbroek 	 * the listening socket--in this order, because the same next pointer
323*ef8d499eSDavid van Moolenbroek 	 * is used for both.
324*ef8d499eSDavid van Moolenbroek 	 */
325*ef8d499eSDavid van Moolenbroek 	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);
326*ef8d499eSDavid van Moolenbroek 
327*ef8d499eSDavid van Moolenbroek 	TAILQ_INSERT_TAIL(&listener->tcp_queue.tq_head, tcp,
328*ef8d499eSDavid van Moolenbroek 	    tcp_queue.tq_next);
329*ef8d499eSDavid van Moolenbroek 	tcp->tcp_listener = listener;
330*ef8d499eSDavid van Moolenbroek 
331*ef8d499eSDavid van Moolenbroek 	return OK;
332*ef8d499eSDavid van Moolenbroek }
333*ef8d499eSDavid van Moolenbroek 
334*ef8d499eSDavid van Moolenbroek /*
335*ef8d499eSDavid van Moolenbroek  * Allocate a buffer from the pool, using the standard pool size.  The returned
336*ef8d499eSDavid van Moolenbroek  * buffer is a single element--never a chain.
337*ef8d499eSDavid van Moolenbroek  */
338*ef8d499eSDavid van Moolenbroek static struct pbuf *
tcpsock_alloc_buf(void)339*ef8d499eSDavid van Moolenbroek tcpsock_alloc_buf(void)
340*ef8d499eSDavid van Moolenbroek {
341*ef8d499eSDavid van Moolenbroek 	struct pbuf *pbuf;
342*ef8d499eSDavid van Moolenbroek 
343*ef8d499eSDavid van Moolenbroek 	pbuf = pbuf_alloc(PBUF_RAW, MEMPOOL_BUFSIZE, PBUF_RAM);
344*ef8d499eSDavid van Moolenbroek 
345*ef8d499eSDavid van Moolenbroek 	assert(pbuf == NULL || pbuf->len == pbuf->tot_len);
346*ef8d499eSDavid van Moolenbroek 
347*ef8d499eSDavid van Moolenbroek 	return pbuf;
348*ef8d499eSDavid van Moolenbroek }
349*ef8d499eSDavid van Moolenbroek 
350*ef8d499eSDavid van Moolenbroek /*
351*ef8d499eSDavid van Moolenbroek  * Free the given buffer.  Ensure that pbuf_free() will not attempt to free the
352*ef8d499eSDavid van Moolenbroek  * next buffer(s) in the chain as well.  This may be called for pbufs other
353*ef8d499eSDavid van Moolenbroek  * than those allocated with tcpsock_alloc_buf().
354*ef8d499eSDavid van Moolenbroek  */
355*ef8d499eSDavid van Moolenbroek static void
tcpsock_free_buf(struct pbuf * pbuf)356*ef8d499eSDavid van Moolenbroek tcpsock_free_buf(struct pbuf * pbuf)
357*ef8d499eSDavid van Moolenbroek {
358*ef8d499eSDavid van Moolenbroek 
359*ef8d499eSDavid van Moolenbroek 	/*
360*ef8d499eSDavid van Moolenbroek 	 * Resetting the length is currently not necessary, but better safe
361*ef8d499eSDavid van Moolenbroek 	 * than sorry..
362*ef8d499eSDavid van Moolenbroek 	 */
363*ef8d499eSDavid van Moolenbroek 	pbuf->len = pbuf->tot_len;
364*ef8d499eSDavid van Moolenbroek 	pbuf->next = NULL;
365*ef8d499eSDavid van Moolenbroek 
366*ef8d499eSDavid van Moolenbroek 	pbuf_free(pbuf);
367*ef8d499eSDavid van Moolenbroek }
368*ef8d499eSDavid van Moolenbroek 
369*ef8d499eSDavid van Moolenbroek /*
370*ef8d499eSDavid van Moolenbroek  * Clear the send queue of a TCP socket.  The caller must ensure that lwIP will
371*ef8d499eSDavid van Moolenbroek  * no longer access any of data on the send queue.
372*ef8d499eSDavid van Moolenbroek  */
373*ef8d499eSDavid van Moolenbroek static void
tcpsock_clear_send(struct tcpsock * tcp)374*ef8d499eSDavid van Moolenbroek tcpsock_clear_send(struct tcpsock * tcp)
375*ef8d499eSDavid van Moolenbroek {
376*ef8d499eSDavid van Moolenbroek 	struct pbuf *phead;
377*ef8d499eSDavid van Moolenbroek 
378*ef8d499eSDavid van Moolenbroek 	assert(tcp->tcp_pcb == NULL);
379*ef8d499eSDavid van Moolenbroek 
380*ef8d499eSDavid van Moolenbroek 	while ((phead = tcp->tcp_snd.ts_head) != NULL) {
381*ef8d499eSDavid van Moolenbroek 		tcp->tcp_snd.ts_head = phead->next;
382*ef8d499eSDavid van Moolenbroek 
383*ef8d499eSDavid van Moolenbroek 		assert(tcpsock_sendbufs > 0);
384*ef8d499eSDavid van Moolenbroek 		tcpsock_sendbufs--;
385*ef8d499eSDavid van Moolenbroek 
386*ef8d499eSDavid van Moolenbroek 		tcpsock_free_buf(phead);
387*ef8d499eSDavid van Moolenbroek 	}
388*ef8d499eSDavid van Moolenbroek 
389*ef8d499eSDavid van Moolenbroek 	tcpsock_reset_send(tcp);
390*ef8d499eSDavid van Moolenbroek }
391*ef8d499eSDavid van Moolenbroek 
392*ef8d499eSDavid van Moolenbroek /*
393*ef8d499eSDavid van Moolenbroek  * Clear the receive queue of a TCP socket.  If 'ack_data' is set, also
394*ef8d499eSDavid van Moolenbroek  * acknowledge the previous contents of the receive queue to lwIP.
395*ef8d499eSDavid van Moolenbroek  */
396*ef8d499eSDavid van Moolenbroek static size_t
tcpsock_clear_recv(struct tcpsock * tcp,int ack_data)397*ef8d499eSDavid van Moolenbroek tcpsock_clear_recv(struct tcpsock * tcp, int ack_data)
398*ef8d499eSDavid van Moolenbroek {
399*ef8d499eSDavid van Moolenbroek 	struct pbuf *phead;
400*ef8d499eSDavid van Moolenbroek 	size_t rlen;
401*ef8d499eSDavid van Moolenbroek 
402*ef8d499eSDavid van Moolenbroek 	rlen = tcp->tcp_rcv.tr_len;
403*ef8d499eSDavid van Moolenbroek 
404*ef8d499eSDavid van Moolenbroek 	while ((phead = tcp->tcp_rcv.tr_head) != NULL) {
405*ef8d499eSDavid van Moolenbroek 		tcp->tcp_rcv.tr_head = phead->next;
406*ef8d499eSDavid van Moolenbroek 
407*ef8d499eSDavid van Moolenbroek 		assert(tcpsock_recvbufs > 0);
408*ef8d499eSDavid van Moolenbroek 		tcpsock_recvbufs--;
409*ef8d499eSDavid van Moolenbroek 
410*ef8d499eSDavid van Moolenbroek 		tcpsock_free_buf(phead);
411*ef8d499eSDavid van Moolenbroek 	}
412*ef8d499eSDavid van Moolenbroek 
413*ef8d499eSDavid van Moolenbroek 	/*
414*ef8d499eSDavid van Moolenbroek 	 * From now on, we will basically be discarding incoming data as fast
415*ef8d499eSDavid van Moolenbroek 	 * as possible, to keep the full window open at all times.
416*ef8d499eSDavid van Moolenbroek 	 */
417*ef8d499eSDavid van Moolenbroek 	if (ack_data && tcp->tcp_pcb != NULL && tcp->tcp_rcv.tr_unacked > 0)
418*ef8d499eSDavid van Moolenbroek 		tcp_recved(tcp->tcp_pcb, tcp->tcp_rcv.tr_unacked);
419*ef8d499eSDavid van Moolenbroek 
420*ef8d499eSDavid van Moolenbroek 	tcpsock_reset_recv(tcp);
421*ef8d499eSDavid van Moolenbroek 
422*ef8d499eSDavid van Moolenbroek 	return rlen;
423*ef8d499eSDavid van Moolenbroek }
424*ef8d499eSDavid van Moolenbroek 
/*
 * The TCP socket's PCB has been detached from the socket, typically because
 * the connection was aborted, either by us or by lwIP.  Either way, any TCP
 * connection is gone.  Clear the socket's send queue, remove the socket from
 * a listening socket's queue, and if the socket itself is ready and allowed to
 * be freed, free it now.  The socket is ready to be freed if it was either on
 * a listening queue or being closed already.  The socket is allowed to be
 * freed only if 'may_free' is TRUE.  If the socket is not freed, its receive
 * queue is left as is, as it may still have data to be received by userland.
 * Return TRUE if the socket was ready to be freed, FALSE otherwise.
 */
static int
tcpsock_cleanup(struct tcpsock * tcp, int may_free)
{
	int destroy;

	/* The PCB must already have been detached by the caller. */
	assert(tcp->tcp_pcb == NULL);

	/*
	 * Free any data on the send queue.  This is safe to do right now,
	 * because the PCB has been aborted (or was already gone).  We must be
	 * very careful about clearing the send queue in all other situations.
	 */
	tcpsock_clear_send(tcp);

	/*
	 * If this was a socket pending acceptance, remove it from the
	 * corresponding listener socket's queue, and free it.  Otherwise, free
	 * the socket only if it suspended a graceful close operation.
	 */
	if (tcp->tcp_listener != NULL) {
		TAILQ_REMOVE(&tcp->tcp_listener->tcp_queue.tq_head, tcp,
		    tcp_queue.tq_next);
		tcp->tcp_listener = NULL;

		/*
		 * The listener socket's backlog count should be adjusted by
		 * lwIP whenever the PCB is freed up, so we need (and must) not
		 * attempt to do that here.
		 */

		destroy = TRUE;
	} else
		destroy = sockevent_is_closing(tcpsock_get_sock(tcp));

	/*
	 * Do not free the socket if 'may_free' is FALSE.  That flag may be set
	 * if we are currently in the second tcpsock_close() call on the
	 * socket, in which case sockevent_is_closing() is TRUE but we must
	 * still not free the socket now: doing so would derail libsockevent.
	 */
	if (destroy && may_free) {
		/* Discard pending receive data without acknowledging it. */
		(void)tcpsock_clear_recv(tcp, FALSE /*ack_data*/);

		sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE);
	}

	return destroy;
483*ef8d499eSDavid van Moolenbroek 
484*ef8d499eSDavid van Moolenbroek /*
485*ef8d499eSDavid van Moolenbroek  * Abort the lwIP PCB for the given socket, using tcp_abort().  If the PCB is
486*ef8d499eSDavid van Moolenbroek  * connected, this will cause the connection to be reset.  The PCB, which must
487*ef8d499eSDavid van Moolenbroek  * have still been present before the call, will be gone after the call.
488*ef8d499eSDavid van Moolenbroek  */
489*ef8d499eSDavid van Moolenbroek static void
tcpsock_pcb_abort(struct tcpsock * tcp)490*ef8d499eSDavid van Moolenbroek tcpsock_pcb_abort(struct tcpsock * tcp)
491*ef8d499eSDavid van Moolenbroek {
492*ef8d499eSDavid van Moolenbroek 
493*ef8d499eSDavid van Moolenbroek 	assert(tcp->tcp_pcb != NULL);
494*ef8d499eSDavid van Moolenbroek 	assert(!tcpsock_is_listening(tcp));
495*ef8d499eSDavid van Moolenbroek 
496*ef8d499eSDavid van Moolenbroek 	tcp_recv(tcp->tcp_pcb, NULL);
497*ef8d499eSDavid van Moolenbroek 	tcp_sent(tcp->tcp_pcb, NULL);
498*ef8d499eSDavid van Moolenbroek 	tcp_err(tcp->tcp_pcb, NULL);
499*ef8d499eSDavid van Moolenbroek 	tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL);
500*ef8d499eSDavid van Moolenbroek 
501*ef8d499eSDavid van Moolenbroek 	tcp_arg(tcp->tcp_pcb, NULL);
502*ef8d499eSDavid van Moolenbroek 
503*ef8d499eSDavid van Moolenbroek 	tcp_abort(tcp->tcp_pcb);
504*ef8d499eSDavid van Moolenbroek 
505*ef8d499eSDavid van Moolenbroek 	tcp->tcp_pcb = NULL;
506*ef8d499eSDavid van Moolenbroek }
507*ef8d499eSDavid van Moolenbroek 
508*ef8d499eSDavid van Moolenbroek /*
509*ef8d499eSDavid van Moolenbroek  * Close the lwIP PCB for the given socket, using tcp_close().  If the PCB is
510*ef8d499eSDavid van Moolenbroek  * connected, its graceful close will be finished by lwIP in the background.
511*ef8d499eSDavid van Moolenbroek  * The PCB, which must have still been present before the call, will be gone
512*ef8d499eSDavid van Moolenbroek  * after the call.
513*ef8d499eSDavid van Moolenbroek  */
514*ef8d499eSDavid van Moolenbroek static void
tcpsock_pcb_close(struct tcpsock * tcp)515*ef8d499eSDavid van Moolenbroek tcpsock_pcb_close(struct tcpsock * tcp)
516*ef8d499eSDavid van Moolenbroek {
517*ef8d499eSDavid van Moolenbroek 	err_t err;
518*ef8d499eSDavid van Moolenbroek 
519*ef8d499eSDavid van Moolenbroek 	assert(tcp->tcp_pcb != NULL);
520*ef8d499eSDavid van Moolenbroek 	assert(tcp->tcp_snd.ts_len == 0);
521*ef8d499eSDavid van Moolenbroek 
522*ef8d499eSDavid van Moolenbroek 	if (!tcpsock_is_listening(tcp)) {
523*ef8d499eSDavid van Moolenbroek 		tcp_recv(tcp->tcp_pcb, NULL);
524*ef8d499eSDavid van Moolenbroek 		tcp_sent(tcp->tcp_pcb, NULL);
525*ef8d499eSDavid van Moolenbroek 		tcp_err(tcp->tcp_pcb, NULL);
526*ef8d499eSDavid van Moolenbroek 		tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL);
527*ef8d499eSDavid van Moolenbroek 	}
528*ef8d499eSDavid van Moolenbroek 
529*ef8d499eSDavid van Moolenbroek 	tcp_arg(tcp->tcp_pcb, NULL);
530*ef8d499eSDavid van Moolenbroek 
531*ef8d499eSDavid van Moolenbroek 	if ((err = tcp_close(tcp->tcp_pcb)) != ERR_OK)
532*ef8d499eSDavid van Moolenbroek 		panic("unexpected TCP close failure: %d", err);
533*ef8d499eSDavid van Moolenbroek 
534*ef8d499eSDavid van Moolenbroek 	tcp->tcp_pcb = NULL;
535*ef8d499eSDavid van Moolenbroek }
536*ef8d499eSDavid van Moolenbroek 
537*ef8d499eSDavid van Moolenbroek /*
538*ef8d499eSDavid van Moolenbroek  * Return TRUE if all conditions are met for closing the TCP socket's PCB, or
539*ef8d499eSDavid van Moolenbroek  * FALSE if they are not.  Upon calling this function, the socket's PCB must
540*ef8d499eSDavid van Moolenbroek  * still be around.
541*ef8d499eSDavid van Moolenbroek  */
542*ef8d499eSDavid van Moolenbroek static int
tcpsock_may_close(struct tcpsock * tcp)543*ef8d499eSDavid van Moolenbroek tcpsock_may_close(struct tcpsock * tcp)
544*ef8d499eSDavid van Moolenbroek {
545*ef8d499eSDavid van Moolenbroek 
546*ef8d499eSDavid van Moolenbroek 	assert(tcp->tcp_pcb != NULL);
547*ef8d499eSDavid van Moolenbroek 
548*ef8d499eSDavid van Moolenbroek 	/*
549*ef8d499eSDavid van Moolenbroek 	 * Regular closing of the PCB requires three conditions to be met:
550*ef8d499eSDavid van Moolenbroek 	 *
551*ef8d499eSDavid van Moolenbroek 	 * 1. all our data has been transmitted AND acknowledged, so that we do
552*ef8d499eSDavid van Moolenbroek 	 *    not risk corruption in case there are still unsent or unack'ed
553*ef8d499eSDavid van Moolenbroek 	 *    data buffers that may otherwise be recycled too soon;
554*ef8d499eSDavid van Moolenbroek 	 * 2. we have sent our FIN to the peer; and,
555*ef8d499eSDavid van Moolenbroek 	 * 3. we have received a FIN from the peer.
556*ef8d499eSDavid van Moolenbroek 	 */
557*ef8d499eSDavid van Moolenbroek 	return ((tcpsock_get_flags(tcp) & (TCPF_SENT_FIN | TCPF_RCVD_FIN)) ==
558*ef8d499eSDavid van Moolenbroek 	    (TCPF_SENT_FIN | TCPF_RCVD_FIN) && tcp->tcp_snd.ts_len == 0);
559*ef8d499eSDavid van Moolenbroek }
560*ef8d499eSDavid van Moolenbroek 
561*ef8d499eSDavid van Moolenbroek /*
562*ef8d499eSDavid van Moolenbroek  * The given socket is ready to be closed as per the tcpsock_may_close() rules.
563*ef8d499eSDavid van Moolenbroek  * This implies that its send queue is already empty.  Gracefully close the
564*ef8d499eSDavid van Moolenbroek  * PCB.  In addition, if the socket is being closed gracefully, meaning we
565*ef8d499eSDavid van Moolenbroek  * suspended an earlier tcpsock_close() call (and as such already emptied the
566*ef8d499eSDavid van Moolenbroek  * receive queue as well), then tell libsockevent that the close is finished,
567*ef8d499eSDavid van Moolenbroek  * freeing the socket.  Return TRUE if the socket has indeed been freed this
568*ef8d499eSDavid van Moolenbroek  * way, or FALSE if the socket is still around.
569*ef8d499eSDavid van Moolenbroek  */
570*ef8d499eSDavid van Moolenbroek static int
tcpsock_finish_close(struct tcpsock * tcp)571*ef8d499eSDavid van Moolenbroek tcpsock_finish_close(struct tcpsock * tcp)
572*ef8d499eSDavid van Moolenbroek {
573*ef8d499eSDavid van Moolenbroek 
574*ef8d499eSDavid van Moolenbroek 	assert(tcp->tcp_snd.ts_len == 0);
575*ef8d499eSDavid van Moolenbroek 	assert(tcp->tcp_listener == NULL);
576*ef8d499eSDavid van Moolenbroek 
577*ef8d499eSDavid van Moolenbroek 	/*
578*ef8d499eSDavid van Moolenbroek 	 * If we get here, we have already shut down the sending side of the
579*ef8d499eSDavid van Moolenbroek 	 * PCB.  Technically, we are interested only in shutting down the
580*ef8d499eSDavid van Moolenbroek 	 * receiving side of the PCB here, so that lwIP may decide to recycle
581*ef8d499eSDavid van Moolenbroek 	 * the socket later etcetera.  We call tcp_close() because we do not
582*ef8d499eSDavid van Moolenbroek 	 * want to rely on tcp_shutdown(RX) doing the exact same thing.
583*ef8d499eSDavid van Moolenbroek 	 * However, we do rely on the fact that the PCB is not immediately
584*ef8d499eSDavid van Moolenbroek 	 * destroyed by the tcp_close() call: otherwise we may have to return
585*ef8d499eSDavid van Moolenbroek 	 * ERR_ABRT if this function is called from a lwIP-generated event.
586*ef8d499eSDavid van Moolenbroek 	 */
587*ef8d499eSDavid van Moolenbroek 	tcpsock_pcb_close(tcp);
588*ef8d499eSDavid van Moolenbroek 
589*ef8d499eSDavid van Moolenbroek 	/*
590*ef8d499eSDavid van Moolenbroek 	 * If we suspended an earlier tcpsock_close() call, we have to tell
591*ef8d499eSDavid van Moolenbroek 	 * libsockevent that the close operation is now complete.
592*ef8d499eSDavid van Moolenbroek 	 */
593*ef8d499eSDavid van Moolenbroek 	if (sockevent_is_closing(tcpsock_get_sock(tcp))) {
594*ef8d499eSDavid van Moolenbroek 		assert(tcp->tcp_rcv.tr_len == 0);
595*ef8d499eSDavid van Moolenbroek 
596*ef8d499eSDavid van Moolenbroek 		sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE);
597*ef8d499eSDavid van Moolenbroek 
598*ef8d499eSDavid van Moolenbroek 		return TRUE;
599*ef8d499eSDavid van Moolenbroek 	} else
600*ef8d499eSDavid van Moolenbroek 		return FALSE;
601*ef8d499eSDavid van Moolenbroek }
602*ef8d499eSDavid van Moolenbroek 
603*ef8d499eSDavid van Moolenbroek /*
604*ef8d499eSDavid van Moolenbroek  * Attempt to start or resume enqueuing data and/or a FIN to send on the given
605*ef8d499eSDavid van Moolenbroek  * TCP socket.  Return TRUE if anything at all could be newly enqueued on the
606*ef8d499eSDavid van Moolenbroek  * lwIP PCB, even if less than desired.  In that case, the caller should try to
607*ef8d499eSDavid van Moolenbroek  * send whatever was enqueued, and if applicable, check if the socket may now
608*ef8d499eSDavid van Moolenbroek  * be closed (due to the FIN being enqueued).  In particular, in any situation
609*ef8d499eSDavid van Moolenbroek  * where the socket may be in the process of being closed, the caller must use
610*ef8d499eSDavid van Moolenbroek  * tcpsock_may_close() if TRUE is returned.  Return FALSE if nothing new could
611*ef8d499eSDavid van Moolenbroek  * be enqueued, in which case no send attempt need to be made either.
612*ef8d499eSDavid van Moolenbroek  */
static int
tcpsock_pcb_enqueue(struct tcpsock * tcp)
{
	struct pbuf *punsent;	/* current head of our unsent data chain */
	size_t space, chunk;	/* lwIP send buffer space; bytes to enqueue */
	unsigned int flags;	/* tcp_write() flags (TCP_WRITE_FLAG_MORE) */
	err_t err;
	int enqueued;		/* return value: did we enqueue anything? */

	assert(tcp->tcp_pcb != NULL);

	/*
	 * TCPF_FULL is set when a previous tcp_shutdown() attempt failed with
	 * ERR_MEM (see below), and cleared again from the "sent" event
	 * callback.  While it is set, retrying would be pointless.
	 */
	if (tcpsock_get_flags(tcp) & TCPF_FULL)
		return FALSE;

	/*
	 * Attempt to enqueue more unsent data, if any, on the PCB's send
	 * queue.
	 */
	enqueued = FALSE;

	while (tcp->tcp_snd.ts_unsent != NULL) {
		if ((space = tcp_sndbuf(tcp->tcp_pcb)) == 0)
			break;

		/*
		 * We may maintain a non-NULL unsent pointer even when there is
		 * nothing more to send right now, because the tail buffer may
		 * be filled up further later on.
		 */
		punsent = tcp->tcp_snd.ts_unsent;

		assert(punsent->len >= tcp->tcp_snd.ts_unsent_off);

		chunk = (size_t)punsent->len - tcp->tcp_snd.ts_unsent_off;
		if (chunk == 0)
			break;

		if (chunk > space)
			chunk = space;

		/* Try to enqueue more data for sending. */
		if (chunk < punsent->len || punsent->next != NULL)
			flags = TCP_WRITE_FLAG_MORE;
		else
			flags = 0;

		err = tcp_write(tcp->tcp_pcb, (char *)punsent->payload +
		    tcp->tcp_snd.ts_unsent_off, chunk, flags);

		/*
		 * Since tcp_write() enqueues data only, it should only return
		 * out-of-memory errors; no fatal ones.  In any case, stop.
		 */
		if (err != ERR_OK) {
			assert(err == ERR_MEM);

			break;
		}

		/* We have successfully enqueued data. */
		enqueued = TRUE;

		tcp->tcp_snd.ts_unsent_off += chunk;

		/*
		 * If this buffer was only partially enqueued, stay on it; the
		 * assertion shows why: a partial write can happen only at the
		 * chain's tail (or due to limited send buffer space).
		 */
		if (tcp->tcp_snd.ts_unsent_off < punsent->tot_len) {
			assert(tcp->tcp_snd.ts_unsent_off < punsent->len ||
			    punsent->next == NULL);

			break;
		}

		/* The buffer was fully enqueued; advance to the next one. */
		tcp->tcp_snd.ts_unsent = punsent->next;
		tcp->tcp_snd.ts_unsent_off = 0;
	}

	/*
	 * If all pending data has been enqueued for sending, and we should
	 * shut down the sending end of the socket, try that now.
	 */
	if ((tcp->tcp_snd.ts_unsent == NULL ||
	    tcp->tcp_snd.ts_unsent_off == tcp->tcp_snd.ts_unsent->len) &&
	    tcpsock_is_shutdown(tcp, SFL_SHUT_WR) &&
	    !(tcpsock_get_flags(tcp) & TCPF_SENT_FIN)) {
		err = tcp_shutdown(tcp->tcp_pcb, 0 /*shut_rx*/, 1 /*shut_tx*/);

		if (err == ERR_OK) {
			/*
			 * We have successfully enqueued a FIN.  The caller is
			 * now responsible for checking whether the PCB and
			 * possibly even the socket object can now be freed.
			 */
			tcpsock_set_flag(tcp, TCPF_SENT_FIN);

			enqueued = TRUE;
		} else {
			assert(err == ERR_MEM);

			/*
			 * FIXME: the resolution for lwIP bug #47485 has taken
			 * away even more control over the closing process from
			 * us, making tracking sockets especially for SO_LINGER
			 * even harder.  For now, we simply effectively undo
			 * the patch by clearing TF_CLOSEPEND if tcp_shutdown()
			 * returns ERR_MEM.  This will not be sustainable in
			 * the long term, though.
			 */
			tcp->tcp_pcb->flags &= ~TF_CLOSEPEND;

			/* Retry the FIN from the "sent" event callback. */
			tcpsock_set_flag(tcp, TCPF_FULL);
		}
	}

	return enqueued;
}
727*ef8d499eSDavid van Moolenbroek 
728*ef8d499eSDavid van Moolenbroek /*
729*ef8d499eSDavid van Moolenbroek  * Request lwIP to start sending any enqueued data and/or FIN on the TCP
730*ef8d499eSDavid van Moolenbroek  * socket's lwIP PCB.  On success, return OK.  On failure, return a negative
731*ef8d499eSDavid van Moolenbroek  * error code, after cleaning up the socket, freeing the PCB.  If the socket
732*ef8d499eSDavid van Moolenbroek  * was already being closed, also free the socket object in that case; the
733*ef8d499eSDavid van Moolenbroek  * caller must then not touch the socket object anymore upon return.  If the
734*ef8d499eSDavid van Moolenbroek  * socket object is not freed, and if 'raise_error' is TRUE, raise the error
735*ef8d499eSDavid van Moolenbroek  * on the socket object.
736*ef8d499eSDavid van Moolenbroek  */
737*ef8d499eSDavid van Moolenbroek static int
tcpsock_pcb_send(struct tcpsock * tcp,int raise_error)738*ef8d499eSDavid van Moolenbroek tcpsock_pcb_send(struct tcpsock * tcp, int raise_error)
739*ef8d499eSDavid van Moolenbroek {
740*ef8d499eSDavid van Moolenbroek 	err_t err;
741*ef8d499eSDavid van Moolenbroek 	int r;
742*ef8d499eSDavid van Moolenbroek 
743*ef8d499eSDavid van Moolenbroek 	assert(tcp->tcp_pcb != NULL);
744*ef8d499eSDavid van Moolenbroek 
745*ef8d499eSDavid van Moolenbroek 	/*
746*ef8d499eSDavid van Moolenbroek 	 * If we have enqueued something, ask lwIP to send TCP packets now.
747*ef8d499eSDavid van Moolenbroek 	 * This may result in a fatal error, in which case we clean up the
748*ef8d499eSDavid van Moolenbroek 	 * socket and return the error to the caller.  Since cleaning up the
749*ef8d499eSDavid van Moolenbroek 	 * socket may free the socket object, and the caller cannot tell
750*ef8d499eSDavid van Moolenbroek 	 * whether that will happen or has happened, also possibly raise the
751*ef8d499eSDavid van Moolenbroek 	 * error on the socket object if it is not gone.  As such, callers that
752*ef8d499eSDavid van Moolenbroek 	 * set 'raise_error' to FALSE must know for sure that the socket was
753*ef8d499eSDavid van Moolenbroek 	 * not being closed, for example because the caller is processing a
754*ef8d499eSDavid van Moolenbroek 	 * (send) call from userland.
755*ef8d499eSDavid van Moolenbroek 	 */
756*ef8d499eSDavid van Moolenbroek 	err = tcp_output(tcp->tcp_pcb);
757*ef8d499eSDavid van Moolenbroek 
758*ef8d499eSDavid van Moolenbroek 	if (err != ERR_OK && err != ERR_MEM) {
759*ef8d499eSDavid van Moolenbroek 		tcpsock_pcb_abort(tcp);
760*ef8d499eSDavid van Moolenbroek 
761*ef8d499eSDavid van Moolenbroek 		r = util_convert_err(err);
762*ef8d499eSDavid van Moolenbroek 
763*ef8d499eSDavid van Moolenbroek 		if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) {
764*ef8d499eSDavid van Moolenbroek 			if (raise_error)
765*ef8d499eSDavid van Moolenbroek 				sockevent_set_error(tcpsock_get_sock(tcp), r);
766*ef8d499eSDavid van Moolenbroek 		}
767*ef8d499eSDavid van Moolenbroek 		/* Otherwise, do not touch the socket object anymore! */
768*ef8d499eSDavid van Moolenbroek 
769*ef8d499eSDavid van Moolenbroek 		return r;
770*ef8d499eSDavid van Moolenbroek 	} else
771*ef8d499eSDavid van Moolenbroek 		return OK;
772*ef8d499eSDavid van Moolenbroek }
773*ef8d499eSDavid van Moolenbroek 
774*ef8d499eSDavid van Moolenbroek /*
775*ef8d499eSDavid van Moolenbroek  * Callback from lwIP.  The given number of data bytes have been acknowledged
776*ef8d499eSDavid van Moolenbroek  * as received by the remote end.  Dequeue and free data from the TCP socket's
777*ef8d499eSDavid van Moolenbroek  * send queue as appropriate.
778*ef8d499eSDavid van Moolenbroek  */
static err_t
tcpsock_event_sent(void * arg, struct tcp_pcb * pcb __unused, uint16_t len)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	struct pbuf *phead;	/* head buffer of the socket's send queue */
	size_t left;		/* acknowledged bytes not yet dequeued */

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);
	assert(len > 0);

	assert(tcp->tcp_snd.ts_len >= len);
	assert(tcp->tcp_snd.ts_head != NULL);

	left = len;

	/*
	 * First see if we can free up whole buffers.  Check against the head
	 * buffer's 'len' rather than 'tot_len', or we may end up leaving an
	 * empty buffer on the chain.
	 */
	while ((phead = tcp->tcp_snd.ts_head) != NULL &&
	    left >= (size_t)phead->len - tcp->tcp_snd.ts_head_off) {
		left -= (size_t)phead->len - tcp->tcp_snd.ts_head_off;

		tcp->tcp_snd.ts_head = phead->next;
		tcp->tcp_snd.ts_head_off = 0;

		/*
		 * If the unsent pointer still points into the buffer being
		 * freed, it must have been fully enqueued; advance it along.
		 */
		if (phead == tcp->tcp_snd.ts_unsent) {
			assert(tcp->tcp_snd.ts_unsent_off == phead->len);

			tcp->tcp_snd.ts_unsent = phead->next;
			tcp->tcp_snd.ts_unsent_off = 0;
		}

		/* Maintain the global send-buffer accounting. */
		assert(tcpsock_sendbufs > 0);
		tcpsock_sendbufs--;

		tcpsock_free_buf(phead);
	}

	/*
	 * The rest of the given length is for less than the current head
	 * buffer.
	 */
	if (left > 0) {
		assert(tcp->tcp_snd.ts_head != NULL);
		assert((size_t)tcp->tcp_snd.ts_head->len -
		    tcp->tcp_snd.ts_head_off > left);

		tcp->tcp_snd.ts_head_off += left;
	}

	tcp->tcp_snd.ts_len -= (size_t)len;

	/* An empty queue must have consistent head/unsent/tail pointers. */
	if (tcp->tcp_snd.ts_head == NULL) {
		assert(tcp->tcp_snd.ts_len == 0);
		assert(tcp->tcp_snd.ts_unsent == NULL);
		tcp->tcp_snd.ts_tail = NULL;
	} else
		assert(tcp->tcp_snd.ts_len > 0);

	/*
	 * If we emptied the send queue, and we already managed to send a FIN
	 * earlier, we may now have met all requirements to close the socket's
	 * PCB.  Otherwise, we may also be able to send more now, so try to
	 * resume sending.  Since we are invoked from the "sent" event,
	 * tcp_output() will not actually process anything, and so we do not
	 * call it either.  If we did, we would have to deal with errors here.
	 */
	if (tcpsock_may_close(tcp)) {
		if (tcpsock_finish_close(tcp))
			return ERR_OK;
	} else {
		/* Acknowledged data freed up space; allow enqueuing again. */
		tcpsock_clear_flag(tcp, TCPF_FULL);

		/*
		 * If we now manage to enqueue a FIN, we may be ready to close
		 * the PCB after all.
		 */
		if (tcpsock_pcb_enqueue(tcp)) {
			if (tcpsock_may_close(tcp) &&
			    tcpsock_finish_close(tcp))
				return ERR_OK;
		}
	}

	/* The user may also be able to send more now. */
	sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND);

	return ERR_OK;
}
871*ef8d499eSDavid van Moolenbroek 
872*ef8d499eSDavid van Moolenbroek /*
873*ef8d499eSDavid van Moolenbroek  * Check whether any (additional) data previously received on a TCP socket
874*ef8d499eSDavid van Moolenbroek  * should be acknowledged, possibly allowing the remote end to send additional
875*ef8d499eSDavid van Moolenbroek  * data as a result.
876*ef8d499eSDavid van Moolenbroek  */
877*ef8d499eSDavid van Moolenbroek static void
tcpsock_ack_recv(struct tcpsock * tcp)878*ef8d499eSDavid van Moolenbroek tcpsock_ack_recv(struct tcpsock * tcp)
879*ef8d499eSDavid van Moolenbroek {
880*ef8d499eSDavid van Moolenbroek 	size_t rcvbuf, left, delta, ack;
881*ef8d499eSDavid van Moolenbroek 
882*ef8d499eSDavid van Moolenbroek 	assert(tcp->tcp_pcb != NULL);
883*ef8d499eSDavid van Moolenbroek 
884*ef8d499eSDavid van Moolenbroek 	/*
885*ef8d499eSDavid van Moolenbroek 	 * We must make sure that at all times, we can still add an entire
886*ef8d499eSDavid van Moolenbroek 	 * window's worth of data to the receive queue.  If the amount of free
887*ef8d499eSDavid van Moolenbroek 	 * space drops below that threshold, we stop acknowledging received
888*ef8d499eSDavid van Moolenbroek 	 * data.  The user may change the receive buffer size at all times; we
889*ef8d499eSDavid van Moolenbroek 	 * update the window size lazily as appropriate.
890*ef8d499eSDavid van Moolenbroek 	 */
891*ef8d499eSDavid van Moolenbroek 	rcvbuf = tcpsock_get_rcvbuf(tcp);
892*ef8d499eSDavid van Moolenbroek 
893*ef8d499eSDavid van Moolenbroek 	if (rcvbuf > tcp->tcp_rcv.tr_len && tcp->tcp_rcv.tr_unacked > 0) {
894*ef8d499eSDavid van Moolenbroek 		/*
895*ef8d499eSDavid van Moolenbroek 		 * The number of bytes that lwIP can still give us at any time
896*ef8d499eSDavid van Moolenbroek 		 * is represented as 'left'.  The number of bytes that we still
897*ef8d499eSDavid van Moolenbroek 		 * allow to be stored in the receive queue is represented as
898*ef8d499eSDavid van Moolenbroek 		 * 'delta'.  We must make sure that 'left' does not ever exceed
899*ef8d499eSDavid van Moolenbroek 		 * 'delta' while acknowledging as many bytes as possible under
900*ef8d499eSDavid van Moolenbroek 		 * that rule.
901*ef8d499eSDavid van Moolenbroek 		 */
902*ef8d499eSDavid van Moolenbroek 		left = TCP_WND - tcp->tcp_rcv.tr_unacked;
903*ef8d499eSDavid van Moolenbroek 		delta = rcvbuf - tcp->tcp_rcv.tr_len;
904*ef8d499eSDavid van Moolenbroek 
905*ef8d499eSDavid van Moolenbroek 		if (left < delta) {
906*ef8d499eSDavid van Moolenbroek 			ack = delta - left;
907*ef8d499eSDavid van Moolenbroek 
908*ef8d499eSDavid van Moolenbroek 			if (ack > tcp->tcp_rcv.tr_unacked)
909*ef8d499eSDavid van Moolenbroek 				ack = tcp->tcp_rcv.tr_unacked;
910*ef8d499eSDavid van Moolenbroek 
911*ef8d499eSDavid van Moolenbroek 			tcp_recved(tcp->tcp_pcb, ack);
912*ef8d499eSDavid van Moolenbroek 
913*ef8d499eSDavid van Moolenbroek 			tcp->tcp_rcv.tr_unacked -= ack;
914*ef8d499eSDavid van Moolenbroek 
915*ef8d499eSDavid van Moolenbroek 			assert(tcp->tcp_rcv.tr_len + TCP_WND -
916*ef8d499eSDavid van Moolenbroek 			    tcp->tcp_rcv.tr_unacked <= rcvbuf);
917*ef8d499eSDavid van Moolenbroek 		}
918*ef8d499eSDavid van Moolenbroek 	}
919*ef8d499eSDavid van Moolenbroek }
920*ef8d499eSDavid van Moolenbroek 
921*ef8d499eSDavid van Moolenbroek /*
922*ef8d499eSDavid van Moolenbroek  * Attempt to merge two consecutive underfilled buffers in the receive queue of
923*ef8d499eSDavid van Moolenbroek  * a TCP socket, freeing up one of the two buffers as a result.  The first
924*ef8d499eSDavid van Moolenbroek  * (oldest) buffer is 'ptail', and the pointer to this buffer is stored at
925*ef8d499eSDavid van Moolenbroek  * 'pnext'.  The second (new) buffer is 'pbuf', which is already attached to
926*ef8d499eSDavid van Moolenbroek  * the first buffer.  The second buffer may be followed by additional buffers
927*ef8d499eSDavid van Moolenbroek  * with even more new data.  Return TRUE if buffers have been merged, in which
928*ef8d499eSDavid van Moolenbroek  * case the pointer at 'pnext' may have changed, and no assumptions should be
929*ef8d499eSDavid van Moolenbroek  * made about whether 'ptail' and 'pbuf' still exist in any form.  Return FALSE
930*ef8d499eSDavid van Moolenbroek  * if no merging was necessary or if no new buffer could be allocated.
931*ef8d499eSDavid van Moolenbroek  */
932*ef8d499eSDavid van Moolenbroek static int
tcpsock_try_merge(struct pbuf ** pnext,struct pbuf * ptail,struct pbuf * pbuf)933*ef8d499eSDavid van Moolenbroek tcpsock_try_merge(struct pbuf **pnext, struct pbuf * ptail, struct pbuf * pbuf)
934*ef8d499eSDavid van Moolenbroek {
935*ef8d499eSDavid van Moolenbroek 	struct pbuf *pnew;
936*ef8d499eSDavid van Moolenbroek 
937*ef8d499eSDavid van Moolenbroek 	assert(*pnext == ptail);
938*ef8d499eSDavid van Moolenbroek 	assert(ptail->next == pbuf);
939*ef8d499eSDavid van Moolenbroek 
940*ef8d499eSDavid van Moolenbroek 	/*
941*ef8d499eSDavid van Moolenbroek 	 * Unfortunately, we cannot figure out what kind of pbuf we were given
942*ef8d499eSDavid van Moolenbroek 	 * by the lower layers, so we cannot merge two buffers without first
943*ef8d499eSDavid van Moolenbroek 	 * allocating a third.  Once we have done that, though, we can easily
944*ef8d499eSDavid van Moolenbroek 	 * merge more into that new buffer.  For now we use the following
945*ef8d499eSDavid van Moolenbroek 	 * policies:
946*ef8d499eSDavid van Moolenbroek 	 *
947*ef8d499eSDavid van Moolenbroek 	 * 1. if two consecutive lwIP-provided buffers are both used less than
948*ef8d499eSDavid van Moolenbroek 	 *    half the size of a full buffer, try to allocate a new buffer and
949*ef8d499eSDavid van Moolenbroek 	 *    copy both lwIP-provided buffers into that new buffer, freeing up
950*ef8d499eSDavid van Moolenbroek 	 *    the pair afterwards;
951*ef8d499eSDavid van Moolenbroek 	 * 2. if the tail buffer on the chain is allocated by us and not yet
952*ef8d499eSDavid van Moolenbroek 	 *    full, and the next buffer's contents can be added to the tail
953*ef8d499eSDavid van Moolenbroek 	 *    buffer in their entirety, do just that.
954*ef8d499eSDavid van Moolenbroek 	 *
955*ef8d499eSDavid van Moolenbroek 	 * Obviously there is a trade-off between the performance overhead of
956*ef8d499eSDavid van Moolenbroek 	 * copying and the resource overhead of keeping less-than-full buffers
957*ef8d499eSDavid van Moolenbroek 	 * on the receive queue, but this policy should both keep actual memory
958*ef8d499eSDavid van Moolenbroek 	 * usage to no more than twice the receive queue length and prevent
959*ef8d499eSDavid van Moolenbroek 	 * excessive copying.  The policy deliberately performs more aggressive
960*ef8d499eSDavid van Moolenbroek 	 * merging into a buffer that we allocated ourselves.
961*ef8d499eSDavid van Moolenbroek 	 */
962*ef8d499eSDavid van Moolenbroek 	if (ptail->tot_len <= MEMPOOL_BUFSIZE / 2 &&
963*ef8d499eSDavid van Moolenbroek 	    pbuf->len <= MEMPOOL_BUFSIZE / 2) {
964*ef8d499eSDavid van Moolenbroek 		/*
965*ef8d499eSDavid van Moolenbroek 		 * Case #1.
966*ef8d499eSDavid van Moolenbroek 		 */
967*ef8d499eSDavid van Moolenbroek 		assert(ptail->tot_len == ptail->len);
968*ef8d499eSDavid van Moolenbroek 		assert(pbuf->tot_len == pbuf->len);
969*ef8d499eSDavid van Moolenbroek 
970*ef8d499eSDavid van Moolenbroek 		pnew = tcpsock_alloc_buf();
971*ef8d499eSDavid van Moolenbroek 		if (pnew == NULL)
972*ef8d499eSDavid van Moolenbroek 			return FALSE;
973*ef8d499eSDavid van Moolenbroek 
974*ef8d499eSDavid van Moolenbroek 		memcpy(pnew->payload, ptail->payload, ptail->len);
975*ef8d499eSDavid van Moolenbroek 		memcpy((char *)pnew->payload + ptail->len, pbuf->payload,
976*ef8d499eSDavid van Moolenbroek 		    pbuf->len);
977*ef8d499eSDavid van Moolenbroek 		pnew->len = ptail->len + pbuf->len;
978*ef8d499eSDavid van Moolenbroek 		assert(pnew->len <= pnew->tot_len);
979*ef8d499eSDavid van Moolenbroek 
980*ef8d499eSDavid van Moolenbroek 		pnew->next = pbuf->next;
981*ef8d499eSDavid van Moolenbroek 		/* For now, we need not inherit any flags from either pbuf. */
982*ef8d499eSDavid van Moolenbroek 
983*ef8d499eSDavid van Moolenbroek 		*pnext = pnew;
984*ef8d499eSDavid van Moolenbroek 
985*ef8d499eSDavid van Moolenbroek 		/* One allocated, two about to be deallocated. */
986*ef8d499eSDavid van Moolenbroek 		assert(tcpsock_recvbufs > 0);
987*ef8d499eSDavid van Moolenbroek 		tcpsock_recvbufs--;
988*ef8d499eSDavid van Moolenbroek 
989*ef8d499eSDavid van Moolenbroek 		tcpsock_free_buf(ptail);
990*ef8d499eSDavid van Moolenbroek 		tcpsock_free_buf(pbuf);
991*ef8d499eSDavid van Moolenbroek 
992*ef8d499eSDavid van Moolenbroek 		return TRUE;
993*ef8d499eSDavid van Moolenbroek 	} else if (ptail->tot_len - ptail->len >= pbuf->len) {
994*ef8d499eSDavid van Moolenbroek 		/*
995*ef8d499eSDavid van Moolenbroek 		 * Case #2.
996*ef8d499eSDavid van Moolenbroek 		 */
997*ef8d499eSDavid van Moolenbroek 		memcpy((char *)ptail->payload + ptail->len, pbuf->payload,
998*ef8d499eSDavid van Moolenbroek 		    pbuf->len);
999*ef8d499eSDavid van Moolenbroek 
1000*ef8d499eSDavid van Moolenbroek 		ptail->len += pbuf->len;
1001*ef8d499eSDavid van Moolenbroek 
1002*ef8d499eSDavid van Moolenbroek 		ptail->next = pbuf->next;
1003*ef8d499eSDavid van Moolenbroek 
1004*ef8d499eSDavid van Moolenbroek 		assert(tcpsock_recvbufs > 0);
1005*ef8d499eSDavid van Moolenbroek 		tcpsock_recvbufs--;
1006*ef8d499eSDavid van Moolenbroek 
1007*ef8d499eSDavid van Moolenbroek 		tcpsock_free_buf(pbuf);
1008*ef8d499eSDavid van Moolenbroek 
1009*ef8d499eSDavid van Moolenbroek 		return TRUE;
1010*ef8d499eSDavid van Moolenbroek 	} else
1011*ef8d499eSDavid van Moolenbroek 		return FALSE;
1012*ef8d499eSDavid van Moolenbroek }
1013*ef8d499eSDavid van Moolenbroek 
/*
 * Callback from lwIP.  New data or flags have been received on a TCP socket.
 * 'arg' identifies our socket object, 'pbuf' is the chain of received data
 * (or NULL for a FIN), and 'err' is the lwIP status.  Return ERR_OK normally,
 * or ERR_ABRT if we aborted the PCB from within this callback.
 */
static err_t
tcpsock_event_recv(void * arg, struct tcp_pcb * pcb __unused,
	struct pbuf * pbuf, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	struct pbuf *ptail, **pprevp;
	size_t len;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);

	/*
	 * lwIP should never provide anything other than ERR_OK in 'err', and
	 * it is not clear what we should do if it would.  If lwIP ever changes
	 * in this regard, we will likely have to change this code accordingly.
	 */
	if (err != ERR_OK)
		panic("TCP receive event with error: %d", err);

	/* If the given buffer is NULL, we have received a FIN. */
	if (pbuf == NULL) {
		tcpsock_set_flag(tcp, TCPF_RCVD_FIN);

		/* Userland may now receive EOF. */
		if (!tcpsock_is_shutdown(tcp, SFL_SHUT_RD))
			sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV);

		/*
		 * If we were in the process of closing the socket, and we
		 * receive a FIN before our FIN got acknowledged, we close the
		 * socket anyway, as described in tcpsock_close().  However, if
		 * there is still unacknowledged outgoing data or we did not
		 * even manage to send our FIN yet, hold off closing the socket
		 * for now.
		 */
		if (tcpsock_may_close(tcp))
			(void)tcpsock_finish_close(tcp);

		return ERR_OK;
	}

	/*
	 * If the socket is being closed, receiving new data should cause a
	 * reset.
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp))) {
		tcpsock_pcb_abort(tcp);

		(void)tcpsock_cleanup(tcp, TRUE /*may_free*/);
		/* Do not touch the socket object anymore! */

		pbuf_free(pbuf);

		return ERR_ABRT;
	}

	/*
	 * If the socket has already been shut down for reading, discard the
	 * incoming data and do nothing else.  The data is still acknowledged
	 * to the remote side (through tcp_recved) so that it keeps sending.
	 */
	if (tcpsock_is_shutdown(tcp, SFL_SHUT_RD)) {
		tcp_recved(tcp->tcp_pcb, pbuf->tot_len);

		pbuf_free(pbuf);

		return ERR_OK;
	}

	/*
	 * We deliberately ignore the PBUF_FLAG_PUSH flag.  This flag would
	 * enable the receive functionality to delay delivering "un-pushed"
	 * data to applications.  The implementation of this scheme could track
	 * the amount of data up to and including the last-pushed segment using
	 * a "tr_push_len" field or so.  Deciding when to deliver "un-pushed"
	 * data after all is a bit trickier though.  As far as I can tell, the
	 * BSDs do not implement anything like that.  Windows does, and this
	 * results in interaction problems with even more lightweight TCP/IP
	 * stacks that do not send the TCP PSH flag.  Currently, there is no
	 * obvious benefit for us to support delaying data delivery like that.
	 * In addition, testing its implementation reliably would be difficult.
	 */

	/* Total number of new data bytes in the entire given chain. */
	len = (size_t)pbuf->tot_len;

	/*
	 * Count the number of buffers that are now owned by us.  The new total
	 * of buffers owned by us must not exceed the size of the memory pool.
	 * Any more would indicate an accounting error.  Note that
	 * tcpsock_recvbufs is currently used for debugging only!
	 */
	tcpsock_recvbufs += pbuf_clen(pbuf);
	assert(tcpsock_recvbufs < mempool_cur_buffers());

	/*
	 * The pre-tail pointer points to whatever is pointing to the tail
	 * buffer.  The latter pointer may be the 'tr_head' field in our
	 * tcpsock structure, or the 'next' field in the penultimate buffer,
	 * or NULL if there are currently no buffers on the receive queue.
	 */
	if ((pprevp = tcp->tcp_rcv.tr_pre_tailp) != NULL) {
		/* Nonempty queue: append the new chain after the tail. */
		ptail = *pprevp;

		assert(ptail != NULL);
		assert(ptail->next == NULL);
		assert(tcp->tcp_rcv.tr_head != NULL);

		ptail->next = pbuf;
		pbuf->tot_len = pbuf->len;	/* to help freeing on merges */

		/*
		 * If the old tail and the first new buffer get merged, the
		 * merged buffer becomes the new tail and the remainder of the
		 * new chain (possibly NULL) follows it.
		 */
		if (tcpsock_try_merge(pprevp, ptail, pbuf)) {
			ptail = *pprevp;
			pbuf = ptail->next;
		}

		if (pbuf != NULL)
			pprevp = &ptail->next;
	} else {
		/* Empty queue: the new chain becomes the entire queue. */
		assert(tcp->tcp_rcv.tr_head == NULL);
		assert(tcp->tcp_rcv.tr_head_off == 0);

		tcp->tcp_rcv.tr_head = pbuf;

		pprevp = &tcp->tcp_rcv.tr_head;
	}

	/*
	 * Chop up the chain into individual buffers.  This is necessary as we
	 * overload 'tot_len' to mean "space available in the buffer", as we
	 * want for buffers allocated by us as part of buffer merges.  Also get
	 * a pointer to the pointer to the new penultimate tail buffer.  Due to
	 * merging, the chain may already be empty by now, though.
	 */
	if (pbuf != NULL) {
		for (; pbuf->next != NULL; pbuf = pbuf->next) {
			pbuf->tot_len = pbuf->len;

			pprevp = &pbuf->next;
		}
		assert(pbuf->len == pbuf->tot_len);
	}

	/* 'pprevp' must now point to the pointer to the last buffer. */
	assert(*pprevp != NULL);
	assert((*pprevp)->next == NULL);
	tcp->tcp_rcv.tr_pre_tailp = pprevp;

	tcp->tcp_rcv.tr_len += len;
	tcp->tcp_rcv.tr_unacked += len;

	assert(tcp->tcp_rcv.tr_unacked <= TCP_WND);

	/*
	 * Note that tr_len may now exceed the receive buffer size in the
	 * highly exceptional case that the user is adjusting the latter after
	 * the socket had already received data.
	 */

	/* See if we can immediately acknowledge some or all of the data. */
	tcpsock_ack_recv(tcp);

	/* Also wake up any receivers now. */
	sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV);

	return ERR_OK;
}
1181*ef8d499eSDavid van Moolenbroek 
1182*ef8d499eSDavid van Moolenbroek /*
1183*ef8d499eSDavid van Moolenbroek  * Callback from lwIP.  The PCB corresponding to the socket identified by 'arg'
1184*ef8d499eSDavid van Moolenbroek  * has been closed by lwIP, with the reason specified in 'err': either the
1185*ef8d499eSDavid van Moolenbroek  * connection has been aborted locally (ERR_ABRT), it has been reset by the
1186*ef8d499eSDavid van Moolenbroek  * remote end (ERR_RST), or it is closed due to state transitions (ERR_CLSD).
1187*ef8d499eSDavid van Moolenbroek  */
1188*ef8d499eSDavid van Moolenbroek static void
tcpsock_event_err(void * arg,err_t err)1189*ef8d499eSDavid van Moolenbroek tcpsock_event_err(void * arg, err_t err)
1190*ef8d499eSDavid van Moolenbroek {
1191*ef8d499eSDavid van Moolenbroek 	struct tcpsock *tcp = (struct tcpsock *)arg;
1192*ef8d499eSDavid van Moolenbroek 	int r;
1193*ef8d499eSDavid van Moolenbroek 
1194*ef8d499eSDavid van Moolenbroek 	assert(tcp != NULL);
1195*ef8d499eSDavid van Moolenbroek 	assert(tcp->tcp_pcb != NULL);
1196*ef8d499eSDavid van Moolenbroek 	assert(err != ERR_OK);
1197*ef8d499eSDavid van Moolenbroek 
1198*ef8d499eSDavid van Moolenbroek 	/* The original PCB is now gone, or will be shortly. */
1199*ef8d499eSDavid van Moolenbroek 	tcp->tcp_pcb = NULL;
1200*ef8d499eSDavid van Moolenbroek 
1201*ef8d499eSDavid van Moolenbroek 	/*
1202*ef8d499eSDavid van Moolenbroek 	 * Clean up the socket.  As a result it may be freed, in which case we
1203*ef8d499eSDavid van Moolenbroek 	 * must not touch it anymore.  No need to return ERR_ABRT from here, as
1204*ef8d499eSDavid van Moolenbroek 	 * the PCB has been aborted already.
1205*ef8d499eSDavid van Moolenbroek 	 */
1206*ef8d499eSDavid van Moolenbroek 	if (tcpsock_cleanup(tcp, TRUE /*may_free*/))
1207*ef8d499eSDavid van Moolenbroek 		return;
1208*ef8d499eSDavid van Moolenbroek 
1209*ef8d499eSDavid van Moolenbroek 	if (err == ERR_CLSD) {
1210*ef8d499eSDavid van Moolenbroek 		/*
1211*ef8d499eSDavid van Moolenbroek 		 * We may get here if the socket is shut down for writing and
1212*ef8d499eSDavid van Moolenbroek 		 * we already received a FIN from the remote side, thus putting
1213*ef8d499eSDavid van Moolenbroek 		 * the socket in LAST_ACK state, and we receive that last
1214*ef8d499eSDavid van Moolenbroek 		 * acknowledgment.  There is nothing more we need to do.
1215*ef8d499eSDavid van Moolenbroek 		 *
1216*ef8d499eSDavid van Moolenbroek 		 * We will never get here in the other case that ERR_CLSD is
1217*ef8d499eSDavid van Moolenbroek 		 * raised, which is when the socket is reset because of
1218*ef8d499eSDavid van Moolenbroek 		 * unacknowledged data while closing: we handle the
1219*ef8d499eSDavid van Moolenbroek 		 * reset-on-ACK case ourselves in tcpsock_close(), and the
1220*ef8d499eSDavid van Moolenbroek 		 * socket is in closing state after that.
1221*ef8d499eSDavid van Moolenbroek 		 */
1222*ef8d499eSDavid van Moolenbroek 		assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR));
1223*ef8d499eSDavid van Moolenbroek 		assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);
1224*ef8d499eSDavid van Moolenbroek 	} else {
1225*ef8d499eSDavid van Moolenbroek 		/*
1226*ef8d499eSDavid van Moolenbroek 		 * Anything else should be an error directly from lwIP;
1227*ef8d499eSDavid van Moolenbroek 		 * currently either ERR_ABRT and ERR_RST.  Covert it to a
1228*ef8d499eSDavid van Moolenbroek 		 * regular error and set it on the socket.  Doing so will also
1229*ef8d499eSDavid van Moolenbroek 		 * raise the appropriate events.
1230*ef8d499eSDavid van Moolenbroek 		 */
1231*ef8d499eSDavid van Moolenbroek 		/*
1232*ef8d499eSDavid van Moolenbroek 		 * Unfortunately, lwIP is not throwing accurate errors even
1233*ef8d499eSDavid van Moolenbroek 		 * when it can.  We convert some errors to reflect more
1234*ef8d499eSDavid van Moolenbroek 		 * accurately the most likely cause.
1235*ef8d499eSDavid van Moolenbroek 		 *
1236*ef8d499eSDavid van Moolenbroek 		 * TODO: fix lwIP in this regard..
1237*ef8d499eSDavid van Moolenbroek 		 */
1238*ef8d499eSDavid van Moolenbroek 		r = util_convert_err(err);
1239*ef8d499eSDavid van Moolenbroek 
1240*ef8d499eSDavid van Moolenbroek 		if (tcpsock_get_flags(tcp) & TCPF_CONNECTING) {
1241*ef8d499eSDavid van Moolenbroek 			switch (err) {
1242*ef8d499eSDavid van Moolenbroek 			case ERR_ABRT:	r = ETIMEDOUT;		break;
1243*ef8d499eSDavid van Moolenbroek 			case ERR_RST:	r = ECONNREFUSED;	break;
1244*ef8d499eSDavid van Moolenbroek 			}
1245*ef8d499eSDavid van Moolenbroek 		}
1246*ef8d499eSDavid van Moolenbroek 
1247*ef8d499eSDavid van Moolenbroek 		sockevent_set_error(tcpsock_get_sock(tcp), r);
1248*ef8d499eSDavid van Moolenbroek 	}
1249*ef8d499eSDavid van Moolenbroek }
1250*ef8d499eSDavid van Moolenbroek 
/*
 * Callback from lwIP.  Perform regular checks on a TCP socket.  This function
 * is called once per five seconds on connected sockets, and twice per second
 * on closing sockets.  Return ERR_OK normally, or ERR_ABRT if the PCB was
 * aborted from within this callback.
 */
static err_t
tcpsock_event_poll(void * arg, struct tcp_pcb * pcb __unused)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	err_t err;
	int r;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);

	/*
	 * If we ended up running out of buffers earlier, try resuming any send
	 * requests now, both for enqueuing TCP data with lwIP and for user
	 * requests.
	 */
	if (tcpsock_get_flags(tcp) & (TCPF_FULL | TCPF_OOM)) {
		tcpsock_clear_flag(tcp, TCPF_FULL);
		tcpsock_clear_flag(tcp, TCPF_OOM);

		/* See if we can enqueue more data with lwIP. */
		if (tcpsock_pcb_enqueue(tcp)) {
			/* In some cases, we can now close the PCB. */
			if (tcpsock_may_close(tcp)) {
				(void)tcpsock_finish_close(tcp);
				/*
				 * The PCB is definitely gone here, and the
				 * entire socket object may be gone now too.
				 * Do not touch either anymore!
				 */

				return ERR_OK;
			}

			/*
			 * If actually sending the data fails, the PCB will be
			 * gone, and the socket object may be gone as well.  Do
			 * not touch either anymore in that case!
			 */
			if (tcpsock_pcb_send(tcp, TRUE /*raise_error*/) != OK)
				return ERR_ABRT;
		}

		/*
		 * If we ran out of buffers earlier, it may be possible to take
		 * in more data from a user process now, even if we did not
		 * manage to enqueue any more pending data with lwIP.
		 */
		sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND);

		assert(tcp->tcp_pcb != NULL);
	} else if (tcp->tcp_snd.ts_unsent != NULL &&
	    tcp->tcp_snd.ts_unsent_off < tcp->tcp_snd.ts_unsent->len) {
		/*
		 * If the send buffer is full, we will no longer call
		 * tcp_output(), which means we may also miss out on fatal
		 * errors that would otherwise kill the connection (e.g., no
		 * route).  As a result, the connection may erroneously
		 * continue to exist for a long time.  To avoid this, we call
		 * tcp_output() every once in a while when there are still
		 * unsent data.
		 */
		err = tcp_output(tcp->tcp_pcb);

		/* ERR_MEM is transient; anything else fatal kills the PCB. */
		if (err != ERR_OK && err != ERR_MEM) {
			tcpsock_pcb_abort(tcp);

			if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) {
				r = util_convert_err(err);

				sockevent_set_error(tcpsock_get_sock(tcp), r);
			}
			/* Otherwise do not touch the socket object anymore! */

			return ERR_ABRT;
		}
	}

	/*
	 * If we are closing the socket, and we sent a FIN, see if the FIN got
	 * acknowledged.  If so, finish closing the socket.  Unfortunately, we
	 * can perform this check by polling only.  TODO: change lwIP..
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp)) &&
	    (tcpsock_get_flags(tcp) & TCPF_SENT_FIN) &&
	    tcp->tcp_pcb->unsent == NULL && tcp->tcp_pcb->unacked == NULL) {
		assert(tcp->tcp_snd.ts_len == 0);

		tcpsock_finish_close(tcp);
	}

	return ERR_OK;
}
1348*ef8d499eSDavid van Moolenbroek 
1349*ef8d499eSDavid van Moolenbroek /*
1350*ef8d499eSDavid van Moolenbroek  * Bind a TCP socket to a local address.
1351*ef8d499eSDavid van Moolenbroek  */
1352*ef8d499eSDavid van Moolenbroek static int
tcpsock_bind(struct sock * sock,const struct sockaddr * addr,socklen_t addr_len,endpoint_t user_endpt)1353*ef8d499eSDavid van Moolenbroek tcpsock_bind(struct sock * sock, const struct sockaddr * addr,
1354*ef8d499eSDavid van Moolenbroek 	socklen_t addr_len, endpoint_t user_endpt)
1355*ef8d499eSDavid van Moolenbroek {
1356*ef8d499eSDavid van Moolenbroek 	struct tcpsock *tcp = (struct tcpsock *)sock;
1357*ef8d499eSDavid van Moolenbroek 	ip_addr_t ipaddr;
1358*ef8d499eSDavid van Moolenbroek 	uint16_t port;
1359*ef8d499eSDavid van Moolenbroek 	err_t err;
1360*ef8d499eSDavid van Moolenbroek 	int r;
1361*ef8d499eSDavid van Moolenbroek 
1362*ef8d499eSDavid van Moolenbroek 	if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED)
1363*ef8d499eSDavid van Moolenbroek 		return EINVAL;
1364*ef8d499eSDavid van Moolenbroek 
1365*ef8d499eSDavid van Moolenbroek 	if ((r = ipsock_get_src_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
1366*ef8d499eSDavid van Moolenbroek 	    user_endpt, &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port,
1367*ef8d499eSDavid van Moolenbroek 	    FALSE /*allow_mcast*/, &ipaddr, &port)) != OK)
1368*ef8d499eSDavid van Moolenbroek 		return r;
1369*ef8d499eSDavid van Moolenbroek 
1370*ef8d499eSDavid van Moolenbroek 	err = tcp_bind(tcp->tcp_pcb, &ipaddr, port);
1371*ef8d499eSDavid van Moolenbroek 
1372*ef8d499eSDavid van Moolenbroek 	return util_convert_err(err);
1373*ef8d499eSDavid van Moolenbroek }
1374*ef8d499eSDavid van Moolenbroek 
1375*ef8d499eSDavid van Moolenbroek /*
1376*ef8d499eSDavid van Moolenbroek  * Callback from lwIP.  A new connection 'pcb' has arrived on the listening
1377*ef8d499eSDavid van Moolenbroek  * socket identified by 'arg'.  Note that 'pcb' may be NULL in the case that
1378*ef8d499eSDavid van Moolenbroek  * lwIP could not accept the connection itself.
1379*ef8d499eSDavid van Moolenbroek  */
1380*ef8d499eSDavid van Moolenbroek static err_t
tcpsock_event_accept(void * arg,struct tcp_pcb * pcb,err_t err)1381*ef8d499eSDavid van Moolenbroek tcpsock_event_accept(void * arg, struct tcp_pcb * pcb, err_t err)
1382*ef8d499eSDavid van Moolenbroek {
1383*ef8d499eSDavid van Moolenbroek 	struct tcpsock *tcp = (struct tcpsock *)arg;
1384*ef8d499eSDavid van Moolenbroek 
1385*ef8d499eSDavid van Moolenbroek 	assert(tcp != NULL);
1386*ef8d499eSDavid van Moolenbroek 	assert(tcpsock_is_listening(tcp));
1387*ef8d499eSDavid van Moolenbroek 
1388*ef8d499eSDavid van Moolenbroek 	/*
1389*ef8d499eSDavid van Moolenbroek 	 * If the given PCB is NULL, then lwIP ran out of memory allocating a
1390*ef8d499eSDavid van Moolenbroek 	 * PCB for the new connection.  There is nothing we can do with that
1391*ef8d499eSDavid van Moolenbroek 	 * information.  Also check 'err' just to make sure.
1392*ef8d499eSDavid van Moolenbroek 	 */
1393*ef8d499eSDavid van Moolenbroek 	if (pcb == NULL || err != OK)
1394*ef8d499eSDavid van Moolenbroek 		return ERR_OK;
1395*ef8d499eSDavid van Moolenbroek 
1396*ef8d499eSDavid van Moolenbroek 	/*
1397*ef8d499eSDavid van Moolenbroek 	 * The TCP socket is the listening socket, but the PCB is for the
1398*ef8d499eSDavid van Moolenbroek 	 * incoming connection.
1399*ef8d499eSDavid van Moolenbroek 	 */
1400*ef8d499eSDavid van Moolenbroek 	if (tcpsock_clone(tcp, pcb) != OK) {
1401*ef8d499eSDavid van Moolenbroek 		/*
1402*ef8d499eSDavid van Moolenbroek 		 * We could not allocate the resources necessary to accept the
1403*ef8d499eSDavid van Moolenbroek 		 * connection.  Abort it immediately.
1404*ef8d499eSDavid van Moolenbroek 		 */
1405*ef8d499eSDavid van Moolenbroek 		tcp_abort(pcb);
1406*ef8d499eSDavid van Moolenbroek 
1407*ef8d499eSDavid van Moolenbroek 		return ERR_ABRT;
1408*ef8d499eSDavid van Moolenbroek 	}
1409*ef8d499eSDavid van Moolenbroek 
1410*ef8d499eSDavid van Moolenbroek 	/*
1411*ef8d499eSDavid van Moolenbroek 	 * The connection has not yet been accepted, and thus should still be
1412*ef8d499eSDavid van Moolenbroek 	 * considered on the listen queue.
1413*ef8d499eSDavid van Moolenbroek 	 */
1414*ef8d499eSDavid van Moolenbroek 	tcp_backlog_delayed(pcb);
1415*ef8d499eSDavid van Moolenbroek 
1416*ef8d499eSDavid van Moolenbroek 	/* Set the callback functions. */
1417*ef8d499eSDavid van Moolenbroek 	tcp_recv(pcb, tcpsock_event_recv);
1418*ef8d499eSDavid van Moolenbroek 	tcp_sent(pcb, tcpsock_event_sent);
1419*ef8d499eSDavid van Moolenbroek 	tcp_err(pcb, tcpsock_event_err);
1420*ef8d499eSDavid van Moolenbroek 	tcp_poll(pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);
1421*ef8d499eSDavid van Moolenbroek 
1422*ef8d499eSDavid van Moolenbroek 	sockevent_raise(tcpsock_get_sock(tcp), SEV_ACCEPT);
1423*ef8d499eSDavid van Moolenbroek 
1424*ef8d499eSDavid van Moolenbroek 	return ERR_OK;
1425*ef8d499eSDavid van Moolenbroek }
1426*ef8d499eSDavid van Moolenbroek 
/*
 * Put a TCP socket in listening mode.
 *
 * 'backlog' is the maximum number of pending (not-yet-accepted) connections;
 * the caller (libsockevent) has already clamped it to [0, SOMAXCONN].  On
 * success, return OK; otherwise, return an appropriate negative error code.
 */
static int
tcpsock_listen(struct sock * sock, int backlog)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct tcp_pcb *pcb;
	err_t err;

	/*
	 * The maximum backlog value must not exceed its field size: lwIP
	 * stores the backlog in an eight-bit PCB field.
	 */
	assert(SOMAXCONN <= UINT8_MAX);

	/*
	 * Allow only CLOSED sockets to enter listening mode.  If the socket
	 * was already in listening mode, allow its backlog value to be
	 * updated, even if it was shut down already (making this a no-op).
	 */
	if (!tcpsock_is_listening(tcp) &&
	    (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED))
		return EINVAL;

	/*
	 * If the socket was not already in listening mode, put it in that mode
	 * now.  That involves switching PCBs as lwIP attempts to save memory
	 * by replacing the original PCB with a smaller one.  If the socket was
	 * already in listening mode, simply update its backlog value--this has
	 * no effect on the sockets already in the backlog.
	 */
	if (!tcpsock_is_listening(tcp)) {
		assert(tcp->tcp_pcb != NULL);

		/*
		 * If the socket has not been bound to a port yet, do that
		 * first.  This does mean that the listen call may fail with
		 * side effects, but that is acceptable in this case.
		 */
		if (tcp->tcp_pcb->local_port == 0) {
			err = tcp_bind(tcp->tcp_pcb, &tcp->tcp_pcb->local_ip,
			    0 /*port*/);

			if (err != ERR_OK)
				return util_convert_err(err);
		}

		/*
		 * Clear the argument on the PCB that is about to be replaced,
		 * because if we do not, once the PCB is reused (which does not
		 * clear the argument), we might get weird events.  Do this
		 * before the tcp_listen() call, because we should no longer
		 * access the old PCB afterwards (even if we can).
		 */
		tcp_arg(tcp->tcp_pcb, NULL);

		pcb = tcp_listen_with_backlog_and_err(tcp->tcp_pcb, backlog,
		    &err);

		if (pcb == NULL) {
			/*
			 * Failure leaves the original PCB intact, so restore
			 * its callback argument before bailing out.
			 */
			tcp_arg(tcp->tcp_pcb, tcp); /* oops, undo. */

			return util_convert_err(err);
		}

		/*
		 * On success, lwIP has freed the original PCB and returned a
		 * new, smaller listening PCB; attach ourselves to that one.
		 */
		tcp_arg(pcb, tcp);
		tcp->tcp_pcb = pcb;

		tcp_accept(pcb, tcpsock_event_accept);

		/* Initialize the queue head for sockets pending acceptance. */
		TAILQ_INIT(&tcp->tcp_queue.tq_head);
	} else if (tcp->tcp_pcb != NULL)
		tcp_backlog_set(tcp->tcp_pcb, backlog);

	return OK;
}
1502*ef8d499eSDavid van Moolenbroek 
1503*ef8d499eSDavid van Moolenbroek /*
1504*ef8d499eSDavid van Moolenbroek  * Callback from lwIP.  A socket connection attempt has succeeded.  Note that
1505*ef8d499eSDavid van Moolenbroek  * failed socket events will trigger the tcpsock_event_err() callback instead.
1506*ef8d499eSDavid van Moolenbroek  */
1507*ef8d499eSDavid van Moolenbroek static err_t
tcpsock_event_connected(void * arg,struct tcp_pcb * pcb __unused,err_t err)1508*ef8d499eSDavid van Moolenbroek tcpsock_event_connected(void * arg, struct tcp_pcb * pcb __unused, err_t err)
1509*ef8d499eSDavid van Moolenbroek {
1510*ef8d499eSDavid van Moolenbroek 	struct tcpsock *tcp = (struct tcpsock *)arg;
1511*ef8d499eSDavid van Moolenbroek 
1512*ef8d499eSDavid van Moolenbroek 	assert(tcp != NULL);
1513*ef8d499eSDavid van Moolenbroek 	assert(pcb == tcp->tcp_pcb);
1514*ef8d499eSDavid van Moolenbroek 	assert(tcpsock_get_flags(tcp) & TCPF_CONNECTING);
1515*ef8d499eSDavid van Moolenbroek 
1516*ef8d499eSDavid van Moolenbroek 	/*
1517*ef8d499eSDavid van Moolenbroek 	 * If lwIP ever changes so that this callback is called for connect
1518*ef8d499eSDavid van Moolenbroek 	 * failures as well, then we need to change the code here accordingly.
1519*ef8d499eSDavid van Moolenbroek 	 */
1520*ef8d499eSDavid van Moolenbroek 	if (err != ERR_OK)
1521*ef8d499eSDavid van Moolenbroek 		panic("TCP connected event with error: %d", err);
1522*ef8d499eSDavid van Moolenbroek 
1523*ef8d499eSDavid van Moolenbroek 	tcpsock_clear_flag(tcp, TCPF_CONNECTING);
1524*ef8d499eSDavid van Moolenbroek 
1525*ef8d499eSDavid van Moolenbroek 	sockevent_raise(tcpsock_get_sock(tcp), SEV_CONNECT | SEV_SEND);
1526*ef8d499eSDavid van Moolenbroek 
1527*ef8d499eSDavid van Moolenbroek 	return ERR_OK;
1528*ef8d499eSDavid van Moolenbroek }
1529*ef8d499eSDavid van Moolenbroek 
1530*ef8d499eSDavid van Moolenbroek /*
1531*ef8d499eSDavid van Moolenbroek  * Connect a TCP socket to a remote address.
1532*ef8d499eSDavid van Moolenbroek  */
1533*ef8d499eSDavid van Moolenbroek static int
tcpsock_connect(struct sock * sock,const struct sockaddr * addr,socklen_t addr_len,endpoint_t user_endpt)1534*ef8d499eSDavid van Moolenbroek tcpsock_connect(struct sock * sock, const struct sockaddr * addr,
1535*ef8d499eSDavid van Moolenbroek 	socklen_t addr_len, endpoint_t user_endpt)
1536*ef8d499eSDavid van Moolenbroek {
1537*ef8d499eSDavid van Moolenbroek 	struct tcpsock *tcp = (struct tcpsock *)sock;
1538*ef8d499eSDavid van Moolenbroek 	ip_addr_t dst_addr;
1539*ef8d499eSDavid van Moolenbroek 	uint16_t dst_port;
1540*ef8d499eSDavid van Moolenbroek 	err_t err;
1541*ef8d499eSDavid van Moolenbroek 	int r;
1542*ef8d499eSDavid van Moolenbroek 
1543*ef8d499eSDavid van Moolenbroek 	/*
1544*ef8d499eSDavid van Moolenbroek 	 * Listening sockets may not have a PCB, so we use higher-level flags
1545*ef8d499eSDavid van Moolenbroek 	 * to throw the correct error code for those instead.
1546*ef8d499eSDavid van Moolenbroek 	 */
1547*ef8d499eSDavid van Moolenbroek 	if (tcpsock_is_listening(tcp))
1548*ef8d499eSDavid van Moolenbroek 		return EOPNOTSUPP;
1549*ef8d499eSDavid van Moolenbroek 
1550*ef8d499eSDavid van Moolenbroek 	/*
1551*ef8d499eSDavid van Moolenbroek 	 * If there is no longer any PCB, we obviously cannot perform the
1552*ef8d499eSDavid van Moolenbroek 	 * connection, but POSIX is not clear on which error to return.  We
1553*ef8d499eSDavid van Moolenbroek 	 * copy NetBSD's.
1554*ef8d499eSDavid van Moolenbroek 	 */
1555*ef8d499eSDavid van Moolenbroek 	if (tcp->tcp_pcb == NULL)
1556*ef8d499eSDavid van Moolenbroek 		return EINVAL;
1557*ef8d499eSDavid van Moolenbroek 
1558*ef8d499eSDavid van Moolenbroek 	/*
1559*ef8d499eSDavid van Moolenbroek 	 * The only state from which a connection can be initiated, is CLOSED.
1560*ef8d499eSDavid van Moolenbroek 	 * Some of the other states require distinct error codes, though.
1561*ef8d499eSDavid van Moolenbroek 	 */
1562*ef8d499eSDavid van Moolenbroek 	switch (tcp->tcp_pcb->state) {
1563*ef8d499eSDavid van Moolenbroek 	case CLOSED:
1564*ef8d499eSDavid van Moolenbroek 		break;
1565*ef8d499eSDavid van Moolenbroek 	case SYN_SENT:
1566*ef8d499eSDavid van Moolenbroek 		return EALREADY;
1567*ef8d499eSDavid van Moolenbroek 	case LISTEN:
1568*ef8d499eSDavid van Moolenbroek 		assert(0); /* we just checked.. */
1569*ef8d499eSDavid van Moolenbroek 	default:
1570*ef8d499eSDavid van Moolenbroek 		return EISCONN;
1571*ef8d499eSDavid van Moolenbroek 	}
1572*ef8d499eSDavid van Moolenbroek 
1573*ef8d499eSDavid van Moolenbroek 	/*
1574*ef8d499eSDavid van Moolenbroek 	 * Get the destination address, and attempt to start connecting.  If
1575*ef8d499eSDavid van Moolenbroek 	 * the socket was not bound before, or it was bound to a port only,
1576*ef8d499eSDavid van Moolenbroek 	 * then lwIP will select a source address for us.  We cannot do this
1577*ef8d499eSDavid van Moolenbroek 	 * ourselves even if we wanted to: it is impossible to re-bind a TCP
1578*ef8d499eSDavid van Moolenbroek 	 * PCB in the case it was previously bound to a port only.
1579*ef8d499eSDavid van Moolenbroek 	 */
1580*ef8d499eSDavid van Moolenbroek 	if ((r = ipsock_get_dst_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
1581*ef8d499eSDavid van Moolenbroek 	    &tcp->tcp_pcb->local_ip, &dst_addr, &dst_port)) != OK)
1582*ef8d499eSDavid van Moolenbroek 		return r;
1583*ef8d499eSDavid van Moolenbroek 
1584*ef8d499eSDavid van Moolenbroek 	err = tcp_connect(tcp->tcp_pcb, &dst_addr, dst_port,
1585*ef8d499eSDavid van Moolenbroek 	    tcpsock_event_connected);
1586*ef8d499eSDavid van Moolenbroek 
1587*ef8d499eSDavid van Moolenbroek 	/*
1588*ef8d499eSDavid van Moolenbroek 	 * Note that various tcp_connect() error cases will leave the PCB with
1589*ef8d499eSDavid van Moolenbroek 	 * a newly set local and remote IP address anyway.  We should be
1590*ef8d499eSDavid van Moolenbroek 	 * careful not to rely on the addresses being as they were before.
1591*ef8d499eSDavid van Moolenbroek 	 */
1592*ef8d499eSDavid van Moolenbroek 	if (err != ERR_OK)
1593*ef8d499eSDavid van Moolenbroek 		return util_convert_err(err);
1594*ef8d499eSDavid van Moolenbroek 
1595*ef8d499eSDavid van Moolenbroek 	/* Set the other callback functions. */
1596*ef8d499eSDavid van Moolenbroek 	tcp_recv(tcp->tcp_pcb, tcpsock_event_recv);
1597*ef8d499eSDavid van Moolenbroek 	tcp_sent(tcp->tcp_pcb, tcpsock_event_sent);
1598*ef8d499eSDavid van Moolenbroek 	tcp_err(tcp->tcp_pcb, tcpsock_event_err);
1599*ef8d499eSDavid van Moolenbroek 	tcp_poll(tcp->tcp_pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);
1600*ef8d499eSDavid van Moolenbroek 
1601*ef8d499eSDavid van Moolenbroek 	/*
1602*ef8d499eSDavid van Moolenbroek 	 * Set a flag so that we can correct lwIP's error codes in case the
1603*ef8d499eSDavid van Moolenbroek 	 * connection fails.
1604*ef8d499eSDavid van Moolenbroek 	 */
1605*ef8d499eSDavid van Moolenbroek 	tcpsock_set_flag(tcp, TCPF_CONNECTING);
1606*ef8d499eSDavid van Moolenbroek 
1607*ef8d499eSDavid van Moolenbroek 	return SUSPEND;
1608*ef8d499eSDavid van Moolenbroek }
1609*ef8d499eSDavid van Moolenbroek 
1610*ef8d499eSDavid van Moolenbroek /*
1611*ef8d499eSDavid van Moolenbroek  * Test whether any new connections are pending on a listening TCP socket.
1612*ef8d499eSDavid van Moolenbroek  */
1613*ef8d499eSDavid van Moolenbroek static int
tcpsock_test_accept(struct sock * sock)1614*ef8d499eSDavid van Moolenbroek tcpsock_test_accept(struct sock * sock)
1615*ef8d499eSDavid van Moolenbroek {
1616*ef8d499eSDavid van Moolenbroek 	struct tcpsock *tcp = (struct tcpsock *)sock;
1617*ef8d499eSDavid van Moolenbroek 
1618*ef8d499eSDavid van Moolenbroek 	/* Is this socket in listening mode at all? */
1619*ef8d499eSDavid van Moolenbroek 	if (!tcpsock_is_listening(tcp))
1620*ef8d499eSDavid van Moolenbroek 		return EINVAL;
1621*ef8d499eSDavid van Moolenbroek 
1622*ef8d499eSDavid van Moolenbroek 	/* Are there any connections to accept right now? */
1623*ef8d499eSDavid van Moolenbroek 	if (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head))
1624*ef8d499eSDavid van Moolenbroek 		return OK;
1625*ef8d499eSDavid van Moolenbroek 
1626*ef8d499eSDavid van Moolenbroek 	/* If the socket has been shut down, we return ECONNABORTED. */
1627*ef8d499eSDavid van Moolenbroek 	if (tcp->tcp_pcb == NULL)
1628*ef8d499eSDavid van Moolenbroek 		return ECONNABORTED;
1629*ef8d499eSDavid van Moolenbroek 
1630*ef8d499eSDavid van Moolenbroek 	/* Otherwise, wait for a new connection first. */
1631*ef8d499eSDavid van Moolenbroek 	return SUSPEND;
1632*ef8d499eSDavid van Moolenbroek }
1633*ef8d499eSDavid van Moolenbroek 
1634*ef8d499eSDavid van Moolenbroek /*
1635*ef8d499eSDavid van Moolenbroek  * Accept a connection on a listening TCP socket, creating a new TCP socket.
1636*ef8d499eSDavid van Moolenbroek  */
1637*ef8d499eSDavid van Moolenbroek static sockid_t
tcpsock_accept(struct sock * sock,struct sockaddr * addr,socklen_t * addr_len,endpoint_t user_endpt __unused,struct sock ** newsockp)1638*ef8d499eSDavid van Moolenbroek tcpsock_accept(struct sock * sock, struct sockaddr * addr,
1639*ef8d499eSDavid van Moolenbroek 	socklen_t * addr_len, endpoint_t user_endpt __unused,
1640*ef8d499eSDavid van Moolenbroek 	struct sock ** newsockp)
1641*ef8d499eSDavid van Moolenbroek {
1642*ef8d499eSDavid van Moolenbroek 	struct tcpsock *listener = (struct tcpsock *)sock;
1643*ef8d499eSDavid van Moolenbroek 	struct tcpsock *tcp;
1644*ef8d499eSDavid van Moolenbroek 	int r;
1645*ef8d499eSDavid van Moolenbroek 
1646*ef8d499eSDavid van Moolenbroek 	if ((r = tcpsock_test_accept(sock)) != OK)
1647*ef8d499eSDavid van Moolenbroek 		return r;
1648*ef8d499eSDavid van Moolenbroek 	/* Below, we must not assume that the listener has a PCB. */
1649*ef8d499eSDavid van Moolenbroek 
1650*ef8d499eSDavid van Moolenbroek 	tcp = TAILQ_FIRST(&listener->tcp_queue.tq_head);
1651*ef8d499eSDavid van Moolenbroek 	assert(tcp->tcp_listener == listener);
1652*ef8d499eSDavid van Moolenbroek 	assert(tcp->tcp_pcb != NULL);
1653*ef8d499eSDavid van Moolenbroek 
1654*ef8d499eSDavid van Moolenbroek 	TAILQ_REMOVE(&listener->tcp_queue.tq_head, tcp, tcp_queue.tq_next);
1655*ef8d499eSDavid van Moolenbroek 	tcp->tcp_listener = NULL;
1656*ef8d499eSDavid van Moolenbroek 
1657*ef8d499eSDavid van Moolenbroek 	tcp_backlog_accepted(tcp->tcp_pcb);
1658*ef8d499eSDavid van Moolenbroek 
1659*ef8d499eSDavid van Moolenbroek 	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
1660*ef8d499eSDavid van Moolenbroek 	    &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port);
1661*ef8d499eSDavid van Moolenbroek 
1662*ef8d499eSDavid van Moolenbroek 	/*
1663*ef8d499eSDavid van Moolenbroek 	 * Set 'newsockp' to NULL so that libsockevent knows we already cloned
1664*ef8d499eSDavid van Moolenbroek 	 * the socket, and it must not be reinitialized anymore.
1665*ef8d499eSDavid van Moolenbroek 	 */
1666*ef8d499eSDavid van Moolenbroek 	*newsockp = NULL;
1667*ef8d499eSDavid van Moolenbroek 	return tcpsock_get_id(tcp);
1668*ef8d499eSDavid van Moolenbroek }
1669*ef8d499eSDavid van Moolenbroek 
1670*ef8d499eSDavid van Moolenbroek /*
1671*ef8d499eSDavid van Moolenbroek  * Perform preliminary checks on a send request.
1672*ef8d499eSDavid van Moolenbroek  */
1673*ef8d499eSDavid van Moolenbroek static int
tcpsock_pre_send(struct sock * sock,size_t len __unused,socklen_t ctl_len __unused,const struct sockaddr * addr __unused,socklen_t addr_len __unused,endpoint_t user_endpt __unused,int flags)1674*ef8d499eSDavid van Moolenbroek tcpsock_pre_send(struct sock * sock, size_t len __unused,
1675*ef8d499eSDavid van Moolenbroek 	socklen_t ctl_len __unused, const struct sockaddr * addr __unused,
1676*ef8d499eSDavid van Moolenbroek 	socklen_t addr_len __unused, endpoint_t user_endpt __unused, int flags)
1677*ef8d499eSDavid van Moolenbroek {
1678*ef8d499eSDavid van Moolenbroek 
1679*ef8d499eSDavid van Moolenbroek 	/*
1680*ef8d499eSDavid van Moolenbroek 	 * Reject calls with unknown flags.  Since libsockevent strips out the
1681*ef8d499eSDavid van Moolenbroek 	 * flags it handles itself here, we only have to test for ones we can
1682*ef8d499eSDavid van Moolenbroek 	 * not handle.  Currently, there are no send flags that we support.
1683*ef8d499eSDavid van Moolenbroek 	 */
1684*ef8d499eSDavid van Moolenbroek 	if (flags != 0)
1685*ef8d499eSDavid van Moolenbroek 		return EOPNOTSUPP;
1686*ef8d499eSDavid van Moolenbroek 
1687*ef8d499eSDavid van Moolenbroek 	return OK;
1688*ef8d499eSDavid van Moolenbroek }
1689*ef8d499eSDavid van Moolenbroek 
1690*ef8d499eSDavid van Moolenbroek /*
1691*ef8d499eSDavid van Moolenbroek  * Test whether the given number of data bytes can be sent on a TCP socket.
1692*ef8d499eSDavid van Moolenbroek  */
1693*ef8d499eSDavid van Moolenbroek static int
tcpsock_test_send(struct sock * sock,size_t min)1694*ef8d499eSDavid van Moolenbroek tcpsock_test_send(struct sock * sock, size_t min)
1695*ef8d499eSDavid van Moolenbroek {
1696*ef8d499eSDavid van Moolenbroek 	struct tcpsock *tcp = (struct tcpsock *)sock;
1697*ef8d499eSDavid van Moolenbroek 	size_t sndbuf;
1698*ef8d499eSDavid van Moolenbroek 
1699*ef8d499eSDavid van Moolenbroek 	if (tcp->tcp_pcb == NULL)
1700*ef8d499eSDavid van Moolenbroek 		return EPIPE;
1701*ef8d499eSDavid van Moolenbroek 
1702*ef8d499eSDavid van Moolenbroek 	switch (tcp->tcp_pcb->state) {
1703*ef8d499eSDavid van Moolenbroek 	case CLOSED:			/* new */
1704*ef8d499eSDavid van Moolenbroek 	case LISTEN:			/* listening */
1705*ef8d499eSDavid van Moolenbroek 		return ENOTCONN;
1706*ef8d499eSDavid van Moolenbroek 	case SYN_SENT:			/* connecting */
1707*ef8d499eSDavid van Moolenbroek 	case SYN_RCVD:			/* simultaneous open, maybe someday? */
1708*ef8d499eSDavid van Moolenbroek 		return SUSPEND;
1709*ef8d499eSDavid van Moolenbroek 	case ESTABLISHED:		/* connected */
1710*ef8d499eSDavid van Moolenbroek 	case CLOSE_WAIT:		/* closed remotely */
1711*ef8d499eSDavid van Moolenbroek 		break;
1712*ef8d499eSDavid van Moolenbroek 	default:			/* shut down locally */
1713*ef8d499eSDavid van Moolenbroek 		assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR));
1714*ef8d499eSDavid van Moolenbroek 		return EPIPE;
1715*ef8d499eSDavid van Moolenbroek 	}
1716*ef8d499eSDavid van Moolenbroek 
1717*ef8d499eSDavid van Moolenbroek 	sndbuf = tcpsock_get_sndbuf(tcp);
1718*ef8d499eSDavid van Moolenbroek 	if (min > sndbuf)
1719*ef8d499eSDavid van Moolenbroek 		min = sndbuf;
1720*ef8d499eSDavid van Moolenbroek 
1721*ef8d499eSDavid van Moolenbroek 	if (tcp->tcp_snd.ts_len + min > sndbuf)
1722*ef8d499eSDavid van Moolenbroek 		return SUSPEND;
1723*ef8d499eSDavid van Moolenbroek 	else
1724*ef8d499eSDavid van Moolenbroek 		return OK;
1725*ef8d499eSDavid van Moolenbroek }
1726*ef8d499eSDavid van Moolenbroek 
/*
 * Send data on a TCP socket.
 *
 * Copy up to 'len' bytes from 'data' onto the socket's send queue, and try to
 * hand as much as possible of the queue to lwIP for transmission.  '*offp' is
 * the progress made by earlier (partial) attempts of the same call, and is
 * updated with the number of bytes newly enqueued here.  'min' is the low
 * send watermark: if fewer than that many bytes can be enqueued, the call is
 * suspended instead.  Return OK on (partial or full) success, SUSPEND if the
 * call should be retried later, or a negative error code.
 */
static int
tcpsock_send(struct sock * sock, const struct sockdriver_data * data,
	size_t len, size_t * offp, const struct sockdriver_data * ctl __unused,
	socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
	const struct sockaddr * addr __unused, socklen_t addr_len __unused,
	endpoint_t user_endpt __unused, int flags __unused, size_t min)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct pbuf *ptail, *pfirst, *pnext, *plast;
	size_t off, tail_off, chunk, left, sndbuf;
	int r;

	/* Recheck the socket state; it may have changed while suspended. */
	if ((r = tcpsock_test_send(sock, min)) != OK)
		return r;

	if (len == 0)
		return OK;	/* nothing to do */

	/* Clamp the low watermark to the configured send buffer size. */
	sndbuf = tcpsock_get_sndbuf(tcp);
	if (min > sndbuf)
		min = sndbuf;
	assert(min > 0);

	/*
	 * 'left' is the number of bytes we will attempt to enqueue now:
	 * whatever still fits in the send buffer, capped to the request size.
	 * tcpsock_test_send() guaranteed there is at least some room.
	 */
	assert(sndbuf > tcp->tcp_snd.ts_len);
	left = sndbuf - tcp->tcp_snd.ts_len;
	if (left > len)
		left = len;

	/*
	 * First see if we can fit any more data in the current tail buffer.
	 * If so, we set 'ptail' to point to it and 'tail_off' to the previous
	 * length of the tail buffer, while optimistically extending it to
	 * include the new data.  If not, we set them to NULL/0.
	 */
	if ((ptail = tcp->tcp_snd.ts_tail) != NULL &&
	    ptail->len < ptail->tot_len) {
		assert(ptail->len > 0);
		tail_off = (size_t)ptail->len;

		/*
		 * Optimistically extend the head buffer to include whatever
		 * fits in it.  This is needed for util_copy_data().
		 */
		assert(ptail->tot_len > ptail->len);
		off = (size_t)ptail->tot_len - (size_t)ptail->len;
		if (off > left)
			off = left;
		ptail->len += off;
	} else {
		ptail = NULL;
		tail_off = 0;
		off = 0;
	}

	/*
	 * Then, if there is more to send, allocate new buffers as needed.  If
	 * we run out of memory, work with whatever we did manage to grab.
	 * 'off' tracks the number of bytes of space obtained so far; 'pfirst'
	 * and 'plast' delimit the chain of newly allocated buffers.
	 */
	pfirst = NULL;
	plast = NULL;
	while (off < left) {
		if (tcpsock_sendbufs >= TCP_MAX_SENDBUFS ||
		    (pnext = tcpsock_alloc_buf()) == NULL) {
			/*
			 * Chances are that we will end up suspending this send
			 * request because of being out of buffers.  We try to
			 * resume such requests from the polling function.
			 */
			tcpsock_set_flag(tcp, TCPF_OOM);

			break;
		}

		tcpsock_sendbufs++;

		if (pfirst == NULL)
			pfirst = pnext;
		else
			plast->next = pnext;
		plast = pnext;

		/* Shrink the last buffer's length to the remainder, if any. */
		chunk = (size_t)pnext->tot_len;
		if (chunk > left - off)
			chunk = left - off;
		pnext->len = chunk;
		off += chunk;
	}

	/*
	 * Copy in the data and continue, unless we did not manage to find
	 * enough space to even meet the low send watermark, in which case we
	 * undo any allocation and suspend the call until later.
	 */
	if (off >= min) {
		/*
		 * Optimistically attach the new buffers to the tail, also for
		 * util_copy_data().  We undo all this if the copy fails.
		 */
		if (ptail != NULL) {
			ptail->next = pfirst;

			pnext = ptail;
		} else
			pnext = pfirst;

		assert(pnext != NULL);

		r = util_copy_data(data, off, *offp, pnext, tail_off,
		    TRUE /*copy_in*/);
	} else
		r = SUSPEND;

	if (r != OK) {
		/* Undo the modifications made so far. */
		while (pfirst != NULL) {
			pnext = pfirst->next;

			assert(tcpsock_sendbufs > 0);
			tcpsock_sendbufs--;

			tcpsock_free_buf(pfirst);

			pfirst = pnext;
		}

		if (ptail != NULL) {
			ptail->next = NULL;

			/* Restore the tail buffer's original length. */
			ptail->len = tail_off;
		}

		return r;
	}

	/* Attach the new buffers, if any, to the buffer tail. */
	if (pfirst != NULL) {
		if ((ptail = tcp->tcp_snd.ts_tail) != NULL) {
			assert(ptail->len == ptail->tot_len);

			/*
			 * Due to our earlier optimistic modifications, this
			 * may or may not be redundant.
			 */
			ptail->next = pfirst;
		}

		assert(plast != NULL);
		tcp->tcp_snd.ts_tail = plast;

		if (tcp->tcp_snd.ts_head == NULL) {
			tcp->tcp_snd.ts_head = pfirst;
			assert(tcp->tcp_snd.ts_head_off == 0);
		}
		if (tcp->tcp_snd.ts_unsent == NULL) {
			tcp->tcp_snd.ts_unsent = pfirst;
			assert(tcp->tcp_snd.ts_unsent_off == 0);
		}
	}

	tcp->tcp_snd.ts_len += off;

	/*
	 * See if we can send any of the data we just enqueued.  The socket is
	 * still open as we are still processing a call from userland on it;
	 * this saves us from having to deal with the cases that the following
	 * calls end up freeing the socket object.
	 */
	if (tcpsock_pcb_enqueue(tcp) &&
	    (r = tcpsock_pcb_send(tcp, FALSE /*raise_error*/)) != OK) {
		/*
		 * That did not go well.  Return the error immediately if we
		 * had not made any progress earlier.  Otherwise, return our
		 * partial progress and leave the error to be picked up later.
		 */
		if (*offp > 0) {
			sockevent_set_error(tcpsock_get_sock(tcp), r);

			return OK;
		} else
			return r;
	}

	*offp += off;
	return (off < len) ? SUSPEND : OK;
}
1915*ef8d499eSDavid van Moolenbroek 
1916*ef8d499eSDavid van Moolenbroek /*
1917*ef8d499eSDavid van Moolenbroek  * Perform preliminary checks on a receive request.
1918*ef8d499eSDavid van Moolenbroek  */
1919*ef8d499eSDavid van Moolenbroek static int
tcpsock_pre_recv(struct sock * sock __unused,endpoint_t user_endpt __unused,int flags)1920*ef8d499eSDavid van Moolenbroek tcpsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused,
1921*ef8d499eSDavid van Moolenbroek 	int flags)
1922*ef8d499eSDavid van Moolenbroek {
1923*ef8d499eSDavid van Moolenbroek 
1924*ef8d499eSDavid van Moolenbroek 	/*
1925*ef8d499eSDavid van Moolenbroek 	 * Reject calls with unknown flags.  Since libsockevent strips out the
1926*ef8d499eSDavid van Moolenbroek 	 * flags it handles itself here, we only have to test for ones we can
1927*ef8d499eSDavid van Moolenbroek 	 * not handle.
1928*ef8d499eSDavid van Moolenbroek 	 */
1929*ef8d499eSDavid van Moolenbroek 	if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0)
1930*ef8d499eSDavid van Moolenbroek 		return EOPNOTSUPP;
1931*ef8d499eSDavid van Moolenbroek 
1932*ef8d499eSDavid van Moolenbroek 	return OK;
1933*ef8d499eSDavid van Moolenbroek }
1934*ef8d499eSDavid van Moolenbroek 
1935*ef8d499eSDavid van Moolenbroek /*
1936*ef8d499eSDavid van Moolenbroek  * Return TRUE if receive calls may wait for more data to come in on the
1937*ef8d499eSDavid van Moolenbroek  * connection, or FALSE if we already know that that is not going to happen.
1938*ef8d499eSDavid van Moolenbroek  */
1939*ef8d499eSDavid van Moolenbroek static int
tcpsock_may_wait(struct tcpsock * tcp)1940*ef8d499eSDavid van Moolenbroek tcpsock_may_wait(struct tcpsock * tcp)
1941*ef8d499eSDavid van Moolenbroek {
1942*ef8d499eSDavid van Moolenbroek 
1943*ef8d499eSDavid van Moolenbroek 	return (tcp->tcp_pcb != NULL &&
1944*ef8d499eSDavid van Moolenbroek 	    !(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN));
1945*ef8d499eSDavid van Moolenbroek }
1946*ef8d499eSDavid van Moolenbroek 
1947*ef8d499eSDavid van Moolenbroek /*
1948*ef8d499eSDavid van Moolenbroek  * Test whether data can be received on a TCP socket, and if so, how many bytes
1949*ef8d499eSDavid van Moolenbroek  * of data.
1950*ef8d499eSDavid van Moolenbroek  */
1951*ef8d499eSDavid van Moolenbroek static int
tcpsock_test_recv(struct sock * sock,size_t min,size_t * size)1952*ef8d499eSDavid van Moolenbroek tcpsock_test_recv(struct sock * sock, size_t min, size_t * size)
1953*ef8d499eSDavid van Moolenbroek {
1954*ef8d499eSDavid van Moolenbroek 	struct tcpsock *tcp = (struct tcpsock *)sock;
1955*ef8d499eSDavid van Moolenbroek 	int may_wait;
1956*ef8d499eSDavid van Moolenbroek 
1957*ef8d499eSDavid van Moolenbroek 	/* If there is and never was a connection, refuse the call at all. */
1958*ef8d499eSDavid van Moolenbroek 	if (tcp->tcp_pcb != NULL && (tcp->tcp_pcb->state == CLOSED ||
1959*ef8d499eSDavid van Moolenbroek 	    tcp->tcp_pcb->state == LISTEN))
1960*ef8d499eSDavid van Moolenbroek 		return ENOTCONN;
1961*ef8d499eSDavid van Moolenbroek 
1962*ef8d499eSDavid van Moolenbroek 	/*
1963*ef8d499eSDavid van Moolenbroek 	 * If we are certain that no more data will come in later, ignore the
1964*ef8d499eSDavid van Moolenbroek 	 * low receive watermark.  Otherwise, bound it to the size of the
1965*ef8d499eSDavid van Moolenbroek 	 * receive buffer, or receive calls may block forever.
1966*ef8d499eSDavid van Moolenbroek 	 */
1967*ef8d499eSDavid van Moolenbroek 	if (!(may_wait = tcpsock_may_wait(tcp)))
1968*ef8d499eSDavid van Moolenbroek 		min = 1;
1969*ef8d499eSDavid van Moolenbroek 	else if (min > tcpsock_get_rcvbuf(tcp))
1970*ef8d499eSDavid van Moolenbroek 		min = tcpsock_get_rcvbuf(tcp);
1971*ef8d499eSDavid van Moolenbroek 
1972*ef8d499eSDavid van Moolenbroek 	if (tcp->tcp_rcv.tr_len >= min) {
1973*ef8d499eSDavid van Moolenbroek 		if (size != NULL)
1974*ef8d499eSDavid van Moolenbroek 			*size = tcp->tcp_rcv.tr_len;
1975*ef8d499eSDavid van Moolenbroek 
1976*ef8d499eSDavid van Moolenbroek 		return OK;
1977*ef8d499eSDavid van Moolenbroek 	}
1978*ef8d499eSDavid van Moolenbroek 
1979*ef8d499eSDavid van Moolenbroek 	return (may_wait) ? SUSPEND : SOCKEVENT_EOF;
1980*ef8d499eSDavid van Moolenbroek }
1981*ef8d499eSDavid van Moolenbroek 
/*
 * Receive data on a TCP socket.  Copy up to 'len' bytes from the socket's
 * receive queue into the caller's buffer, advancing *offp by the number of
 * bytes copied.  Return OK on (partial) success, SUSPEND if the call should
 * block for more data (MSG_WAITALL), or a negative error code.
 */
static int
tcpsock_recv(struct sock * sock, const struct sockdriver_data * data,
	size_t len, size_t * offp, const struct sockdriver_data * ctl __unused,
	socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
	struct sockaddr * addr __unused, socklen_t * addr_len __unused,
	endpoint_t user_endpt __unused, int flags, size_t min,
	int * rflags __unused)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct pbuf *ptail;
	size_t off, left;
	int r;

	/* See if we can receive at all, and if so, how much at most. */
	if ((r = tcpsock_test_recv(sock, min, NULL)) != OK)
		return r;

	if (len == 0)
		return OK;	/* nothing to do */

	/* Copy out no more than is currently queued. */
	off = tcp->tcp_rcv.tr_len;
	if (off > len)
		off = len;

	/*
	 * The queue is expected to be nonempty here (assumes min >= 1 as
	 * passed by the caller -- the test above then guarantees tr_len >=
	 * min).  tr_head_off is the number of bytes of the head buffer that
	 * were consumed by earlier calls; it must leave at least one byte.
	 */
	assert(tcp->tcp_rcv.tr_head != NULL);
	assert(tcp->tcp_rcv.tr_head_off < tcp->tcp_rcv.tr_head->len);

	/* Copy out the data to the caller. */
	if ((r = util_copy_data(data, off, *offp, tcp->tcp_rcv.tr_head,
	    tcp->tcp_rcv.tr_head_off, FALSE /*copy_in*/)) != OK)
		return r;

	/* Unless peeking, remove the data from the receive queue. */
	if (!(flags & MSG_PEEK)) {
		left = off;

		/* Dequeue and free as many entire buffers as possible. */
		while ((ptail = tcp->tcp_rcv.tr_head) != NULL &&
		    left >= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off) {
			left -= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off;

			tcp->tcp_rcv.tr_head = ptail->next;
			tcp->tcp_rcv.tr_head_off = 0;

			/*
			 * Maintain tr_pre_tailp, which points to the 'next'
			 * field linking to the queue's last buffer: clear it
			 * when the queue becomes empty, and re-aim it at the
			 * head pointer when it pointed into the buffer that
			 * is being freed.
			 */
			if (tcp->tcp_rcv.tr_head == NULL)
				tcp->tcp_rcv.tr_pre_tailp = NULL;
			else if (tcp->tcp_rcv.tr_pre_tailp == &ptail->next)
				tcp->tcp_rcv.tr_pre_tailp =
				    &tcp->tcp_rcv.tr_head;

			/* Keep the global receive-buffer count in sync. */
			assert(tcpsock_recvbufs > 0);
			tcpsock_recvbufs--;

			tcpsock_free_buf(ptail);
		}

		/*
		 * If only part of the (new) head buffer is consumed, adjust
		 * the saved offset into that buffer.
		 */
		if (left > 0) {
			assert(tcp->tcp_rcv.tr_head != NULL);
			assert((size_t)tcp->tcp_rcv.tr_head->len -
			    tcp->tcp_rcv.tr_head_off > left);

			tcp->tcp_rcv.tr_head_off += left;
		}

		tcp->tcp_rcv.tr_len -= off;

		/* The queue invariants must still hold after dequeuing. */
		if (tcp->tcp_rcv.tr_head != NULL) {
			assert(tcp->tcp_rcv.tr_pre_tailp != NULL);
			assert(tcp->tcp_rcv.tr_len > 0);
		} else {
			assert(tcp->tcp_rcv.tr_pre_tailp == NULL);
			assert(tcp->tcp_rcv.tr_len == 0);
		}

		/*
		 * The receive buffer has shrunk, so there may now be space to
		 * receive more data.
		 */
		if (tcp->tcp_pcb != NULL)
			tcpsock_ack_recv(tcp);
	} else
		flags &= ~MSG_WAITALL; /* for the check below */

	/* Advance the current copy position, and see if we are done. */
	*offp += off;
	if ((flags & MSG_WAITALL) && off < len && tcpsock_may_wait(tcp))
		return SUSPEND;
	else
		return OK;
}
2079*ef8d499eSDavid van Moolenbroek 
2080*ef8d499eSDavid van Moolenbroek /*
2081*ef8d499eSDavid van Moolenbroek  * Update the set of flag-type socket options on a TCP socket.
2082*ef8d499eSDavid van Moolenbroek  */
2083*ef8d499eSDavid van Moolenbroek static void
tcpsock_setsockmask(struct sock * sock,unsigned int mask)2084*ef8d499eSDavid van Moolenbroek tcpsock_setsockmask(struct sock * sock, unsigned int mask)
2085*ef8d499eSDavid van Moolenbroek {
2086*ef8d499eSDavid van Moolenbroek 	struct tcpsock *tcp = (struct tcpsock *)sock;
2087*ef8d499eSDavid van Moolenbroek 
2088*ef8d499eSDavid van Moolenbroek 	if (tcp->tcp_pcb == NULL)
2089*ef8d499eSDavid van Moolenbroek 		return;
2090*ef8d499eSDavid van Moolenbroek 
2091*ef8d499eSDavid van Moolenbroek 	if (mask & SO_REUSEADDR)
2092*ef8d499eSDavid van Moolenbroek 		ip_set_option(tcp->tcp_pcb, SOF_REUSEADDR);
2093*ef8d499eSDavid van Moolenbroek 	else
2094*ef8d499eSDavid van Moolenbroek 		ip_reset_option(tcp->tcp_pcb, SOF_REUSEADDR);
2095*ef8d499eSDavid van Moolenbroek 
2096*ef8d499eSDavid van Moolenbroek 	if (mask & SO_KEEPALIVE)
2097*ef8d499eSDavid van Moolenbroek 		ip_set_option(tcp->tcp_pcb, SOF_KEEPALIVE);
2098*ef8d499eSDavid van Moolenbroek 	else
2099*ef8d499eSDavid van Moolenbroek 		ip_reset_option(tcp->tcp_pcb, SOF_KEEPALIVE);
2100*ef8d499eSDavid van Moolenbroek }
2101*ef8d499eSDavid van Moolenbroek 
2102*ef8d499eSDavid van Moolenbroek /*
2103*ef8d499eSDavid van Moolenbroek  * Prepare a helper structure for IP-level option processing.
2104*ef8d499eSDavid van Moolenbroek  */
2105*ef8d499eSDavid van Moolenbroek static void
tcpsock_get_ipopts(struct tcpsock * tcp,struct ipopts * ipopts)2106*ef8d499eSDavid van Moolenbroek tcpsock_get_ipopts(struct tcpsock * tcp, struct ipopts * ipopts)
2107*ef8d499eSDavid van Moolenbroek {
2108*ef8d499eSDavid van Moolenbroek 
2109*ef8d499eSDavid van Moolenbroek 	ipopts->local_ip = &tcp->tcp_pcb->local_ip;
2110*ef8d499eSDavid van Moolenbroek 	ipopts->remote_ip = &tcp->tcp_pcb->remote_ip;
2111*ef8d499eSDavid van Moolenbroek 	ipopts->tos = &tcp->tcp_pcb->tos;
2112*ef8d499eSDavid van Moolenbroek 	ipopts->ttl = &tcp->tcp_pcb->ttl;
2113*ef8d499eSDavid van Moolenbroek 	ipopts->sndmin = TCP_SNDBUF_MIN;
2114*ef8d499eSDavid van Moolenbroek 	ipopts->sndmax = TCP_SNDBUF_MAX;
2115*ef8d499eSDavid van Moolenbroek 	ipopts->rcvmin = TCP_RCVBUF_MIN;
2116*ef8d499eSDavid van Moolenbroek 	ipopts->rcvmax = TCP_RCVBUF_MAX;
2117*ef8d499eSDavid van Moolenbroek }
2118*ef8d499eSDavid van Moolenbroek 
2119*ef8d499eSDavid van Moolenbroek /*
2120*ef8d499eSDavid van Moolenbroek  * Set socket options on a TCP socket.
2121*ef8d499eSDavid van Moolenbroek  */
2122*ef8d499eSDavid van Moolenbroek static int
tcpsock_setsockopt(struct sock * sock,int level,int name,const struct sockdriver_data * data,socklen_t len)2123*ef8d499eSDavid van Moolenbroek tcpsock_setsockopt(struct sock * sock, int level, int name,
2124*ef8d499eSDavid van Moolenbroek 	const struct sockdriver_data * data, socklen_t len)
2125*ef8d499eSDavid van Moolenbroek {
2126*ef8d499eSDavid van Moolenbroek 	struct tcpsock *tcp = (struct tcpsock *)sock;
2127*ef8d499eSDavid van Moolenbroek 	struct ipopts ipopts;
2128*ef8d499eSDavid van Moolenbroek 	uint32_t uval;
2129*ef8d499eSDavid van Moolenbroek 	int r, val;
2130*ef8d499eSDavid van Moolenbroek 
2131*ef8d499eSDavid van Moolenbroek 	if (tcp->tcp_pcb == NULL)
2132*ef8d499eSDavid van Moolenbroek 		return ECONNRESET;
2133*ef8d499eSDavid van Moolenbroek 
2134*ef8d499eSDavid van Moolenbroek 	/* Handle TCP-level options. */
2135*ef8d499eSDavid van Moolenbroek 	switch (level) {
2136*ef8d499eSDavid van Moolenbroek 	case IPPROTO_IPV6:
2137*ef8d499eSDavid van Moolenbroek 		switch (name) {
2138*ef8d499eSDavid van Moolenbroek 		case IPV6_RECVTCLASS:
2139*ef8d499eSDavid van Moolenbroek 			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
2140*ef8d499eSDavid van Moolenbroek 			    len)) != OK)
2141*ef8d499eSDavid van Moolenbroek 				return r;
2142*ef8d499eSDavid van Moolenbroek 
2143*ef8d499eSDavid van Moolenbroek 			/*
2144*ef8d499eSDavid van Moolenbroek 			 * This option is not supported for TCP sockets; it
2145*ef8d499eSDavid van Moolenbroek 			 * would not even make sense.  However, named(8)
2146*ef8d499eSDavid van Moolenbroek 			 * insists on trying to set it anyway.  We accept the
2147*ef8d499eSDavid van Moolenbroek 			 * request but ignore the value, not even returning
2148*ef8d499eSDavid van Moolenbroek 			 * what was set through getsockopt(2).
2149*ef8d499eSDavid van Moolenbroek 			 */
2150*ef8d499eSDavid van Moolenbroek 			return OK;
2151*ef8d499eSDavid van Moolenbroek 
2152*ef8d499eSDavid van Moolenbroek 		case IPV6_FAITH:
2153*ef8d499eSDavid van Moolenbroek 			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
2154*ef8d499eSDavid van Moolenbroek 			    len)) != OK)
2155*ef8d499eSDavid van Moolenbroek 				return r;
2156*ef8d499eSDavid van Moolenbroek 
2157*ef8d499eSDavid van Moolenbroek 			/*
2158*ef8d499eSDavid van Moolenbroek 			 * This option is not supported at all, but to save
2159*ef8d499eSDavid van Moolenbroek 			 * ourselves from having to remember the current state
2160*ef8d499eSDavid van Moolenbroek 			 * for getsockopt(2), we also refuse to enable it.
2161*ef8d499eSDavid van Moolenbroek 			 */
2162*ef8d499eSDavid van Moolenbroek 			if (val != 0)
2163*ef8d499eSDavid van Moolenbroek 				return EINVAL;
2164*ef8d499eSDavid van Moolenbroek 
2165*ef8d499eSDavid van Moolenbroek 			return OK;
2166*ef8d499eSDavid van Moolenbroek 		}
2167*ef8d499eSDavid van Moolenbroek 
2168*ef8d499eSDavid van Moolenbroek 		break;
2169*ef8d499eSDavid van Moolenbroek 
2170*ef8d499eSDavid van Moolenbroek 	case IPPROTO_TCP:
2171*ef8d499eSDavid van Moolenbroek 		switch (name) {
2172*ef8d499eSDavid van Moolenbroek 		case TCP_NODELAY:
2173*ef8d499eSDavid van Moolenbroek 			/*
2174*ef8d499eSDavid van Moolenbroek 			 * lwIP's listening TCP PCBs do not have this field.
2175*ef8d499eSDavid van Moolenbroek 			 * If this ever becomes an issue, we can create our own
2176*ef8d499eSDavid van Moolenbroek 			 * shadow flag and do the inheritance ourselves.
2177*ef8d499eSDavid van Moolenbroek 			 */
2178*ef8d499eSDavid van Moolenbroek 			if (tcp->tcp_pcb->state == LISTEN)
2179*ef8d499eSDavid van Moolenbroek 				return EINVAL;
2180*ef8d499eSDavid van Moolenbroek 
2181*ef8d499eSDavid van Moolenbroek 			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
2182*ef8d499eSDavid van Moolenbroek 			    len)) != OK)
2183*ef8d499eSDavid van Moolenbroek 				return r;
2184*ef8d499eSDavid van Moolenbroek 
2185*ef8d499eSDavid van Moolenbroek 			if (val)
2186*ef8d499eSDavid van Moolenbroek 				tcp_nagle_disable(tcp->tcp_pcb);
2187*ef8d499eSDavid van Moolenbroek 			else
2188*ef8d499eSDavid van Moolenbroek 				tcp_nagle_enable(tcp->tcp_pcb);
2189*ef8d499eSDavid van Moolenbroek 
2190*ef8d499eSDavid van Moolenbroek 			return OK;
2191*ef8d499eSDavid van Moolenbroek 
2192*ef8d499eSDavid van Moolenbroek 		case TCP_KEEPIDLE:
2193*ef8d499eSDavid van Moolenbroek 		case TCP_KEEPINTVL:
2194*ef8d499eSDavid van Moolenbroek 			/*
2195*ef8d499eSDavid van Moolenbroek 			 * lwIP's listening TCP PCBs do not have these fields.
2196*ef8d499eSDavid van Moolenbroek 			 */
2197*ef8d499eSDavid van Moolenbroek 			if (tcp->tcp_pcb->state == LISTEN)
2198*ef8d499eSDavid van Moolenbroek 				return EINVAL;
2199*ef8d499eSDavid van Moolenbroek 
2200*ef8d499eSDavid van Moolenbroek 			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
2201*ef8d499eSDavid van Moolenbroek 			    len)) != OK)
2202*ef8d499eSDavid van Moolenbroek 				return r;
2203*ef8d499eSDavid van Moolenbroek 
2204*ef8d499eSDavid van Moolenbroek 			if (val == 0)
2205*ef8d499eSDavid van Moolenbroek 				return EINVAL;
2206*ef8d499eSDavid van Moolenbroek 
2207*ef8d499eSDavid van Moolenbroek 			/*
2208*ef8d499eSDavid van Moolenbroek 			 * The given value is unsigned, but lwIP stores the
2209*ef8d499eSDavid van Moolenbroek 			 * value in milliseconds in a uint32_t field, so we
2210*ef8d499eSDavid van Moolenbroek 			 * have to limit large values to whatever fits in the
2211*ef8d499eSDavid van Moolenbroek 			 * field anyway.
2212*ef8d499eSDavid van Moolenbroek 			 */
2213*ef8d499eSDavid van Moolenbroek 			if (val < 0 || (uint32_t)val > UINT32_MAX / 1000)
2214*ef8d499eSDavid van Moolenbroek 				uval = UINT32_MAX;
2215*ef8d499eSDavid van Moolenbroek 			else
2216*ef8d499eSDavid van Moolenbroek 				uval = (uint32_t)val * 1000;
2217*ef8d499eSDavid van Moolenbroek 
2218*ef8d499eSDavid van Moolenbroek 			if (name == TCP_KEEPIDLE)
2219*ef8d499eSDavid van Moolenbroek 				tcp->tcp_pcb->keep_idle = uval;
2220*ef8d499eSDavid van Moolenbroek 			else
2221*ef8d499eSDavid van Moolenbroek 				tcp->tcp_pcb->keep_intvl = uval;
2222*ef8d499eSDavid van Moolenbroek 
2223*ef8d499eSDavid van Moolenbroek 			return OK;
2224*ef8d499eSDavid van Moolenbroek 
2225*ef8d499eSDavid van Moolenbroek 		case TCP_KEEPCNT:
2226*ef8d499eSDavid van Moolenbroek 			/* lwIP's listening TCP PCBs do not have this field. */
2227*ef8d499eSDavid van Moolenbroek 			if (tcp->tcp_pcb->state == LISTEN)
2228*ef8d499eSDavid van Moolenbroek 				return EINVAL;
2229*ef8d499eSDavid van Moolenbroek 
2230*ef8d499eSDavid van Moolenbroek 			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
2231*ef8d499eSDavid van Moolenbroek 			    len)) != OK)
2232*ef8d499eSDavid van Moolenbroek 				return r;
2233*ef8d499eSDavid van Moolenbroek 
2234*ef8d499eSDavid van Moolenbroek 			if (val == 0)
2235*ef8d499eSDavid van Moolenbroek 				return EINVAL;
2236*ef8d499eSDavid van Moolenbroek 
2237*ef8d499eSDavid van Moolenbroek 			tcp->tcp_pcb->keep_cnt = (uint32_t)val;
2238*ef8d499eSDavid van Moolenbroek 
2239*ef8d499eSDavid van Moolenbroek 			return OK;
2240*ef8d499eSDavid van Moolenbroek 		}
2241*ef8d499eSDavid van Moolenbroek 
2242*ef8d499eSDavid van Moolenbroek 		return EOPNOTSUPP;
2243*ef8d499eSDavid van Moolenbroek 	}
2244*ef8d499eSDavid van Moolenbroek 
2245*ef8d499eSDavid van Moolenbroek 	/* Handle all other options at the IP level. */
2246*ef8d499eSDavid van Moolenbroek 	tcpsock_get_ipopts(tcp, &ipopts);
2247*ef8d499eSDavid van Moolenbroek 
2248*ef8d499eSDavid van Moolenbroek 	return ipsock_setsockopt(tcpsock_get_ipsock(tcp), level, name, data,
2249*ef8d499eSDavid van Moolenbroek 	    len, &ipopts);
2250*ef8d499eSDavid van Moolenbroek }
2251*ef8d499eSDavid van Moolenbroek 
2252*ef8d499eSDavid van Moolenbroek /*
2253*ef8d499eSDavid van Moolenbroek  * Retrieve socket options on a TCP socket.
2254*ef8d499eSDavid van Moolenbroek  */
2255*ef8d499eSDavid van Moolenbroek static int
tcpsock_getsockopt(struct sock * sock,int level,int name,const struct sockdriver_data * data,socklen_t * len)2256*ef8d499eSDavid van Moolenbroek tcpsock_getsockopt(struct sock * sock, int level, int name,
2257*ef8d499eSDavid van Moolenbroek 	const struct sockdriver_data * data, socklen_t * len)
2258*ef8d499eSDavid van Moolenbroek {
2259*ef8d499eSDavid van Moolenbroek 	struct tcpsock *tcp = (struct tcpsock *)sock;
2260*ef8d499eSDavid van Moolenbroek 	struct ipopts ipopts;
2261*ef8d499eSDavid van Moolenbroek 	int val;
2262*ef8d499eSDavid van Moolenbroek 
2263*ef8d499eSDavid van Moolenbroek 	if (tcp->tcp_pcb == NULL)
2264*ef8d499eSDavid van Moolenbroek 		return ECONNRESET;
2265*ef8d499eSDavid van Moolenbroek 
2266*ef8d499eSDavid van Moolenbroek 	/* Handle TCP-level options. */
2267*ef8d499eSDavid van Moolenbroek 	switch (level) {
2268*ef8d499eSDavid van Moolenbroek 	case IPPROTO_IPV6:
2269*ef8d499eSDavid van Moolenbroek 		switch (name) {
2270*ef8d499eSDavid van Moolenbroek 		case IPV6_RECVTCLASS:
2271*ef8d499eSDavid van Moolenbroek 		case IPV6_FAITH:
2272*ef8d499eSDavid van Moolenbroek 			val = 0;
2273*ef8d499eSDavid van Moolenbroek 
2274*ef8d499eSDavid van Moolenbroek 			return sockdriver_copyout_opt(data, &val, sizeof(val),
2275*ef8d499eSDavid van Moolenbroek 			    len);
2276*ef8d499eSDavid van Moolenbroek 		}
2277*ef8d499eSDavid van Moolenbroek 
2278*ef8d499eSDavid van Moolenbroek 		break;
2279*ef8d499eSDavid van Moolenbroek 
2280*ef8d499eSDavid van Moolenbroek 	case IPPROTO_TCP:
2281*ef8d499eSDavid van Moolenbroek 		switch (name) {
2282*ef8d499eSDavid van Moolenbroek 		case TCP_NODELAY:
2283*ef8d499eSDavid van Moolenbroek 			/* lwIP's listening TCP PCBs do not have this field. */
2284*ef8d499eSDavid van Moolenbroek 			if (tcp->tcp_pcb->state == LISTEN)
2285*ef8d499eSDavid van Moolenbroek 				return EINVAL;
2286*ef8d499eSDavid van Moolenbroek 
2287*ef8d499eSDavid van Moolenbroek 			val = tcp_nagle_disabled(tcp->tcp_pcb);
2288*ef8d499eSDavid van Moolenbroek 
2289*ef8d499eSDavid van Moolenbroek 			return sockdriver_copyout_opt(data, &val, sizeof(val),
2290*ef8d499eSDavid van Moolenbroek 			    len);
2291*ef8d499eSDavid van Moolenbroek 
2292*ef8d499eSDavid van Moolenbroek 		case TCP_MAXSEG:
2293*ef8d499eSDavid van Moolenbroek 			/* lwIP's listening TCP PCBs do not have this field. */
2294*ef8d499eSDavid van Moolenbroek 			if (tcp->tcp_pcb->state == LISTEN)
2295*ef8d499eSDavid van Moolenbroek 				return EINVAL;
2296*ef8d499eSDavid van Moolenbroek 
2297*ef8d499eSDavid van Moolenbroek 			/* This option is read-only at this time. */
2298*ef8d499eSDavid van Moolenbroek 			val = tcp->tcp_pcb->mss;
2299*ef8d499eSDavid van Moolenbroek 
2300*ef8d499eSDavid van Moolenbroek 			return sockdriver_copyout_opt(data, &val, sizeof(val),
2301*ef8d499eSDavid van Moolenbroek 			    len);
2302*ef8d499eSDavid van Moolenbroek 
2303*ef8d499eSDavid van Moolenbroek 		case TCP_KEEPIDLE:
2304*ef8d499eSDavid van Moolenbroek 			/* lwIP's listening TCP PCBs do not have this field. */
2305*ef8d499eSDavid van Moolenbroek 			if (tcp->tcp_pcb->state == LISTEN)
2306*ef8d499eSDavid van Moolenbroek 				return EINVAL;
2307*ef8d499eSDavid van Moolenbroek 
2308*ef8d499eSDavid van Moolenbroek 			val = (int)(tcp->tcp_pcb->keep_idle / 1000);
2309*ef8d499eSDavid van Moolenbroek 
2310*ef8d499eSDavid van Moolenbroek 			return sockdriver_copyout_opt(data, &val, sizeof(val),
2311*ef8d499eSDavid van Moolenbroek 			    len);
2312*ef8d499eSDavid van Moolenbroek 
2313*ef8d499eSDavid van Moolenbroek 		case TCP_KEEPINTVL:
2314*ef8d499eSDavid van Moolenbroek 			/* lwIP's listening TCP PCBs do not have this field. */
2315*ef8d499eSDavid van Moolenbroek 			if (tcp->tcp_pcb->state == LISTEN)
2316*ef8d499eSDavid van Moolenbroek 				return EINVAL;
2317*ef8d499eSDavid van Moolenbroek 
2318*ef8d499eSDavid van Moolenbroek 			val = (int)(tcp->tcp_pcb->keep_intvl / 1000);
2319*ef8d499eSDavid van Moolenbroek 
2320*ef8d499eSDavid van Moolenbroek 			return sockdriver_copyout_opt(data, &val, sizeof(val),
2321*ef8d499eSDavid van Moolenbroek 			    len);
2322*ef8d499eSDavid van Moolenbroek 
2323*ef8d499eSDavid van Moolenbroek 		case TCP_KEEPCNT:
2324*ef8d499eSDavid van Moolenbroek 			/* lwIP's listening TCP PCBs do not have this field. */
2325*ef8d499eSDavid van Moolenbroek 			if (tcp->tcp_pcb->state == LISTEN)
2326*ef8d499eSDavid van Moolenbroek 				return EINVAL;
2327*ef8d499eSDavid van Moolenbroek 
2328*ef8d499eSDavid van Moolenbroek 			val = (int)tcp->tcp_pcb->keep_cnt;
2329*ef8d499eSDavid van Moolenbroek 
2330*ef8d499eSDavid van Moolenbroek 			return sockdriver_copyout_opt(data, &val, sizeof(val),
2331*ef8d499eSDavid van Moolenbroek 			    len);
2332*ef8d499eSDavid van Moolenbroek 		}
2333*ef8d499eSDavid van Moolenbroek 
2334*ef8d499eSDavid van Moolenbroek 		return EOPNOTSUPP;
2335*ef8d499eSDavid van Moolenbroek 	}
2336*ef8d499eSDavid van Moolenbroek 
2337*ef8d499eSDavid van Moolenbroek 	/* Handle all other options at the IP level. */
2338*ef8d499eSDavid van Moolenbroek 	tcpsock_get_ipopts(tcp, &ipopts);
2339*ef8d499eSDavid van Moolenbroek 
2340*ef8d499eSDavid van Moolenbroek 	return ipsock_getsockopt(tcpsock_get_ipsock(tcp), level, name, data,
2341*ef8d499eSDavid van Moolenbroek 	    len, &ipopts);
2342*ef8d499eSDavid van Moolenbroek }
2343*ef8d499eSDavid van Moolenbroek 
2344*ef8d499eSDavid van Moolenbroek /*
2345*ef8d499eSDavid van Moolenbroek  * Retrieve the local socket address of a TCP socket.
2346*ef8d499eSDavid van Moolenbroek  */
2347*ef8d499eSDavid van Moolenbroek static int
tcpsock_getsockname(struct sock * sock,struct sockaddr * addr,socklen_t * addr_len)2348*ef8d499eSDavid van Moolenbroek tcpsock_getsockname(struct sock * sock, struct sockaddr * addr,
2349*ef8d499eSDavid van Moolenbroek 	socklen_t * addr_len)
2350*ef8d499eSDavid van Moolenbroek {
2351*ef8d499eSDavid van Moolenbroek 	struct tcpsock *tcp = (struct tcpsock *)sock;
2352*ef8d499eSDavid van Moolenbroek 
2353*ef8d499eSDavid van Moolenbroek 	if (tcp->tcp_pcb == NULL)
2354*ef8d499eSDavid van Moolenbroek 		return EINVAL;
2355*ef8d499eSDavid van Moolenbroek 
2356*ef8d499eSDavid van Moolenbroek 	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
2357*ef8d499eSDavid van Moolenbroek 	    &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port);
2358*ef8d499eSDavid van Moolenbroek 
2359*ef8d499eSDavid van Moolenbroek 	return OK;
2360*ef8d499eSDavid van Moolenbroek }
2361*ef8d499eSDavid van Moolenbroek 
2362*ef8d499eSDavid van Moolenbroek /*
2363*ef8d499eSDavid van Moolenbroek  * Retrieve the remote socket address of a TCP socket.
2364*ef8d499eSDavid van Moolenbroek  */
2365*ef8d499eSDavid van Moolenbroek static int
tcpsock_getpeername(struct sock * sock,struct sockaddr * addr,socklen_t * addr_len)2366*ef8d499eSDavid van Moolenbroek tcpsock_getpeername(struct sock * sock, struct sockaddr * addr,
2367*ef8d499eSDavid van Moolenbroek 	socklen_t * addr_len)
2368*ef8d499eSDavid van Moolenbroek {
2369*ef8d499eSDavid van Moolenbroek 	struct tcpsock *tcp = (struct tcpsock *)sock;
2370*ef8d499eSDavid van Moolenbroek 
2371*ef8d499eSDavid van Moolenbroek 	if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED ||
2372*ef8d499eSDavid van Moolenbroek 	    tcp->tcp_pcb->state == LISTEN || tcp->tcp_pcb->state == SYN_SENT)
2373*ef8d499eSDavid van Moolenbroek 		return ENOTCONN;
2374*ef8d499eSDavid van Moolenbroek 
2375*ef8d499eSDavid van Moolenbroek 	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
2376*ef8d499eSDavid van Moolenbroek 	    &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port);
2377*ef8d499eSDavid van Moolenbroek 
2378*ef8d499eSDavid van Moolenbroek 	return OK;
2379*ef8d499eSDavid van Moolenbroek }
2380*ef8d499eSDavid van Moolenbroek 
2381*ef8d499eSDavid van Moolenbroek /*
2382*ef8d499eSDavid van Moolenbroek  * Perform a TCP half-close on a TCP socket.  This operation may not complete
2383*ef8d499eSDavid van Moolenbroek  * immediately due to memory conditions, in which case it will be completed at
2384*ef8d499eSDavid van Moolenbroek  * a later time.
2385*ef8d499eSDavid van Moolenbroek  */
2386*ef8d499eSDavid van Moolenbroek static void
tcpsock_send_fin(struct tcpsock * tcp)2387*ef8d499eSDavid van Moolenbroek tcpsock_send_fin(struct tcpsock * tcp)
2388*ef8d499eSDavid van Moolenbroek {
2389*ef8d499eSDavid van Moolenbroek 
2390*ef8d499eSDavid van Moolenbroek 	sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_WR);
2391*ef8d499eSDavid van Moolenbroek 
2392*ef8d499eSDavid van Moolenbroek 	/*
2393*ef8d499eSDavid van Moolenbroek 	 * Attempt to send the FIN.  If a fatal error occurs as a result, raise
2394*ef8d499eSDavid van Moolenbroek 	 * it as an asynchronous error, because this function's callers cannot
2395*ef8d499eSDavid van Moolenbroek 	 * do much with it.  That happens to match the way these functions are
2396*ef8d499eSDavid van Moolenbroek 	 * used elsewhere.  In any case, as a result, the PCB may be closed.
2397*ef8d499eSDavid van Moolenbroek 	 * However, we are never called from a situation where the socket is
2398*ef8d499eSDavid van Moolenbroek 	 * being closed here, so the socket object will not be freed either.
2399*ef8d499eSDavid van Moolenbroek 	 */
2400*ef8d499eSDavid van Moolenbroek 	if (tcpsock_pcb_enqueue(tcp)) {
2401*ef8d499eSDavid van Moolenbroek 		assert(!sockevent_is_closing(tcpsock_get_sock(tcp)));
2402*ef8d499eSDavid van Moolenbroek 
2403*ef8d499eSDavid van Moolenbroek 		if (tcpsock_may_close(tcp))
2404*ef8d499eSDavid van Moolenbroek 			tcpsock_finish_close(tcp);
2405*ef8d499eSDavid van Moolenbroek 		else
2406*ef8d499eSDavid van Moolenbroek 			(void)tcpsock_pcb_send(tcp, TRUE /*raise_error*/);
2407*ef8d499eSDavid van Moolenbroek 	}
2408*ef8d499eSDavid van Moolenbroek }
2409*ef8d499eSDavid van Moolenbroek 
2410*ef8d499eSDavid van Moolenbroek /*
2411*ef8d499eSDavid van Moolenbroek  * Shut down a TCP socket for reading and/or writing.
2412*ef8d499eSDavid van Moolenbroek  */
2413*ef8d499eSDavid van Moolenbroek static int
tcpsock_shutdown(struct sock * sock,unsigned int mask)2414*ef8d499eSDavid van Moolenbroek tcpsock_shutdown(struct sock * sock, unsigned int mask)
2415*ef8d499eSDavid van Moolenbroek {
2416*ef8d499eSDavid van Moolenbroek 	struct tcpsock *tcp = (struct tcpsock *)sock;
2417*ef8d499eSDavid van Moolenbroek 
2418*ef8d499eSDavid van Moolenbroek 	/*
2419*ef8d499eSDavid van Moolenbroek 	 * If the PCB is gone, we want to allow shutdowns for reading but not
2420*ef8d499eSDavid van Moolenbroek 	 * writing: shutting down for writing affects the PCB, shutting down
2421*ef8d499eSDavid van Moolenbroek 	 * for reading does not.  Also, if the PCB is in CLOSED state, we would
2422*ef8d499eSDavid van Moolenbroek 	 * not know how to deal with subsequent operations after a shutdown for
2423*ef8d499eSDavid van Moolenbroek 	 * writing, so forbid such calls altogether.
2424*ef8d499eSDavid van Moolenbroek 	 */
2425*ef8d499eSDavid van Moolenbroek 	if ((tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED) &&
2426*ef8d499eSDavid van Moolenbroek 	    (mask & SFL_SHUT_WR))
2427*ef8d499eSDavid van Moolenbroek 		return ENOTCONN;
2428*ef8d499eSDavid van Moolenbroek 
2429*ef8d499eSDavid van Moolenbroek 	/*
2430*ef8d499eSDavid van Moolenbroek 	 * Handle listening sockets as a special case.  Shutting down a
2431*ef8d499eSDavid van Moolenbroek 	 * listening socket frees its PCB.  Sockets pending on the accept queue
2432*ef8d499eSDavid van Moolenbroek 	 * may still be accepted, but after that, accept(2) will start
2433*ef8d499eSDavid van Moolenbroek 	 * returning ECONNABORTED.  This feature allows multi-process server
2434*ef8d499eSDavid van Moolenbroek 	 * applications to shut down gracefully, supposedly..
2435*ef8d499eSDavid van Moolenbroek 	 */
2436*ef8d499eSDavid van Moolenbroek 	if (tcpsock_is_listening(tcp)) {
2437*ef8d499eSDavid van Moolenbroek 		if (tcp->tcp_pcb != NULL)
2438*ef8d499eSDavid van Moolenbroek 			tcpsock_pcb_close(tcp);
2439*ef8d499eSDavid van Moolenbroek 
2440*ef8d499eSDavid van Moolenbroek 		return OK;
2441*ef8d499eSDavid van Moolenbroek 	}
2442*ef8d499eSDavid van Moolenbroek 
2443*ef8d499eSDavid van Moolenbroek 	/*
2444*ef8d499eSDavid van Moolenbroek 	 * We control shutdown-for-reading locally, and intentially do not tell
2445*ef8d499eSDavid van Moolenbroek 	 * lwIP about it: if we do that and also shut down for writing, the PCB
2446*ef8d499eSDavid van Moolenbroek 	 * may disappear (now or eventually), which is not what we want.
2447*ef8d499eSDavid van Moolenbroek 	 * Instead, we only tell lwIP to shut down for reading once we actually
2448*ef8d499eSDavid van Moolenbroek 	 * want to get rid of the PCB, using tcp_close().  In the meantime, if
2449*ef8d499eSDavid van Moolenbroek 	 * the socket is shut down for reading by the user, we simply discard
2450*ef8d499eSDavid van Moolenbroek 	 * received data as fast as we can--one out of a number of possible
2451*ef8d499eSDavid van Moolenbroek 	 * design choices there, and (reportedly) the one used by the BSDs.
2452*ef8d499eSDavid van Moolenbroek 	 */
2453*ef8d499eSDavid van Moolenbroek 	if (mask & SFL_SHUT_RD)
2454*ef8d499eSDavid van Moolenbroek 		(void)tcpsock_clear_recv(tcp, TRUE /*ack_data*/);
2455*ef8d499eSDavid van Moolenbroek 
2456*ef8d499eSDavid van Moolenbroek 	/*
2457*ef8d499eSDavid van Moolenbroek 	 * Shutting down for writing a connecting socket simply closes its PCB.
2458*ef8d499eSDavid van Moolenbroek 	 * Closing a PCB in SYN_SENT state simply deallocates it, so this can
2459*ef8d499eSDavid van Moolenbroek 	 * not fail.  On the other hand, for connected sockets we want to send
2460*ef8d499eSDavid van Moolenbroek 	 * a FIN, which may fail due to memory shortage, in which case we have
2461*ef8d499eSDavid van Moolenbroek 	 * to try again later..
2462*ef8d499eSDavid van Moolenbroek 	 */
2463*ef8d499eSDavid van Moolenbroek 	if (mask & SFL_SHUT_WR) {
2464*ef8d499eSDavid van Moolenbroek 		if (tcp->tcp_pcb->state == SYN_SENT)
2465*ef8d499eSDavid van Moolenbroek 			tcpsock_pcb_close(tcp);
2466*ef8d499eSDavid van Moolenbroek 		else if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR))
2467*ef8d499eSDavid van Moolenbroek 			tcpsock_send_fin(tcp);
2468*ef8d499eSDavid van Moolenbroek 	}
2469*ef8d499eSDavid van Moolenbroek 
2470*ef8d499eSDavid van Moolenbroek 	return OK;
2471*ef8d499eSDavid van Moolenbroek }
2472*ef8d499eSDavid van Moolenbroek 
/*
 * Close a TCP socket.  Complete the operation immediately if possible, or
 * otherwise initiate the closing process and complete it later, notifying
 * libsockevent about that as well.  Depending on linger settings, this
 * function may be called twice on the same socket: the first time with the
 * 'force' flag cleared, and the second time with the 'force' flag set.
 * Returns OK if the close completed right away, or SUSPEND if it will be
 * completed asynchronously.
 */
static int
tcpsock_close(struct sock * sock, int force)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct tcpsock *queued;
	size_t rlen;

	/* A socket still on a listener's accept queue cannot be closed. */
	assert(tcp->tcp_listener == NULL);

	/*
	 * If this was a listening socket, abort and clean up any and all
	 * connections on its listener queue.  Note that the listening socket
	 * may or may not have a PCB at this point.
	 */
	if (tcpsock_is_listening(tcp)) {
		while (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head)) {
			queued = TAILQ_FIRST(&tcp->tcp_queue.tq_head);

			tcpsock_pcb_abort(queued);

			(void)tcpsock_cleanup(queued, TRUE /*may_free*/);
		}
	}

	/*
	 * Clear the receive queue, and make sure that we no longer add new
	 * data to it.  The latter is relevant only for the case that we end up
	 * returning SUSPEND below.  Remember whether there were bytes left,
	 * because we should reset the connection if there were.
	 */
	rlen = tcpsock_clear_recv(tcp, FALSE /*ack_data*/);

	sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_RD);

	/*
	 * If the socket is connected, perform a graceful shutdown, unless 1)
	 * we are asked to force-close the socket, or 2) if the local side has
	 * not consumed all data, as per RFC 1122 Sec.4.2.2.13.  Normally lwIP
	 * would take care of the second point, but we may have data in our
	 * receive buffer of which lwIP is not aware.
	 *
	 * Implementing proper linger support is somewhat difficult with lwIP.
	 * In particular, we cannot reliably wait for our FIN to be ACK'ed by
	 * the other side in all cases:
	 *
	 * - the lwIP TCP transition from states CLOSING to TIME_WAIT does not
	 *   trigger any event and once in the TIME_WAIT state, the poll event
	 *   no longer triggers either;
	 * - the lwIP TCP transition from states FIN_WAIT_1 and FIN_WAIT_2 to
	 *   TIME_WAIT will trigger a receive event, but it is not clear
	 *   whether we can reliably check that our FIN was ACK'ed from there.
	 *
	 * That means we have to compromise.  Instead of the proper approach,
	 * we complete our side of the close operation whenever:
	 *
	 * 1. all of our data was acknowledged, AND,
	 * 2. our FIN was sent, AND,
	 * 3a. our FIN was acknowledged, OR,
	 * 3b. we received a FIN from the other side.
	 *
	 * With the addition of the rule 3b, we do not run into the above
	 * reliability problems, but we may return from SO_LINGER-blocked close
	 * calls too early and thus give callers a false impression of success.
	 * TODO: if lwIP ever gets improved on this point, the code in this
	 * module should be rewritten to make use of the improvements.
	 *
	 * The set of rules is basically the same as for closing the PCB early
	 * as per tcpsock_may_close(), except with the check for our FIN being
	 * acknowledged.  Unfortunately only the FIN_WAIT_2, TIME_WAIT, and
	 * (reentered) CLOSED TCP states guarantee that there are no
	 * unacknowledged data segments anymore, so we may have to wait for
	 * reaching any one of these before we can actually finish closing the
	 * socket with tcp_close().
	 *
	 * In addition, lwIP does not tell us when our FIN gets acknowledged,
	 * so we have to use polling and direct access to lwIP's PCB fields
	 * instead, just like lwIP's BSD API does.  There is no other way.
	 * Also, we may not even be able to send the FIN right away, in which
	 * case we must defer that until later.
	 */
	if (tcp->tcp_pcb != NULL) {
		switch (tcp->tcp_pcb->state) {
		case CLOSE_WAIT:
		case CLOSING:
		case LAST_ACK:
			/* These states are reachable only after a FIN. */
			assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);

			/* FALLTHROUGH */
		case SYN_RCVD:
		case ESTABLISHED:
		case FIN_WAIT_1:
			/* First check if we should abort the connection. */
			if (force || rlen > 0)
				break;

			/*
			 * If we have not sent a FIN yet, try sending it now;
			 * if all other conditions are met for closing the
			 * socket, successful FIN transmission will complete
			 * the close.  Otherwise, perform the close check
			 * explicitly.
			 */
			if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR))
				tcpsock_send_fin(tcp);
			else if (tcpsock_may_close(tcp))
				tcpsock_pcb_close(tcp);

			/*
			 * If at this point the PCB is gone, we managed to
			 * close the connection immediately, and the socket has
			 * already been cleaned up by now.  This may occur if
			 * there is no unacknowledged data and we already
			 * received a FIN earlier on.
			 */
			if (tcp->tcp_pcb == NULL)
				return OK;

			/*
			 * Complete the close operation at a later time.
			 * Adjust the polling interval, so that we can detect
			 * completion of the close as quickly as possible.
			 */
			tcp_poll(tcp->tcp_pcb, tcpsock_event_poll,
			    TCP_POLL_CLOSE_INTERVAL);

			return SUSPEND;

		default:
			/*
			 * The connection is either not yet established, or
			 * already in a state where we can close it right now.
			 */
			tcpsock_pcb_close(tcp);
		}
	}

	/*
	 * Abort the connection if the PCB is still around, and clean up the
	 * socket.  We cannot let tcpsock_cleanup() free the socket object yet,
	 * because we are still in the callback from libsockevent, and the
	 * latter cannot handle the socket object being freed from here.
	 */
	if (tcp->tcp_pcb != NULL)
		tcpsock_pcb_abort(tcp);

	(void)tcpsock_cleanup(tcp, FALSE /*may_free*/);

	return OK;
}
2629*ef8d499eSDavid van Moolenbroek 
2630*ef8d499eSDavid van Moolenbroek /*
2631*ef8d499eSDavid van Moolenbroek  * Free up a closed TCP socket.
2632*ef8d499eSDavid van Moolenbroek  */
2633*ef8d499eSDavid van Moolenbroek static void
tcpsock_free(struct sock * sock)2634*ef8d499eSDavid van Moolenbroek tcpsock_free(struct sock * sock)
2635*ef8d499eSDavid van Moolenbroek {
2636*ef8d499eSDavid van Moolenbroek 	struct tcpsock *tcp = (struct tcpsock *)sock;
2637*ef8d499eSDavid van Moolenbroek 
2638*ef8d499eSDavid van Moolenbroek 	assert(tcp->tcp_pcb == NULL);
2639*ef8d499eSDavid van Moolenbroek 	assert(tcp->tcp_snd.ts_len == 0);
2640*ef8d499eSDavid van Moolenbroek 	assert(tcp->tcp_snd.ts_head == NULL);
2641*ef8d499eSDavid van Moolenbroek 	assert(tcp->tcp_rcv.tr_len == 0);
2642*ef8d499eSDavid van Moolenbroek 	assert(tcp->tcp_rcv.tr_head == NULL);
2643*ef8d499eSDavid van Moolenbroek 
2644*ef8d499eSDavid van Moolenbroek 	TAILQ_INSERT_HEAD(&tcp_freelist, tcp, tcp_queue.tq_next);
2645*ef8d499eSDavid van Moolenbroek }
2646*ef8d499eSDavid van Moolenbroek 
/*
 * This table maps TCP states from lwIP numbers to NetBSD numbers, for
 * reporting through sysctl(7).  The table is indexed by the lwIP state value.
 */
static const struct {
	int tsm_tstate;		/* NetBSD TCP state (TCPS_*) */
	int tsm_sostate;	/* NetBSD socket state bits (SS_*), if any */
} tcpsock_statemap[] = {
	[CLOSED]	= { TCPS_CLOSED,	SS_ISDISCONNECTED	},
	[LISTEN]	= { TCPS_LISTEN,	0			},
	[SYN_SENT]	= { TCPS_SYN_SENT,	SS_ISCONNECTING		},
	[SYN_RCVD]	= { TCPS_SYN_RECEIVED,	SS_ISCONNECTING		},
	[ESTABLISHED]	= { TCPS_ESTABLISHED,	SS_ISCONNECTED		},
	[FIN_WAIT_1]	= { TCPS_FIN_WAIT_1,	SS_ISDISCONNECTING	},
	[FIN_WAIT_2]	= { TCPS_FIN_WAIT_2,	SS_ISDISCONNECTING	},
	[CLOSE_WAIT]	= { TCPS_CLOSE_WAIT,	SS_ISCONNECTED		},
	[CLOSING]	= { TCPS_CLOSING,	SS_ISDISCONNECTING	},
	[LAST_ACK]	= { TCPS_LAST_ACK,	SS_ISDISCONNECTING	},
	[TIME_WAIT]	= { TCPS_TIME_WAIT,	SS_ISDISCONNECTED	},
};
2664*ef8d499eSDavid van Moolenbroek 
2665*ef8d499eSDavid van Moolenbroek /*
2666*ef8d499eSDavid van Moolenbroek  * Fill the given kinfo_pcb sysctl(7) structure with information about the TCP
2667*ef8d499eSDavid van Moolenbroek  * PCB identified by the given pointer.
2668*ef8d499eSDavid van Moolenbroek  */
2669*ef8d499eSDavid van Moolenbroek static void
tcpsock_get_info(struct kinfo_pcb * ki,const void * ptr)2670*ef8d499eSDavid van Moolenbroek tcpsock_get_info(struct kinfo_pcb * ki, const void * ptr)
2671*ef8d499eSDavid van Moolenbroek {
2672*ef8d499eSDavid van Moolenbroek 	const struct tcp_pcb *pcb = (const struct tcp_pcb *)ptr;
2673*ef8d499eSDavid van Moolenbroek 	struct tcpsock *tcp;
2674*ef8d499eSDavid van Moolenbroek 
2675*ef8d499eSDavid van Moolenbroek 	/*
2676*ef8d499eSDavid van Moolenbroek 	 * Not all TCP PCBs have an associated tcpsock structure.  We are
2677*ef8d499eSDavid van Moolenbroek 	 * careful enough clearing the callback argument for PCBs on any of the
2678*ef8d499eSDavid van Moolenbroek 	 * TCP lists that we can use that callback argument to determine
2679*ef8d499eSDavid van Moolenbroek 	 * whether there is an associated tcpsock structure, although with one
2680*ef8d499eSDavid van Moolenbroek 	 * exception: PCBs for incoming connections that have not yet been
2681*ef8d499eSDavid van Moolenbroek 	 * fully established (i.e., in SYN_RCVD state).  These will have the
2682*ef8d499eSDavid van Moolenbroek 	 * callback argument of the listening socket (which itself may already
2683*ef8d499eSDavid van Moolenbroek 	 * have been deallocated at this point) but should not be considered as
2684*ef8d499eSDavid van Moolenbroek 	 * associated with the listening socket's tcpsock structure.
2685*ef8d499eSDavid van Moolenbroek 	 */
2686*ef8d499eSDavid van Moolenbroek 	if (pcb->callback_arg != NULL && pcb->state != SYN_RCVD) {
2687*ef8d499eSDavid van Moolenbroek 		tcp = (struct tcpsock *)pcb->callback_arg;
2688*ef8d499eSDavid van Moolenbroek 		assert(tcp >= tcp_array &&
2689*ef8d499eSDavid van Moolenbroek 		    tcp < &tcp_array[__arraycount(tcp_array)]);
2690*ef8d499eSDavid van Moolenbroek 
2691*ef8d499eSDavid van Moolenbroek 		/* TODO: change this so that sockstat(1) may work one day. */
2692*ef8d499eSDavid van Moolenbroek 		ki->ki_sockaddr = (uint64_t)(uintptr_t)tcpsock_get_sock(tcp);
2693*ef8d499eSDavid van Moolenbroek 	} else {
2694*ef8d499eSDavid van Moolenbroek 		/* No tcpsock.  Could also be in TIME_WAIT state etc. */
2695*ef8d499eSDavid van Moolenbroek 		tcp = NULL;
2696*ef8d499eSDavid van Moolenbroek 
2697*ef8d499eSDavid van Moolenbroek 		ki->ki_sostate = SS_NOFDREF;
2698*ef8d499eSDavid van Moolenbroek 	}
2699*ef8d499eSDavid van Moolenbroek 
2700*ef8d499eSDavid van Moolenbroek 	ki->ki_type = SOCK_STREAM;
2701*ef8d499eSDavid van Moolenbroek 
2702*ef8d499eSDavid van Moolenbroek 	if ((unsigned int)pcb->state < __arraycount(tcpsock_statemap)) {
2703*ef8d499eSDavid van Moolenbroek 		ki->ki_tstate = tcpsock_statemap[pcb->state].tsm_tstate;
2704*ef8d499eSDavid van Moolenbroek 		/* TODO: this needs work, but does anything rely on it? */
2705*ef8d499eSDavid van Moolenbroek 		ki->ki_sostate |= tcpsock_statemap[pcb->state].tsm_sostate;
2706*ef8d499eSDavid van Moolenbroek 	}
2707*ef8d499eSDavid van Moolenbroek 
2708*ef8d499eSDavid van Moolenbroek 	/* Careful with the LISTEN state here (see below). */
2709*ef8d499eSDavid van Moolenbroek 	ipsock_get_info(ki, &pcb->local_ip, pcb->local_port,
2710*ef8d499eSDavid van Moolenbroek 	    &pcb->remote_ip, (pcb->state != LISTEN) ? pcb->remote_port : 0);
2711*ef8d499eSDavid van Moolenbroek 
2712*ef8d499eSDavid van Moolenbroek 	/*
2713*ef8d499eSDavid van Moolenbroek 	 * The PCBs for listening sockets are actually smaller.  Thus, for
2714*ef8d499eSDavid van Moolenbroek 	 * listening sockets, do not attempt to access any of the fields beyond
2715*ef8d499eSDavid van Moolenbroek 	 * those provided in the smaller structure.
2716*ef8d499eSDavid van Moolenbroek 	 */
2717*ef8d499eSDavid van Moolenbroek 	if (pcb->state == LISTEN) {
2718*ef8d499eSDavid van Moolenbroek 		assert(tcp != NULL);
2719*ef8d499eSDavid van Moolenbroek 		ki->ki_refs =
2720*ef8d499eSDavid van Moolenbroek 		    (uint64_t)(uintptr_t)TAILQ_FIRST(&tcp->tcp_queue.tq_head);
2721*ef8d499eSDavid van Moolenbroek 	} else {
2722*ef8d499eSDavid van Moolenbroek 		if (tcp_nagle_disabled(pcb))
2723*ef8d499eSDavid van Moolenbroek 			ki->ki_tflags |= NETBSD_TF_NODELAY;
2724*ef8d499eSDavid van Moolenbroek 
2725*ef8d499eSDavid van Moolenbroek 		if (tcp != NULL) {
2726*ef8d499eSDavid van Moolenbroek 			ki->ki_rcvq = tcp->tcp_rcv.tr_len;
2727*ef8d499eSDavid van Moolenbroek 			ki->ki_sndq = tcp->tcp_snd.ts_len;
2728*ef8d499eSDavid van Moolenbroek 
2729*ef8d499eSDavid van Moolenbroek 			if (tcp->tcp_listener != NULL)
2730*ef8d499eSDavid van Moolenbroek 				ki->ki_nextref = (uint64_t)(uintptr_t)
2731*ef8d499eSDavid van Moolenbroek 				    TAILQ_NEXT(tcp, tcp_queue.tq_next);
2732*ef8d499eSDavid van Moolenbroek 		}
2733*ef8d499eSDavid van Moolenbroek 	}
2734*ef8d499eSDavid van Moolenbroek }
2735*ef8d499eSDavid van Moolenbroek 
2736*ef8d499eSDavid van Moolenbroek /*
2737*ef8d499eSDavid van Moolenbroek  * Given either NULL or a previously returned TCP PCB pointer, return the first
2738*ef8d499eSDavid van Moolenbroek  * or next TCP PCB pointer, or NULL if there are no more.  The current
2739*ef8d499eSDavid van Moolenbroek  * implementation supports only one concurrent iteration at once.
2740*ef8d499eSDavid van Moolenbroek  */
2741*ef8d499eSDavid van Moolenbroek static const void *
tcpsock_enum(const void * last)2742*ef8d499eSDavid van Moolenbroek tcpsock_enum(const void * last)
2743*ef8d499eSDavid van Moolenbroek {
2744*ef8d499eSDavid van Moolenbroek 	static struct {
2745*ef8d499eSDavid van Moolenbroek 		unsigned int i;
2746*ef8d499eSDavid van Moolenbroek 		const struct tcp_pcb *pcb;
2747*ef8d499eSDavid van Moolenbroek 	} iter;
2748*ef8d499eSDavid van Moolenbroek 
2749*ef8d499eSDavid van Moolenbroek 	if (last != NULL && (iter.pcb = iter.pcb->next) != NULL)
2750*ef8d499eSDavid van Moolenbroek 		return (const void *)iter.pcb;
2751*ef8d499eSDavid van Moolenbroek 
2752*ef8d499eSDavid van Moolenbroek 	for (iter.i = (last != NULL) ? iter.i + 1 : 0;
2753*ef8d499eSDavid van Moolenbroek 	    iter.i < __arraycount(tcp_pcb_lists); iter.i++) {
2754*ef8d499eSDavid van Moolenbroek 		if ((iter.pcb = *tcp_pcb_lists[iter.i]) != NULL)
2755*ef8d499eSDavid van Moolenbroek 			return (const void *)iter.pcb;
2756*ef8d499eSDavid van Moolenbroek 	}
2757*ef8d499eSDavid van Moolenbroek 
2758*ef8d499eSDavid van Moolenbroek 	return NULL;
2759*ef8d499eSDavid van Moolenbroek }
2760*ef8d499eSDavid van Moolenbroek 
/*
 * Obtain the list of TCP protocol control blocks, for sysctl(7).  Defer to
 * the shared PCB-list helper, providing it with our TCP-specific enumeration
 * and information-retrieval callbacks.
 */
static ssize_t
tcpsock_pcblist(struct rmib_call * call, struct rmib_node * node __unused,
	struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
{

	return util_pcblist(call, oldp, tcpsock_enum, tcpsock_get_info);
}
2771*ef8d499eSDavid van Moolenbroek 
/* The table of libsockevent operation callbacks for TCP sockets. */
static const struct sockevent_ops tcpsock_ops = {
	.sop_bind		= tcpsock_bind,
	.sop_listen		= tcpsock_listen,
	.sop_connect		= tcpsock_connect,
	.sop_accept		= tcpsock_accept,
	.sop_test_accept	= tcpsock_test_accept,
	.sop_pre_send		= tcpsock_pre_send,
	.sop_send		= tcpsock_send,
	.sop_test_send		= tcpsock_test_send,
	.sop_pre_recv		= tcpsock_pre_recv,
	.sop_recv		= tcpsock_recv,
	.sop_test_recv		= tcpsock_test_recv,
	.sop_ioctl		= ifconf_ioctl,	/* interface config is generic */
	.sop_setsockmask	= tcpsock_setsockmask,
	.sop_setsockopt		= tcpsock_setsockopt,
	.sop_getsockopt		= tcpsock_getsockopt,
	.sop_getsockname	= tcpsock_getsockname,
	.sop_getpeername	= tcpsock_getpeername,
	.sop_shutdown		= tcpsock_shutdown,
	.sop_close		= tcpsock_close,
	.sop_free		= tcpsock_free
};
2794