/* LWIP service - tcpsock.c - TCP sockets */
/*
 * This module implements support for TCP sockets based on lwIP's core TCP PCB
 * module, which largely but not fully matches what we want to achieve, with
 * the result that this module is rather complicated.
 *
 * Each socket has a send queue and a receive queue. Both use lwIP's own
 * (pbuf) buffers, which largely come out of the main 512-byte buffer pool.
 * The buffers on the send queue are allocated and freed by us--the latter only
 * once they are no longer in use by lwIP as well. A bit counterintuitively,
 * we deliberately configure a small per-PCB TCP send buffer limit
 * (TCP_SND_BUF) in the lwIP configuration (lwipopts.h) in order to more
 * easily trigger conditions where we cannot enqueue data (or the final FIN)
 * right away. This way, we get to test the internal logic of this module a
 * lot more easily. The small lwIP send queue size should not have any impact
 * on performance, as our own per-socket send queues can be much larger and we
 * enqueue more data on the lwIP PCB as soon as we can in all cases.
 *
 * The receive queue consists of whatever buffers were given to us by lwIP, but
 * since those may be many buffers with small amounts of data each, we perform
 * fairly aggressive merging of consecutive buffers. The intended result is
 * that we waste no more than 50% of memory within the receive queue. Merging
 * requires memory copies, which makes it expensive, but we do not configure
 * lwIP with enough buffers to make running out of buffers a non-issue, so this
 * trade-off is necessary. Practical experience and measurements of the merge
 * policy will have to show whether and how the current policy may be improved.
 *
 * As can be expected, the connection close semantics are by far the most
 * complicated part of this module. We attempt to get rid of the lwIP PCB as
 * soon as we can, letting lwIP take care of the TIME_WAIT state for example.
 * However, there are various conditions that have to be met before we can
 * forget about the PCB here--most importantly, that none of our sent data
 * blocks are still referenced by lwIP because they have not yet been sent or
 * acknowledged. We can only free the data blocks once lwIP is done with them.
 *
 * We do consider the TCP state of lwIP's PCB, in order to avoid duplicating
 * full state tracking here. However, we do not look at a socket's TCP state
 * while in a lwIP-generated event for that socket, because the state may not
 * necessarily reflect the (correct or new) TCP state of the connection, nor
 * may the PCB be available--this is the case for error events. For these
 * reasons we use a few internal TCPF_ flags to perform partial state tracking.
 *
 * More generally, we tend to access lwIP PCB fields directly only when lwIP's
 * own BSD API implementation does that too and there is no better alternative.
 * One example of this is the check to see if our FIN was acknowledged, for
 * SO_LINGER support. In terms of maintenance, our hope is that if lwIP's API
 * changes later, we can change our code to imitate whatever lwIP's BSD API
 * implementation does at that point.
 */

#include <sys/socketvar.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>

/*
 * Unfortunately, NetBSD and lwIP have different definitions of a few relevant
 * preprocessor variables. Make sure we do not attempt to use the NetBSD one
 * where it matters. We do need one of the NetBSD definitions though.
 */
static const unsigned int NETBSD_TF_NODELAY = TF_NODELAY;
#undef TF_NODELAY
#undef TCP_MSS

#include "lwip.h"
#include "tcpisn.h"

#include "lwip/tcp.h"
#include "lwip/priv/tcp_priv.h"	/* for tcp_pcb_lists */

/*
 * The number of TCP sockets (NR_TCPSOCK) is defined in the lwIP configuration.
 */

/*
 * We fully control the send buffer, so we can let its size be set to whatever
 * we want. The receive buffer is different: if it is smaller than the window
 * size, we may have to refuse data that lwIP hands us, at which point more
 * incoming data will cause lwIP to abort the TCP connection--even aside from
 * performance issues. Therefore, we must make sure the receive buffer is
 * larger than the TCP window at all times.
 */
#define TCP_SNDBUF_MIN	1		/* minimum TCP send buffer size */
#define TCP_SNDBUF_DEF	32768		/* default TCP send buffer size */
#define TCP_SNDBUF_MAX	131072		/* maximum TCP send buffer size */
#define TCP_RCVBUF_MIN	TCP_WND		/* minimum TCP receive buffer size */
#define TCP_RCVBUF_DEF	MAX(TCP_WND, 32768)	/* default TCP recv buffer size */
#define TCP_RCVBUF_MAX	MAX(TCP_WND, 131072)	/* maximum TCP recv buffer size */

/*
 * The total number of buffers that may be in use for TCP socket send queues.
 * The goal is to allow at least some progress to be made on receiving from
 * TCP sockets and on differently-typed sockets, at least as long as the LWIP
 * service can manage to allocate the memory it wants. For the case that it
 * does not, we can only reactively kill off TCP sockets and/or free enqueued
 * ethernet packets, neither of which is currently implemented (TODO).
 */
#define TCP_MAX_SENDBUFS	(mempool_max_buffers() * 3 / 4)
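
/*
 * A worked example of the limit above, with hypothetical numbers: if
 * mempool_max_buffers() reports a pool of 1024 buffers, then at most
 * 1024 * 3 / 4 = 768 of them may be tied up in TCP send queues at once,
 * leaving at least a quarter of the pool for receive queues and for
 * sockets of other types.
 */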

/* Polling intervals, in 500-millisecond units. */
#define TCP_POLL_REG_INTERVAL	10	/* interval for reattempting sends */
#define TCP_POLL_CLOSE_INTERVAL	1	/* interval while closing connection */

static struct tcpsock {
	struct ipsock tcp_ipsock;		/* IP socket, MUST be first */
	struct tcp_pcb *tcp_pcb;		/* lwIP TCP control block */
	union pxfer_tcp_queue {			/* free/accept queue */
		TAILQ_ENTRY(tcpsock) tq_next;	/* next in queue */
		TAILQ_HEAD(, tcpsock) tq_head;	/* head of queue */
	} tcp_queue;
	struct tcpsock *tcp_listener;		/* listener if on accept q. */
	struct {				/* send queue */
		struct pbuf *ts_head;		/* first pbuf w/unacked data */
		struct pbuf *ts_unsent;		/* first pbuf w/unsent data */
		struct pbuf *ts_tail;		/* most recently added data */
		size_t ts_len;			/* total sent + unsent */
		unsigned short ts_head_off;	/* offset into head pbuf */
		unsigned short ts_unsent_off;	/* offset into unsent pbuf */
	} tcp_snd;
	struct {				/* receive queue */
		struct pbuf *tr_head;		/* first pbuf w/unrecvd data */
		struct pbuf **tr_pre_tailp;	/* ptr-ptr to newest pbuf */
		size_t tr_len;			/* bytes on receive queue */
		unsigned short tr_head_off;	/* offset into head pbuf */
		unsigned short tr_unacked;	/* current window reduction */
	} tcp_rcv;
} tcp_array[NR_TCPSOCK];
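
/*
 * A sketch of the send queue layout, for a hypothetical chain of three
 * pbufs with some data already acknowledged and some not yet handed to
 * lwIP:
 *
 *	ts_head (+ ts_head_off)      first byte not yet acknowledged
 *	ts_unsent (+ ts_unsent_off)  first byte not yet given to lwIP
 *	ts_tail                      last pbuf, may still have free space
 *
 *	[A: acked|unacked] -> [B: unacked|unsent] -> [C: unsent|free] -> NULL
 *
 * Everything from (ts_head, ts_head_off) up to (ts_unsent, ts_unsent_off)
 * has been enqueued on the PCB but not yet acknowledged; the remainder has
 * not yet been enqueued. ts_len counts both parts together.
 */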

static TAILQ_HEAD(, tcpsock) tcp_freelist;	/* list of free TCP sockets */

static const struct sockevent_ops tcpsock_ops;

static unsigned int tcpsock_sendbufs;		/* # send buffers in use */
static unsigned int tcpsock_recvbufs;		/* # receive buffers in use */

/* A bunch of macros that are just for convenience. */
#define tcpsock_get_id(tcp)	(SOCKID_TCP | (sockid_t)((tcp) - tcp_array))
#define tcpsock_get_ipsock(tcp)	(&(tcp)->tcp_ipsock)
#define tcpsock_get_sock(tcp)	(ipsock_get_sock(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_sndbuf(tcp)	(ipsock_get_sndbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_rcvbuf(tcp)	(ipsock_get_rcvbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_ipv6(tcp)	(ipsock_is_ipv6(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_shutdown(tcp,fl)	\
	(sockevent_is_shutdown(tcpsock_get_sock(tcp), fl))
#define tcpsock_is_listening(tcp)	\
	(sockevent_is_listening(tcpsock_get_sock(tcp)))
#define tcpsock_get_flags(tcp)	(ipsock_get_flags(tcpsock_get_ipsock(tcp)))
#define tcpsock_set_flag(tcp,fl)	\
	(ipsock_set_flag(tcpsock_get_ipsock(tcp), fl))
#define tcpsock_clear_flag(tcp,fl)	\
	(ipsock_clear_flag(tcpsock_get_ipsock(tcp), fl))
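
/*
 * For illustration: for the socket in array slot 3, tcpsock_get_id(tcp)
 * evaluates to SOCKID_TCP | 3, since subtracting tcp_array from the socket
 * pointer yields the slot index.
 */
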
static ssize_t tcpsock_pcblist(struct rmib_call *, struct rmib_node *,
	struct rmib_oldp *, struct rmib_newp *);

/* The CTL_NET {PF_INET,PF_INET6} IPPROTO_TCP subtree. */
/* TODO: add many more and make some of them writable.. */
static struct rmib_node net_inet_tcp_table[] = {
/* 2*/	[TCPCTL_SENDSPACE]	= RMIB_INT(RMIB_RO, TCP_SNDBUF_DEF,
				    "sendspace",
				    "Default TCP send buffer size"),
/* 3*/	[TCPCTL_RECVSPACE]	= RMIB_INT(RMIB_RO, TCP_RCVBUF_DEF,
				    "recvspace",
				    "Default TCP receive buffer size"),
/*29*/	[TCPCTL_LOOPBACKCKSUM]	= RMIB_FUNC(RMIB_RW | CTLTYPE_INT, sizeof(int),
				    loopif_cksum, "do_loopback_cksum",
				    "Perform TCP checksum on loopback"),
/*+0*/	[TCPCTL_MAXID]		= RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0,
				    tcpsock_pcblist, "pcblist",
				    "TCP protocol control block list"),
/*+1*/	[TCPCTL_MAXID + 1]	= RMIB_FUNC(RMIB_RW | CTLFLAG_PRIVATE |
				    CTLFLAG_HIDDEN | CTLTYPE_STRING,
				    TCPISN_SECRET_HEX_LENGTH, tcpisn_secret,
				    "isn_secret",
				    "TCP ISN secret (MINIX 3 specific)")
};

static struct rmib_node net_inet_tcp_node =
    RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp", "TCP related settings");
static struct rmib_node net_inet6_tcp6_node =
    RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp6", "TCP related settings");

/*
 * Initialize the TCP sockets module.
 */
void
tcpsock_init(void)
{
	unsigned int slot;

	/* Initialize the list of free TCP sockets. */
	TAILQ_INIT(&tcp_freelist);

	for (slot = 0; slot < __arraycount(tcp_array); slot++)
		TAILQ_INSERT_TAIL(&tcp_freelist, &tcp_array[slot],
		    tcp_queue.tq_next);

	/* Initialize other variables. */
	tcpsock_sendbufs = 0;

	/* Register the net.inet.tcp and net.inet6.tcp6 RMIB subtrees. */
	mibtree_register_inet(PF_INET, IPPROTO_TCP, &net_inet_tcp_node);
	mibtree_register_inet(PF_INET6, IPPROTO_TCP, &net_inet6_tcp6_node);
}

/*
 * Initialize the state of a TCP socket's send queue.
 */
static void
tcpsock_reset_send(struct tcpsock * tcp)
{

	tcp->tcp_snd.ts_tail = NULL;
	tcp->tcp_snd.ts_unsent = NULL;
	tcp->tcp_snd.ts_head = NULL;
	tcp->tcp_snd.ts_len = 0;
	tcp->tcp_snd.ts_unsent_off = 0;
	tcp->tcp_snd.ts_head_off = 0;
}

/*
 * Initialize the state of a TCP socket's receive queue.
 */
static void
tcpsock_reset_recv(struct tcpsock * tcp)
{

	tcp->tcp_rcv.tr_pre_tailp = NULL;
	tcp->tcp_rcv.tr_head = NULL;
	tcp->tcp_rcv.tr_len = 0;
	tcp->tcp_rcv.tr_head_off = 0;
	tcp->tcp_rcv.tr_unacked = 0;
}

/*
 * Create a TCP socket.
 */
sockid_t
tcpsock_socket(int domain, int protocol, struct sock ** sockp,
	const struct sockevent_ops ** ops)
{
	struct tcpsock *tcp;
	uint8_t ip_type;

	switch (protocol) {
	case 0:
	case IPPROTO_TCP:
		break;

	default:
		return EPROTONOSUPPORT;
	}

	if (TAILQ_EMPTY(&tcp_freelist))
		return ENOBUFS;

	tcp = TAILQ_FIRST(&tcp_freelist);

	/*
	 * Initialize the structure. Do not memset it to zero, as it is still
	 * part of the linked free list. Initialization may still fail. When
	 * adding new fields, make sure to change tcpsock_clone() accordingly.
	 */

	ip_type = ipsock_socket(tcpsock_get_ipsock(tcp), domain,
	    TCP_SNDBUF_DEF, TCP_RCVBUF_DEF, sockp);

	if ((tcp->tcp_pcb = tcp_new_ip_type(ip_type)) == NULL)
		return ENOBUFS;
	tcp_arg(tcp->tcp_pcb, tcp);

	tcp->tcp_listener = NULL;

	tcpsock_reset_send(tcp);
	tcpsock_reset_recv(tcp);

	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);

	*ops = &tcpsock_ops;
	return tcpsock_get_id(tcp);
}

/*
 * Create a TCP socket for the TCP PCB 'pcb' which identifies a new connection
 * incoming on listening socket 'listener'. The new socket is essentially a
 * "clone" of the listening TCP socket, in that it should inherit any settings
 * from the listening socket. The socket has not yet been accepted by
 * userland, so add it to the queue of connections pending for the listening
 * socket. On success, return OK. On failure, return a negative error code.
 */
static int
tcpsock_clone(struct tcpsock * listener, struct tcp_pcb * pcb)
{
	struct tcpsock *tcp;

	if (TAILQ_EMPTY(&tcp_freelist))
		return ENOBUFS;

	tcp = TAILQ_FIRST(&tcp_freelist);

	/*
	 * Initialize the structure. Do not memset it to zero, as it is still
	 * part of the linked free list. Initialization may still fail. Most
	 * settings should be inherited from the listening socket here, rather
	 * than being initialized to their default state.
	 */

	ipsock_clone(tcpsock_get_ipsock(listener), tcpsock_get_ipsock(tcp),
	    tcpsock_get_id(tcp));

	tcp->tcp_pcb = pcb;
	tcp_arg(pcb, tcp);

	tcpsock_reset_send(tcp);
	tcpsock_reset_recv(tcp);

	/*
	 * Remove the new socket from the free list, and add it to the queue
	 * of the listening socket--in this order, because the same next
	 * pointer is used for both.
	 */
	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);

	TAILQ_INSERT_TAIL(&listener->tcp_queue.tq_head, tcp,
	    tcp_queue.tq_next);
	tcp->tcp_listener = listener;

	return OK;
}

/*
 * Allocate a buffer from the pool, using the standard pool size. The
 * returned buffer is a single element--never a chain.
 */
static struct pbuf *
tcpsock_alloc_buf(void)
{
	struct pbuf *pbuf;

	pbuf = pbuf_alloc(PBUF_RAW, MEMPOOL_BUFSIZE, PBUF_RAM);

	assert(pbuf == NULL || pbuf->len == pbuf->tot_len);

	return pbuf;
}

/*
 * Free the given buffer. Ensure that pbuf_free() will not attempt to free
 * the next buffer(s) in the chain as well. This may be called for pbufs
 * other than those allocated with tcpsock_alloc_buf().
 */
static void
tcpsock_free_buf(struct pbuf * pbuf)
{

	/*
	 * Resetting the length is currently not necessary, but better safe
	 * than sorry.
	 */
	pbuf->len = pbuf->tot_len;
	pbuf->next = NULL;

	pbuf_free(pbuf);
}

/*
 * Clear the send queue of a TCP socket. The caller must ensure that lwIP
 * will no longer access any of the data on the send queue.
 */
static void
tcpsock_clear_send(struct tcpsock * tcp)
{
	struct pbuf *phead;

	assert(tcp->tcp_pcb == NULL);

	while ((phead = tcp->tcp_snd.ts_head) != NULL) {
		tcp->tcp_snd.ts_head = phead->next;

		assert(tcpsock_sendbufs > 0);
		tcpsock_sendbufs--;

		tcpsock_free_buf(phead);
	}

	tcpsock_reset_send(tcp);
}

/*
 * Clear the receive queue of a TCP socket. If 'ack_data' is set, also
 * acknowledge the previous contents of the receive queue to lwIP.
 */
static size_t
tcpsock_clear_recv(struct tcpsock * tcp, int ack_data)
{
	struct pbuf *phead;
	size_t rlen;

	rlen = tcp->tcp_rcv.tr_len;

	while ((phead = tcp->tcp_rcv.tr_head) != NULL) {
		tcp->tcp_rcv.tr_head = phead->next;

		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(phead);
	}

	/*
	 * From now on, we will basically be discarding incoming data as fast
	 * as possible, to keep the full window open at all times.
	 */
	if (ack_data && tcp->tcp_pcb != NULL && tcp->tcp_rcv.tr_unacked > 0)
		tcp_recved(tcp->tcp_pcb, tcp->tcp_rcv.tr_unacked);

	tcpsock_reset_recv(tcp);

	return rlen;
}
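
/*
 * A worked example of the acknowledgement bookkeeping above, with
 * hypothetical numbers: with TCP_WND at 32768 and tr_unacked at 4096, the
 * tcp_recved() call re-extends the window offered to the peer by 4096
 * bytes, so that the full 32768-byte window is open again once the queue
 * has been emptied.
 */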

/*
 * The TCP socket's PCB has been detached from the socket, typically because
 * the connection was aborted, either by us or by lwIP. Either way, any TCP
 * connection is gone. Clear the socket's send queue, remove the socket from
 * a listening socket's queue, and if the socket itself is ready and allowed
 * to be freed, free it now. The socket is ready to be freed if it was either
 * on a listening queue or being closed already. The socket is allowed to be
 * freed only if 'may_free' is TRUE. If the socket is not freed, its receive
 * queue is left as is, as it may still have data to be received by userland.
 */
static int
tcpsock_cleanup(struct tcpsock * tcp, int may_free)
{
	int destroy;

	assert(tcp->tcp_pcb == NULL);

	/*
	 * Free any data on the send queue. This is safe to do right now,
	 * because the PCB has been aborted (or was already gone). We must be
	 * very careful about clearing the send queue in all other situations.
	 */
	tcpsock_clear_send(tcp);

	/*
	 * If this was a socket pending acceptance, remove it from the
	 * corresponding listener socket's queue, and free it. Otherwise,
	 * free the socket only if it suspended a graceful close operation.
	 */
	if (tcp->tcp_listener != NULL) {
		TAILQ_REMOVE(&tcp->tcp_listener->tcp_queue.tq_head, tcp,
		    tcp_queue.tq_next);
		tcp->tcp_listener = NULL;

		/*
		 * The listener socket's backlog count should be adjusted by
		 * lwIP whenever the PCB is freed up, so we need not (and must
		 * not) attempt to do that here.
		 */

		destroy = TRUE;
	} else
		destroy = sockevent_is_closing(tcpsock_get_sock(tcp));

	/*
	 * Do not free the socket if 'may_free' is FALSE. That flag may be
	 * set if we are currently in the second tcpsock_close() call on the
	 * socket, in which case sockevent_is_closing() is TRUE but we must
	 * still not free the socket now: doing so would derail libsockevent.
	 */
	if (destroy && may_free) {
		(void)tcpsock_clear_recv(tcp, FALSE /*ack_data*/);

		sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE);
	}

	return destroy;
}

/*
 * Abort the lwIP PCB for the given socket, using tcp_abort(). If the PCB is
 * connected, this will cause the connection to be reset. The PCB, which must
 * have still been present before the call, will be gone after the call.
 */
static void
tcpsock_pcb_abort(struct tcpsock * tcp)
{

	assert(tcp->tcp_pcb != NULL);
	assert(!tcpsock_is_listening(tcp));

	tcp_recv(tcp->tcp_pcb, NULL);
	tcp_sent(tcp->tcp_pcb, NULL);
	tcp_err(tcp->tcp_pcb, NULL);
	tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL);

	tcp_arg(tcp->tcp_pcb, NULL);

	tcp_abort(tcp->tcp_pcb);

	tcp->tcp_pcb = NULL;
}

/*
 * Close the lwIP PCB for the given socket, using tcp_close(). If the PCB is
 * connected, its graceful close will be finished by lwIP in the background.
 * The PCB, which must have still been present before the call, will be gone
 * after the call.
 */
static void
tcpsock_pcb_close(struct tcpsock * tcp)
{
	err_t err;

	assert(tcp->tcp_pcb != NULL);
	assert(tcp->tcp_snd.ts_len == 0);

	if (!tcpsock_is_listening(tcp)) {
		tcp_recv(tcp->tcp_pcb, NULL);
		tcp_sent(tcp->tcp_pcb, NULL);
		tcp_err(tcp->tcp_pcb, NULL);
		tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL);
	}

	tcp_arg(tcp->tcp_pcb, NULL);

	if ((err = tcp_close(tcp->tcp_pcb)) != ERR_OK)
		panic("unexpected TCP close failure: %d", err);

	tcp->tcp_pcb = NULL;
}

/*
 * Return TRUE if all conditions are met for closing the TCP socket's PCB, or
 * FALSE if they are not. Upon calling this function, the socket's PCB must
 * still be around.
 */
static int
tcpsock_may_close(struct tcpsock * tcp)
{

	assert(tcp->tcp_pcb != NULL);

	/*
	 * Regular closing of the PCB requires three conditions to be met:
	 *
	 * 1. all our data has been transmitted AND acknowledged, so that we
	 *    do not risk corruption in case there are still unsent or
	 *    unack'ed data buffers that may otherwise be recycled too soon;
	 * 2. we have sent our FIN to the peer; and,
	 * 3. we have received a FIN from the peer.
	 */
	return ((tcpsock_get_flags(tcp) & (TCPF_SENT_FIN | TCPF_RCVD_FIN)) ==
	    (TCPF_SENT_FIN | TCPF_RCVD_FIN) && tcp->tcp_snd.ts_len == 0);
}

/*
 * The given socket is ready to be closed as per the tcpsock_may_close()
 * rules. This implies that its send queue is already empty. Gracefully
 * close the PCB. In addition, if the socket is being closed gracefully,
 * meaning we suspended an earlier tcpsock_close() call (and as such already
 * emptied the receive queue as well), then tell libsockevent that the close
 * is finished, freeing the socket. Return TRUE if the socket has indeed
 * been freed this way, or FALSE if the socket is still around.
 */
static int
tcpsock_finish_close(struct tcpsock * tcp)
{

	assert(tcp->tcp_snd.ts_len == 0);
	assert(tcp->tcp_listener == NULL);

	/*
	 * If we get here, we have already shut down the sending side of the
	 * PCB. Technically, we are interested only in shutting down the
	 * receiving side of the PCB here, so that lwIP may decide to recycle
	 * the socket later, etcetera. We call tcp_close() because we do not
	 * want to rely on tcp_shutdown(RX) doing the exact same thing.
	 * However, we do rely on the fact that the PCB is not immediately
	 * destroyed by the tcp_close() call: otherwise we may have to return
	 * ERR_ABRT if this function is called from a lwIP-generated event.
	 */
	tcpsock_pcb_close(tcp);

	/*
	 * If we suspended an earlier tcpsock_close() call, we have to tell
	 * libsockevent that the close operation is now complete.
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp))) {
		assert(tcp->tcp_rcv.tr_len == 0);

		sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE);

		return TRUE;
	} else
		return FALSE;
}

/*
 * Attempt to start or resume enqueuing data and/or a FIN to send on the
 * given TCP socket. Return TRUE if anything at all could be newly enqueued
 * on the lwIP PCB, even if less than desired. In that case, the caller
 * should try to send whatever was enqueued, and if applicable, check if the
 * socket may now be closed (due to the FIN being enqueued). In particular,
 * in any situation where the socket may be in the process of being closed,
 * the caller must use tcpsock_may_close() if TRUE is returned. Return FALSE
 * if nothing new could be enqueued, in which case no send attempt needs to
 * be made either.
 */
static int
tcpsock_pcb_enqueue(struct tcpsock * tcp)
{
	struct pbuf *punsent;
	size_t space, chunk;
	unsigned int flags;
	err_t err;
	int enqueued;

	assert(tcp->tcp_pcb != NULL);

	if (tcpsock_get_flags(tcp) & TCPF_FULL)
		return FALSE;

	/*
	 * Attempt to enqueue more unsent data, if any, on the PCB's send
	 * queue.
	 */
	enqueued = FALSE;

	while (tcp->tcp_snd.ts_unsent != NULL) {
		if ((space = tcp_sndbuf(tcp->tcp_pcb)) == 0)
			break;

		/*
		 * We may maintain a non-NULL unsent pointer even when there
		 * is nothing more to send right now, because the tail buffer
		 * may be filled up further later on.
		 */
		punsent = tcp->tcp_snd.ts_unsent;

		assert(punsent->len >= tcp->tcp_snd.ts_unsent_off);

		chunk = (size_t)punsent->len - tcp->tcp_snd.ts_unsent_off;
		if (chunk == 0)
			break;

		if (chunk > space)
			chunk = space;

		/* Try to enqueue more data for sending. */
		if (chunk < punsent->len || punsent->next != NULL)
			flags = TCP_WRITE_FLAG_MORE;
		else
			flags = 0;

		err = tcp_write(tcp->tcp_pcb, (char *)punsent->payload +
		    tcp->tcp_snd.ts_unsent_off, chunk, flags);

		/*
		 * Since tcp_write() enqueues data only, it should only return
		 * out-of-memory errors; no fatal ones. In any case, stop.
		 */
		if (err != ERR_OK) {
			assert(err == ERR_MEM);

			break;
		}

		/* We have successfully enqueued data. */
		enqueued = TRUE;

		tcp->tcp_snd.ts_unsent_off += chunk;

		if (tcp->tcp_snd.ts_unsent_off < punsent->tot_len) {
			assert(tcp->tcp_snd.ts_unsent_off < punsent->len ||
			    punsent->next == NULL);

			break;
		}

		tcp->tcp_snd.ts_unsent = punsent->next;
		tcp->tcp_snd.ts_unsent_off = 0;
	}

	/*
	 * If all pending data has been enqueued for sending, and we should
	 * shut down the sending end of the socket, try that now.
	 */
	if ((tcp->tcp_snd.ts_unsent == NULL ||
	    tcp->tcp_snd.ts_unsent_off == tcp->tcp_snd.ts_unsent->len) &&
	    tcpsock_is_shutdown(tcp, SFL_SHUT_WR) &&
	    !(tcpsock_get_flags(tcp) & TCPF_SENT_FIN)) {
		err = tcp_shutdown(tcp->tcp_pcb, 0 /*shut_rx*/, 1 /*shut_tx*/);

		if (err == ERR_OK) {
			/*
			 * We have successfully enqueued a FIN. The caller is
			 * now responsible for checking whether the PCB and
			 * possibly even the socket object can now be freed.
			 */
			tcpsock_set_flag(tcp, TCPF_SENT_FIN);

			enqueued = TRUE;
		} else {
			assert(err == ERR_MEM);

			/*
			 * FIXME: the resolution for lwIP bug #47485 has taken
			 * away even more control over the closing process
			 * from us, making tracking sockets especially for
			 * SO_LINGER even harder. For now, we simply
			 * effectively undo the patch by clearing TF_CLOSEPEND
			 * if tcp_shutdown() returns ERR_MEM. This will not
			 * be sustainable in the long term, though.
			 */
			tcp->tcp_pcb->flags &= ~TF_CLOSEPEND;

			tcpsock_set_flag(tcp, TCPF_FULL);
		}
	}

	return enqueued;
}
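
/*
 * A minimal sketch of the intended caller pattern described above
 * (hypothetical; the real call sites differ in their error and close
 * handling):
 *
 *	if (tcpsock_pcb_enqueue(tcp)) {
 *		if (tcpsock_pcb_send(tcp, TRUE) == OK &&
 *		    tcpsock_may_close(tcp))
 *			(void)tcpsock_finish_close(tcp);
 *	}
 *
 * Note that tcpsock_pcb_send() may free the socket object on failure, so a
 * real caller must not touch 'tcp' after a failed send unless it knows the
 * socket survived.
 */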
727*ef8d499eSDavid van Moolenbroek
728*ef8d499eSDavid van Moolenbroek /*
729*ef8d499eSDavid van Moolenbroek * Request lwIP to start sending any enqueued data and/or FIN on the TCP
730*ef8d499eSDavid van Moolenbroek * socket's lwIP PCB. On success, return OK. On failure, return a negative
731*ef8d499eSDavid van Moolenbroek * error code, after cleaning up the socket, freeing the PCB. If the socket
732*ef8d499eSDavid van Moolenbroek * was already being closed, also free the socket object in that case; the
733*ef8d499eSDavid van Moolenbroek * caller must then not touch the socket object anymore upon return. If the
734*ef8d499eSDavid van Moolenbroek * socket object is not freed, and if 'raise_error' is TRUE, raise the error
735*ef8d499eSDavid van Moolenbroek * on the socket object.
736*ef8d499eSDavid van Moolenbroek */
737*ef8d499eSDavid van Moolenbroek static int
tcpsock_pcb_send(struct tcpsock * tcp,int raise_error)738*ef8d499eSDavid van Moolenbroek tcpsock_pcb_send(struct tcpsock * tcp, int raise_error)
739*ef8d499eSDavid van Moolenbroek {
740*ef8d499eSDavid van Moolenbroek err_t err;
741*ef8d499eSDavid van Moolenbroek int r;
742*ef8d499eSDavid van Moolenbroek
743*ef8d499eSDavid van Moolenbroek assert(tcp->tcp_pcb != NULL);
744*ef8d499eSDavid van Moolenbroek
745*ef8d499eSDavid van Moolenbroek /*
746*ef8d499eSDavid van Moolenbroek * If we have enqueued something, ask lwIP to send TCP packets now.
747*ef8d499eSDavid van Moolenbroek * This may result in a fatal error, in which case we clean up the
748*ef8d499eSDavid van Moolenbroek * socket and return the error to the caller. Since cleaning up the
749*ef8d499eSDavid van Moolenbroek * socket may free the socket object, and the caller cannot tell
750*ef8d499eSDavid van Moolenbroek * whether that will happen or has happened, also possibly raise the
751*ef8d499eSDavid van Moolenbroek * error on the socket object if it is not gone. As such, callers that
752*ef8d499eSDavid van Moolenbroek * set 'raise_error' to FALSE must know for sure that the socket was
753*ef8d499eSDavid van Moolenbroek * not being closed, for example because the caller is processing a
754*ef8d499eSDavid van Moolenbroek * (send) call from userland.
755*ef8d499eSDavid van Moolenbroek */
756*ef8d499eSDavid van Moolenbroek err = tcp_output(tcp->tcp_pcb);
757*ef8d499eSDavid van Moolenbroek
758*ef8d499eSDavid van Moolenbroek if (err != ERR_OK && err != ERR_MEM) {
759*ef8d499eSDavid van Moolenbroek tcpsock_pcb_abort(tcp);
760*ef8d499eSDavid van Moolenbroek
761*ef8d499eSDavid van Moolenbroek r = util_convert_err(err);
762*ef8d499eSDavid van Moolenbroek
763*ef8d499eSDavid van Moolenbroek if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) {
764*ef8d499eSDavid van Moolenbroek if (raise_error)
765*ef8d499eSDavid van Moolenbroek sockevent_set_error(tcpsock_get_sock(tcp), r);
766*ef8d499eSDavid van Moolenbroek }
767*ef8d499eSDavid van Moolenbroek /* Otherwise, do not touch the socket object anymore! */
768*ef8d499eSDavid van Moolenbroek
769*ef8d499eSDavid van Moolenbroek return r;
770*ef8d499eSDavid van Moolenbroek } else
771*ef8d499eSDavid van Moolenbroek return OK;
772*ef8d499eSDavid van Moolenbroek }
773*ef8d499eSDavid van Moolenbroek
774*ef8d499eSDavid van Moolenbroek /*
775*ef8d499eSDavid van Moolenbroek * Callback from lwIP. The given number of data bytes have been acknowledged
776*ef8d499eSDavid van Moolenbroek * as received by the remote end. Dequeue and free data from the TCP socket's
777*ef8d499eSDavid van Moolenbroek * send queue as appropriate.
778*ef8d499eSDavid van Moolenbroek */
779*ef8d499eSDavid van Moolenbroek static err_t
tcpsock_event_sent(void * arg,struct tcp_pcb * pcb __unused,uint16_t len)780*ef8d499eSDavid van Moolenbroek tcpsock_event_sent(void * arg, struct tcp_pcb * pcb __unused, uint16_t len)
781*ef8d499eSDavid van Moolenbroek {
782*ef8d499eSDavid van Moolenbroek struct tcpsock *tcp = (struct tcpsock *)arg;
783*ef8d499eSDavid van Moolenbroek struct pbuf *phead;
784*ef8d499eSDavid van Moolenbroek size_t left;
785*ef8d499eSDavid van Moolenbroek
786*ef8d499eSDavid van Moolenbroek assert(tcp != NULL);
787*ef8d499eSDavid van Moolenbroek assert(pcb == tcp->tcp_pcb);
788*ef8d499eSDavid van Moolenbroek assert(len > 0);
789*ef8d499eSDavid van Moolenbroek
790*ef8d499eSDavid van Moolenbroek assert(tcp->tcp_snd.ts_len >= len);
791*ef8d499eSDavid van Moolenbroek assert(tcp->tcp_snd.ts_head != NULL);
792*ef8d499eSDavid van Moolenbroek
793*ef8d499eSDavid van Moolenbroek left = len;
794*ef8d499eSDavid van Moolenbroek
795*ef8d499eSDavid van Moolenbroek /*
796*ef8d499eSDavid van Moolenbroek * First see if we can free up whole buffers. Check against the head
797*ef8d499eSDavid van Moolenbroek * buffer's 'len' rather than 'tot_len', or we may end up leaving an
798*ef8d499eSDavid van Moolenbroek * empty buffer on the chain.
799*ef8d499eSDavid van Moolenbroek */
800*ef8d499eSDavid van Moolenbroek while ((phead = tcp->tcp_snd.ts_head) != NULL &&
801*ef8d499eSDavid van Moolenbroek left >= (size_t)phead->len - tcp->tcp_snd.ts_head_off) {
802*ef8d499eSDavid van Moolenbroek left -= (size_t)phead->len - tcp->tcp_snd.ts_head_off;
803*ef8d499eSDavid van Moolenbroek
804*ef8d499eSDavid van Moolenbroek tcp->tcp_snd.ts_head = phead->next;
805*ef8d499eSDavid van Moolenbroek tcp->tcp_snd.ts_head_off = 0;
806*ef8d499eSDavid van Moolenbroek
807*ef8d499eSDavid van Moolenbroek if (phead == tcp->tcp_snd.ts_unsent) {
808*ef8d499eSDavid van Moolenbroek assert(tcp->tcp_snd.ts_unsent_off == phead->len);
809*ef8d499eSDavid van Moolenbroek
810*ef8d499eSDavid van Moolenbroek tcp->tcp_snd.ts_unsent = phead->next;
811*ef8d499eSDavid van Moolenbroek tcp->tcp_snd.ts_unsent_off = 0;
812*ef8d499eSDavid van Moolenbroek }
813*ef8d499eSDavid van Moolenbroek
814*ef8d499eSDavid van Moolenbroek assert(tcpsock_sendbufs > 0);
815*ef8d499eSDavid van Moolenbroek tcpsock_sendbufs--;
816*ef8d499eSDavid van Moolenbroek
817*ef8d499eSDavid van Moolenbroek tcpsock_free_buf(phead);
818*ef8d499eSDavid van Moolenbroek }
819*ef8d499eSDavid van Moolenbroek
820*ef8d499eSDavid van Moolenbroek /*
821*ef8d499eSDavid van Moolenbroek * The rest of the given length is for less than the current head
822*ef8d499eSDavid van Moolenbroek * buffer.
823*ef8d499eSDavid van Moolenbroek */
824*ef8d499eSDavid van Moolenbroek if (left > 0) {
825*ef8d499eSDavid van Moolenbroek assert(tcp->tcp_snd.ts_head != NULL);
826*ef8d499eSDavid van Moolenbroek assert((size_t)tcp->tcp_snd.ts_head->len -
827*ef8d499eSDavid van Moolenbroek tcp->tcp_snd.ts_head_off > left);
828*ef8d499eSDavid van Moolenbroek
829*ef8d499eSDavid van Moolenbroek tcp->tcp_snd.ts_head_off += left;
830*ef8d499eSDavid van Moolenbroek }
831*ef8d499eSDavid van Moolenbroek
832*ef8d499eSDavid van Moolenbroek tcp->tcp_snd.ts_len -= (size_t)len;
833*ef8d499eSDavid van Moolenbroek
834*ef8d499eSDavid van Moolenbroek if (tcp->tcp_snd.ts_head == NULL) {
835*ef8d499eSDavid van Moolenbroek assert(tcp->tcp_snd.ts_len == 0);
836*ef8d499eSDavid van Moolenbroek assert(tcp->tcp_snd.ts_unsent == NULL);
837*ef8d499eSDavid van Moolenbroek tcp->tcp_snd.ts_tail = NULL;
838*ef8d499eSDavid van Moolenbroek } else
839*ef8d499eSDavid van Moolenbroek assert(tcp->tcp_snd.ts_len > 0);
840*ef8d499eSDavid van Moolenbroek
841*ef8d499eSDavid van Moolenbroek /*
842*ef8d499eSDavid van Moolenbroek * If we emptied the send queue, and we already managed to send a FIN
843*ef8d499eSDavid van Moolenbroek * earlier, we may now have met all requirements to close the socket's
844*ef8d499eSDavid van Moolenbroek * PCB. Otherwise, we may also be able to send more now, so try to
845*ef8d499eSDavid van Moolenbroek * resume sending. Since we are invoked from the "sent" event,
846*ef8d499eSDavid van Moolenbroek * tcp_output() will not actually process anything, and so we do not
847*ef8d499eSDavid van Moolenbroek * call it either. If we did, we would have to deal with errors here.
848*ef8d499eSDavid van Moolenbroek */
849*ef8d499eSDavid van Moolenbroek if (tcpsock_may_close(tcp)) {
850*ef8d499eSDavid van Moolenbroek if (tcpsock_finish_close(tcp))
851*ef8d499eSDavid van Moolenbroek return ERR_OK;
852*ef8d499eSDavid van Moolenbroek } else {
853*ef8d499eSDavid van Moolenbroek tcpsock_clear_flag(tcp, TCPF_FULL);
854*ef8d499eSDavid van Moolenbroek
855*ef8d499eSDavid van Moolenbroek /*
856*ef8d499eSDavid van Moolenbroek * If we now manage to enqueue a FIN, we may be ready to close
857*ef8d499eSDavid van Moolenbroek * the PCB after all.
858*ef8d499eSDavid van Moolenbroek */
859*ef8d499eSDavid van Moolenbroek if (tcpsock_pcb_enqueue(tcp)) {
860*ef8d499eSDavid van Moolenbroek if (tcpsock_may_close(tcp) &&
861*ef8d499eSDavid van Moolenbroek tcpsock_finish_close(tcp))
862*ef8d499eSDavid van Moolenbroek return ERR_OK;
863*ef8d499eSDavid van Moolenbroek }
864*ef8d499eSDavid van Moolenbroek }
865*ef8d499eSDavid van Moolenbroek
866*ef8d499eSDavid van Moolenbroek /* The user may also be able to send more now. */
867*ef8d499eSDavid van Moolenbroek sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND);
868*ef8d499eSDavid van Moolenbroek
869*ef8d499eSDavid van Moolenbroek return ERR_OK;
870*ef8d499eSDavid van Moolenbroek }
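/*
 * An illustrative sketch of the acknowledgment bookkeeping above, shown in
 * isolation. The 'ex_' type and function are hypothetical stand-ins for the
 * pbuf-based send queue, and the block is excluded from compilation. Whole
 * buffers are freed first, comparing the remaining length against each
 * buffer's length minus the current head offset; any remainder merely
 * advances the offset into the new head buffer.
 */
#if 0
struct ex_buf {
	struct ex_buf *next;
	unsigned short len;		/* bytes of data in this buffer */
};

static void
ex_consume(struct ex_buf ** headp, size_t * offp, size_t len)
{
	struct ex_buf *head;

	/* Free whole buffers, as in the loop above. */
	while ((head = *headp) != NULL && len >= (size_t)head->len - *offp) {
		len -= (size_t)head->len - *offp;

		*headp = head->next;
		*offp = 0;

		/* ..the caller would free 'head' here.. */
	}

	/* Any remainder covers only part of the new head buffer. */
	if (len > 0)
		*offp += len;
}
#endif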
871*ef8d499eSDavid van Moolenbroek
872*ef8d499eSDavid van Moolenbroek /*
873*ef8d499eSDavid van Moolenbroek * Check whether any (additional) data previously received on a TCP socket
874*ef8d499eSDavid van Moolenbroek * should be acknowledged, possibly allowing the remote end to send additional
875*ef8d499eSDavid van Moolenbroek * data as a result.
876*ef8d499eSDavid van Moolenbroek */
877*ef8d499eSDavid van Moolenbroek static void
878*ef8d499eSDavid van Moolenbroek tcpsock_ack_recv(struct tcpsock * tcp)
879*ef8d499eSDavid van Moolenbroek {
880*ef8d499eSDavid van Moolenbroek size_t rcvbuf, left, delta, ack;
881*ef8d499eSDavid van Moolenbroek
882*ef8d499eSDavid van Moolenbroek assert(tcp->tcp_pcb != NULL);
883*ef8d499eSDavid van Moolenbroek
884*ef8d499eSDavid van Moolenbroek /*
885*ef8d499eSDavid van Moolenbroek * We must make sure that at all times, we can still add an entire
886*ef8d499eSDavid van Moolenbroek * window's worth of data to the receive queue. If the amount of free
887*ef8d499eSDavid van Moolenbroek * space drops below that threshold, we stop acknowledging received
888*ef8d499eSDavid van Moolenbroek * data. The user may change the receive buffer size at all times; we
889*ef8d499eSDavid van Moolenbroek * update the window size lazily as appropriate.
890*ef8d499eSDavid van Moolenbroek */
891*ef8d499eSDavid van Moolenbroek rcvbuf = tcpsock_get_rcvbuf(tcp);
892*ef8d499eSDavid van Moolenbroek
893*ef8d499eSDavid van Moolenbroek if (rcvbuf > tcp->tcp_rcv.tr_len && tcp->tcp_rcv.tr_unacked > 0) {
894*ef8d499eSDavid van Moolenbroek /*
895*ef8d499eSDavid van Moolenbroek * The number of bytes that lwIP can still give us at any time
896*ef8d499eSDavid van Moolenbroek * is represented as 'left'. The number of bytes that we still
897*ef8d499eSDavid van Moolenbroek * allow to be stored in the receive queue is represented as
898*ef8d499eSDavid van Moolenbroek * 'delta'. We must make sure that 'left' does not ever exceed
899*ef8d499eSDavid van Moolenbroek * 'delta' while acknowledging as many bytes as possible under
900*ef8d499eSDavid van Moolenbroek * that rule.
901*ef8d499eSDavid van Moolenbroek */
902*ef8d499eSDavid van Moolenbroek left = TCP_WND - tcp->tcp_rcv.tr_unacked;
903*ef8d499eSDavid van Moolenbroek delta = rcvbuf - tcp->tcp_rcv.tr_len;
904*ef8d499eSDavid van Moolenbroek
905*ef8d499eSDavid van Moolenbroek if (left < delta) {
906*ef8d499eSDavid van Moolenbroek ack = delta - left;
907*ef8d499eSDavid van Moolenbroek
908*ef8d499eSDavid van Moolenbroek if (ack > tcp->tcp_rcv.tr_unacked)
909*ef8d499eSDavid van Moolenbroek ack = tcp->tcp_rcv.tr_unacked;
910*ef8d499eSDavid van Moolenbroek
911*ef8d499eSDavid van Moolenbroek tcp_recved(tcp->tcp_pcb, ack);
912*ef8d499eSDavid van Moolenbroek
913*ef8d499eSDavid van Moolenbroek tcp->tcp_rcv.tr_unacked -= ack;
914*ef8d499eSDavid van Moolenbroek
915*ef8d499eSDavid van Moolenbroek assert(tcp->tcp_rcv.tr_len + TCP_WND -
916*ef8d499eSDavid van Moolenbroek tcp->tcp_rcv.tr_unacked <= rcvbuf);
917*ef8d499eSDavid van Moolenbroek }
918*ef8d499eSDavid van Moolenbroek }
919*ef8d499eSDavid van Moolenbroek }
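/*
 * A worked example of the clamping above, with assumed numbers: say that
 * TCP_WND is 8192, 'tr_unacked' is 6000, 'tr_len' is 3000, and the receive
 * buffer size is 16384. Then left = 8192 - 6000 = 2192 and
 * delta = 16384 - 3000 = 13384. Since left < delta, we acknowledge
 * min(13384 - 2192, 6000) = 6000 bytes, that is, everything pending. With a
 * receive buffer of only 4096, however, delta = 1096 < left, and nothing is
 * acknowledged until the user consumes data from the receive queue.
 */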
920*ef8d499eSDavid van Moolenbroek
921*ef8d499eSDavid van Moolenbroek /*
922*ef8d499eSDavid van Moolenbroek * Attempt to merge two consecutive underfilled buffers in the receive queue of
923*ef8d499eSDavid van Moolenbroek * a TCP socket, freeing up one of the two buffers as a result. The first
924*ef8d499eSDavid van Moolenbroek * (oldest) buffer is 'ptail', and the pointer to this buffer is stored at
925*ef8d499eSDavid van Moolenbroek * 'pnext'. The second (new) buffer is 'pbuf', which is already attached to
926*ef8d499eSDavid van Moolenbroek * the first buffer. The second buffer may be followed by additional buffers
927*ef8d499eSDavid van Moolenbroek * with even more new data. Return TRUE if buffers have been merged, in which
928*ef8d499eSDavid van Moolenbroek * case the pointer at 'pnext' may have changed, and no assumptions should be
929*ef8d499eSDavid van Moolenbroek * made about whether 'ptail' and 'pbuf' still exist in any form. Return FALSE
930*ef8d499eSDavid van Moolenbroek * if no merging was necessary or if no new buffer could be allocated.
931*ef8d499eSDavid van Moolenbroek */
932*ef8d499eSDavid van Moolenbroek static int
933*ef8d499eSDavid van Moolenbroek tcpsock_try_merge(struct pbuf **pnext, struct pbuf * ptail, struct pbuf * pbuf)
934*ef8d499eSDavid van Moolenbroek {
935*ef8d499eSDavid van Moolenbroek struct pbuf *pnew;
936*ef8d499eSDavid van Moolenbroek
937*ef8d499eSDavid van Moolenbroek assert(*pnext == ptail);
938*ef8d499eSDavid van Moolenbroek assert(ptail->next == pbuf);
939*ef8d499eSDavid van Moolenbroek
940*ef8d499eSDavid van Moolenbroek /*
941*ef8d499eSDavid van Moolenbroek * Unfortunately, we cannot figure out what kind of pbuf we were given
942*ef8d499eSDavid van Moolenbroek * by the lower layers, so we cannot merge two buffers without first
943*ef8d499eSDavid van Moolenbroek * allocating a third. Once we have done that, though, we can easily
944*ef8d499eSDavid van Moolenbroek * merge more into that new buffer. For now we use the following
945*ef8d499eSDavid van Moolenbroek * policies:
946*ef8d499eSDavid van Moolenbroek *
947*ef8d499eSDavid van Moolenbroek * 1. if two consecutive lwIP-provided buffers are both used less than
948*ef8d499eSDavid van Moolenbroek * half the size of a full buffer, try to allocate a new buffer and
949*ef8d499eSDavid van Moolenbroek * copy both lwIP-provided buffers into that new buffer, freeing up
950*ef8d499eSDavid van Moolenbroek * the pair afterwards;
951*ef8d499eSDavid van Moolenbroek * 2. if the tail buffer on the chain is allocated by us and not yet
952*ef8d499eSDavid van Moolenbroek * full, and the next buffer's contents can be added to the tail
953*ef8d499eSDavid van Moolenbroek * buffer in their entirety, do just that.
954*ef8d499eSDavid van Moolenbroek *
955*ef8d499eSDavid van Moolenbroek * Obviously there is a trade-off between the performance overhead of
956*ef8d499eSDavid van Moolenbroek * copying and the resource overhead of keeping less-than-full buffers
957*ef8d499eSDavid van Moolenbroek * on the receive queue, but this policy should both keep actual memory
958*ef8d499eSDavid van Moolenbroek * usage to no more than twice the receive queue length and prevent
959*ef8d499eSDavid van Moolenbroek * excessive copying. The policy deliberately performs more aggressive
960*ef8d499eSDavid van Moolenbroek * merging into a buffer that we allocated ourselves.
961*ef8d499eSDavid van Moolenbroek */
962*ef8d499eSDavid van Moolenbroek if (ptail->tot_len <= MEMPOOL_BUFSIZE / 2 &&
963*ef8d499eSDavid van Moolenbroek pbuf->len <= MEMPOOL_BUFSIZE / 2) {
964*ef8d499eSDavid van Moolenbroek /*
965*ef8d499eSDavid van Moolenbroek * Case #1.
966*ef8d499eSDavid van Moolenbroek */
967*ef8d499eSDavid van Moolenbroek assert(ptail->tot_len == ptail->len);
968*ef8d499eSDavid van Moolenbroek assert(pbuf->tot_len == pbuf->len);
969*ef8d499eSDavid van Moolenbroek
970*ef8d499eSDavid van Moolenbroek pnew = tcpsock_alloc_buf();
971*ef8d499eSDavid van Moolenbroek if (pnew == NULL)
972*ef8d499eSDavid van Moolenbroek return FALSE;
973*ef8d499eSDavid van Moolenbroek
974*ef8d499eSDavid van Moolenbroek memcpy(pnew->payload, ptail->payload, ptail->len);
975*ef8d499eSDavid van Moolenbroek memcpy((char *)pnew->payload + ptail->len, pbuf->payload,
976*ef8d499eSDavid van Moolenbroek pbuf->len);
977*ef8d499eSDavid van Moolenbroek pnew->len = ptail->len + pbuf->len;
978*ef8d499eSDavid van Moolenbroek assert(pnew->len <= pnew->tot_len);
979*ef8d499eSDavid van Moolenbroek
980*ef8d499eSDavid van Moolenbroek pnew->next = pbuf->next;
981*ef8d499eSDavid van Moolenbroek /* For now, we need not inherit any flags from either pbuf. */
982*ef8d499eSDavid van Moolenbroek
983*ef8d499eSDavid van Moolenbroek *pnext = pnew;
984*ef8d499eSDavid van Moolenbroek
985*ef8d499eSDavid van Moolenbroek /* One allocated, two about to be deallocated. */
986*ef8d499eSDavid van Moolenbroek assert(tcpsock_recvbufs > 0);
987*ef8d499eSDavid van Moolenbroek tcpsock_recvbufs--;
988*ef8d499eSDavid van Moolenbroek
989*ef8d499eSDavid van Moolenbroek tcpsock_free_buf(ptail);
990*ef8d499eSDavid van Moolenbroek tcpsock_free_buf(pbuf);
991*ef8d499eSDavid van Moolenbroek
992*ef8d499eSDavid van Moolenbroek return TRUE;
993*ef8d499eSDavid van Moolenbroek } else if (ptail->tot_len - ptail->len >= pbuf->len) {
994*ef8d499eSDavid van Moolenbroek /*
995*ef8d499eSDavid van Moolenbroek * Case #2.
996*ef8d499eSDavid van Moolenbroek */
997*ef8d499eSDavid van Moolenbroek memcpy((char *)ptail->payload + ptail->len, pbuf->payload,
998*ef8d499eSDavid van Moolenbroek pbuf->len);
999*ef8d499eSDavid van Moolenbroek
1000*ef8d499eSDavid van Moolenbroek ptail->len += pbuf->len;
1001*ef8d499eSDavid van Moolenbroek
1002*ef8d499eSDavid van Moolenbroek ptail->next = pbuf->next;
1003*ef8d499eSDavid van Moolenbroek
1004*ef8d499eSDavid van Moolenbroek assert(tcpsock_recvbufs > 0);
1005*ef8d499eSDavid van Moolenbroek tcpsock_recvbufs--;
1006*ef8d499eSDavid van Moolenbroek
1007*ef8d499eSDavid van Moolenbroek tcpsock_free_buf(pbuf);
1008*ef8d499eSDavid van Moolenbroek
1009*ef8d499eSDavid van Moolenbroek return TRUE;
1010*ef8d499eSDavid van Moolenbroek } else
1011*ef8d499eSDavid van Moolenbroek return FALSE;
1012*ef8d499eSDavid van Moolenbroek }
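/*
 * A worked example of the two cases above, assuming 512-byte pool buffers
 * (i.e., MEMPOOL_BUFSIZE = 512). Case #1: two lwIP-provided buffers holding
 * 100 and 180 bytes each use at most half a buffer, so both are copied into
 * one newly allocated buffer, which then holds 280 bytes with 232 bytes of
 * room to spare. Case #2: if that merged buffer is later the tail (len 280,
 * tot_len 512) and the next buffer holds at most 232 bytes, its contents
 * are copied onto the end of the tail buffer in place.
 */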
1013*ef8d499eSDavid van Moolenbroek
1014*ef8d499eSDavid van Moolenbroek /*
1015*ef8d499eSDavid van Moolenbroek * Callback from lwIP. New data or flags have been received on a TCP socket.
1016*ef8d499eSDavid van Moolenbroek */
1017*ef8d499eSDavid van Moolenbroek static err_t
1018*ef8d499eSDavid van Moolenbroek tcpsock_event_recv(void * arg, struct tcp_pcb * pcb __unused,
1019*ef8d499eSDavid van Moolenbroek struct pbuf * pbuf, err_t err)
1020*ef8d499eSDavid van Moolenbroek {
1021*ef8d499eSDavid van Moolenbroek struct tcpsock *tcp = (struct tcpsock *)arg;
1022*ef8d499eSDavid van Moolenbroek struct pbuf *ptail, **pprevp;
1023*ef8d499eSDavid van Moolenbroek size_t len;
1024*ef8d499eSDavid van Moolenbroek
1025*ef8d499eSDavid van Moolenbroek assert(tcp != NULL);
1026*ef8d499eSDavid van Moolenbroek assert(pcb == tcp->tcp_pcb);
1027*ef8d499eSDavid van Moolenbroek
1028*ef8d499eSDavid van Moolenbroek /*
1029*ef8d499eSDavid van Moolenbroek * lwIP should never provide anything other than ERR_OK in 'err', and
1030*ef8d499eSDavid van Moolenbroek * it is not clear what we should do if it would. If lwIP ever changes
1031*ef8d499eSDavid van Moolenbroek * in this regard, we will likely have to change this code accordingly.
1032*ef8d499eSDavid van Moolenbroek */
1033*ef8d499eSDavid van Moolenbroek if (err != ERR_OK)
1034*ef8d499eSDavid van Moolenbroek panic("TCP receive event with error: %d", err);
1035*ef8d499eSDavid van Moolenbroek
1036*ef8d499eSDavid van Moolenbroek /* If the given buffer is NULL, we have received a FIN. */
1037*ef8d499eSDavid van Moolenbroek if (pbuf == NULL) {
1038*ef8d499eSDavid van Moolenbroek tcpsock_set_flag(tcp, TCPF_RCVD_FIN);
1039*ef8d499eSDavid van Moolenbroek
1040*ef8d499eSDavid van Moolenbroek /* Userland may now receive EOF. */
1041*ef8d499eSDavid van Moolenbroek if (!tcpsock_is_shutdown(tcp, SFL_SHUT_RD))
1042*ef8d499eSDavid van Moolenbroek sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV);
1043*ef8d499eSDavid van Moolenbroek
1044*ef8d499eSDavid van Moolenbroek /*
1045*ef8d499eSDavid van Moolenbroek * If we were in the process of closing the socket, and we
1046*ef8d499eSDavid van Moolenbroek * receive a FIN before our FIN got acknowledged, we close the
1047*ef8d499eSDavid van Moolenbroek * socket anyway, as described in tcpsock_close(). However, if
1048*ef8d499eSDavid van Moolenbroek * there is still unacknowledged outgoing data or we did not
1049*ef8d499eSDavid van Moolenbroek * even manage to send our FIN yet, hold off closing the socket
1050*ef8d499eSDavid van Moolenbroek * for now.
1051*ef8d499eSDavid van Moolenbroek */
1052*ef8d499eSDavid van Moolenbroek if (tcpsock_may_close(tcp))
1053*ef8d499eSDavid van Moolenbroek (void)tcpsock_finish_close(tcp);
1054*ef8d499eSDavid van Moolenbroek
1055*ef8d499eSDavid van Moolenbroek return ERR_OK;
1056*ef8d499eSDavid van Moolenbroek }
1057*ef8d499eSDavid van Moolenbroek
1058*ef8d499eSDavid van Moolenbroek /*
1059*ef8d499eSDavid van Moolenbroek * If the socket is being closed, receiving new data should cause a
1060*ef8d499eSDavid van Moolenbroek * reset.
1061*ef8d499eSDavid van Moolenbroek */
1062*ef8d499eSDavid van Moolenbroek if (sockevent_is_closing(tcpsock_get_sock(tcp))) {
1063*ef8d499eSDavid van Moolenbroek tcpsock_pcb_abort(tcp);
1064*ef8d499eSDavid van Moolenbroek
1065*ef8d499eSDavid van Moolenbroek (void)tcpsock_cleanup(tcp, TRUE /*may_free*/);
1066*ef8d499eSDavid van Moolenbroek /* Do not touch the socket object anymore! */
1067*ef8d499eSDavid van Moolenbroek
1068*ef8d499eSDavid van Moolenbroek pbuf_free(pbuf);
1069*ef8d499eSDavid van Moolenbroek
1070*ef8d499eSDavid van Moolenbroek return ERR_ABRT;
1071*ef8d499eSDavid van Moolenbroek }
1072*ef8d499eSDavid van Moolenbroek
1073*ef8d499eSDavid van Moolenbroek /*
1074*ef8d499eSDavid van Moolenbroek * If the socket has already been shut down for reading, discard the
1075*ef8d499eSDavid van Moolenbroek * incoming data and do nothing else.
1076*ef8d499eSDavid van Moolenbroek */
1077*ef8d499eSDavid van Moolenbroek if (tcpsock_is_shutdown(tcp, SFL_SHUT_RD)) {
1078*ef8d499eSDavid van Moolenbroek tcp_recved(tcp->tcp_pcb, pbuf->tot_len);
1079*ef8d499eSDavid van Moolenbroek
1080*ef8d499eSDavid van Moolenbroek pbuf_free(pbuf);
1081*ef8d499eSDavid van Moolenbroek
1082*ef8d499eSDavid van Moolenbroek return ERR_OK;
1083*ef8d499eSDavid van Moolenbroek }
1084*ef8d499eSDavid van Moolenbroek
1085*ef8d499eSDavid van Moolenbroek /*
1086*ef8d499eSDavid van Moolenbroek * We deliberately ignore the PBUF_FLAG_PUSH flag. This flag would
1087*ef8d499eSDavid van Moolenbroek * enable the receive functionality to delay delivering "un-pushed"
1088*ef8d499eSDavid van Moolenbroek * data to applications. The implementation of this scheme could track
1089*ef8d499eSDavid van Moolenbroek * the amount of data up to and including the last-pushed segment using
1090*ef8d499eSDavid van Moolenbroek * a "tr_push_len" field or so. Deciding when to deliver "un-pushed"
1091*ef8d499eSDavid van Moolenbroek * data after all is a bit trickier, though. As far as I can tell, the
1092*ef8d499eSDavid van Moolenbroek * BSDs do not implement anything like that. Windows does, and this
1093*ef8d499eSDavid van Moolenbroek * results in interaction problems with even more lightweight TCP/IP
1094*ef8d499eSDavid van Moolenbroek * stacks that do not send the TCP PSH flag. Currently, there is no
1095*ef8d499eSDavid van Moolenbroek * obvious benefit for us to support delaying data delivery like that.
1096*ef8d499eSDavid van Moolenbroek * In addition, testing its implementation reliably would be difficult.
1097*ef8d499eSDavid van Moolenbroek */
1098*ef8d499eSDavid van Moolenbroek
1099*ef8d499eSDavid van Moolenbroek len = (size_t)pbuf->tot_len;
1100*ef8d499eSDavid van Moolenbroek
1101*ef8d499eSDavid van Moolenbroek /*
1102*ef8d499eSDavid van Moolenbroek * Count the number of buffers that are now owned by us. The new total
1103*ef8d499eSDavid van Moolenbroek * of buffers owned by us must not exceed the size of the memory pool.
1104*ef8d499eSDavid van Moolenbroek * Any more would indicate an accounting error. Note that
1105*ef8d499eSDavid van Moolenbroek * tcpsock_recvbufs is currently used for debugging only!
1106*ef8d499eSDavid van Moolenbroek */
1107*ef8d499eSDavid van Moolenbroek tcpsock_recvbufs += pbuf_clen(pbuf);
1108*ef8d499eSDavid van Moolenbroek assert(tcpsock_recvbufs < mempool_cur_buffers());
1109*ef8d499eSDavid van Moolenbroek
1110*ef8d499eSDavid van Moolenbroek /*
1111*ef8d499eSDavid van Moolenbroek * The pre-tail pointer points to whatever is pointing to the tail
1112*ef8d499eSDavid van Moolenbroek * buffer. The latter pointer may be the 'tr_head' field in our
1113*ef8d499eSDavid van Moolenbroek * tcpsock structure, or the 'next' field in the penultimate buffer,
1114*ef8d499eSDavid van Moolenbroek * or NULL if there are currently no buffers on the receive queue.
1115*ef8d499eSDavid van Moolenbroek */
1116*ef8d499eSDavid van Moolenbroek if ((pprevp = tcp->tcp_rcv.tr_pre_tailp) != NULL) {
1117*ef8d499eSDavid van Moolenbroek ptail = *pprevp;
1118*ef8d499eSDavid van Moolenbroek
1119*ef8d499eSDavid van Moolenbroek assert(ptail != NULL);
1120*ef8d499eSDavid van Moolenbroek assert(ptail->next == NULL);
1121*ef8d499eSDavid van Moolenbroek assert(tcp->tcp_rcv.tr_head != NULL);
1122*ef8d499eSDavid van Moolenbroek
1123*ef8d499eSDavid van Moolenbroek ptail->next = pbuf;
1124*ef8d499eSDavid van Moolenbroek pbuf->tot_len = pbuf->len; /* to help freeing on merges */
1125*ef8d499eSDavid van Moolenbroek
1126*ef8d499eSDavid van Moolenbroek if (tcpsock_try_merge(pprevp, ptail, pbuf)) {
1127*ef8d499eSDavid van Moolenbroek ptail = *pprevp;
1128*ef8d499eSDavid van Moolenbroek pbuf = ptail->next;
1129*ef8d499eSDavid van Moolenbroek }
1130*ef8d499eSDavid van Moolenbroek
1131*ef8d499eSDavid van Moolenbroek if (pbuf != NULL)
1132*ef8d499eSDavid van Moolenbroek pprevp = &ptail->next;
1133*ef8d499eSDavid van Moolenbroek } else {
1134*ef8d499eSDavid van Moolenbroek assert(tcp->tcp_rcv.tr_head == NULL);
1135*ef8d499eSDavid van Moolenbroek assert(tcp->tcp_rcv.tr_head_off == 0);
1136*ef8d499eSDavid van Moolenbroek
1137*ef8d499eSDavid van Moolenbroek tcp->tcp_rcv.tr_head = pbuf;
1138*ef8d499eSDavid van Moolenbroek
1139*ef8d499eSDavid van Moolenbroek pprevp = &tcp->tcp_rcv.tr_head;
1140*ef8d499eSDavid van Moolenbroek }
1141*ef8d499eSDavid van Moolenbroek
1142*ef8d499eSDavid van Moolenbroek /*
1143*ef8d499eSDavid van Moolenbroek * Chop up the chain into individual buffers. This is necessary as we
1144*ef8d499eSDavid van Moolenbroek * overload 'tot_len' to mean "space available in the buffer", as we
1145*ef8d499eSDavid van Moolenbroek * want for buffers allocated by us as part of buffer merges. Also get
1146*ef8d499eSDavid van Moolenbroek * a pointer to the pointer to the new penultimate tail buffer. Due to
1147*ef8d499eSDavid van Moolenbroek * merging, the chain may already be empty by now, though.
1148*ef8d499eSDavid van Moolenbroek */
1149*ef8d499eSDavid van Moolenbroek if (pbuf != NULL) {
1150*ef8d499eSDavid van Moolenbroek for (; pbuf->next != NULL; pbuf = pbuf->next) {
1151*ef8d499eSDavid van Moolenbroek pbuf->tot_len = pbuf->len;
1152*ef8d499eSDavid van Moolenbroek
1153*ef8d499eSDavid van Moolenbroek pprevp = &pbuf->next;
1154*ef8d499eSDavid van Moolenbroek }
1155*ef8d499eSDavid van Moolenbroek assert(pbuf->len == pbuf->tot_len);
1156*ef8d499eSDavid van Moolenbroek }
1157*ef8d499eSDavid van Moolenbroek
1158*ef8d499eSDavid van Moolenbroek assert(*pprevp != NULL);
1159*ef8d499eSDavid van Moolenbroek assert((*pprevp)->next == NULL);
1160*ef8d499eSDavid van Moolenbroek tcp->tcp_rcv.tr_pre_tailp = pprevp;
1161*ef8d499eSDavid van Moolenbroek
1162*ef8d499eSDavid van Moolenbroek tcp->tcp_rcv.tr_len += len;
1163*ef8d499eSDavid van Moolenbroek tcp->tcp_rcv.tr_unacked += len;
1164*ef8d499eSDavid van Moolenbroek
1165*ef8d499eSDavid van Moolenbroek assert(tcp->tcp_rcv.tr_unacked <= TCP_WND);
1166*ef8d499eSDavid van Moolenbroek
1167*ef8d499eSDavid van Moolenbroek /*
1168*ef8d499eSDavid van Moolenbroek * Note that tr_len may now exceed the receive buffer size in the
1169*ef8d499eSDavid van Moolenbroek * highly exceptional case that the user is adjusting the latter after
1170*ef8d499eSDavid van Moolenbroek * the socket had already received data.
1171*ef8d499eSDavid van Moolenbroek */
1172*ef8d499eSDavid van Moolenbroek
1173*ef8d499eSDavid van Moolenbroek /* See if we can immediately acknowledge some or all of the data. */
1174*ef8d499eSDavid van Moolenbroek tcpsock_ack_recv(tcp);
1175*ef8d499eSDavid van Moolenbroek
1176*ef8d499eSDavid van Moolenbroek /* Also wake up any receivers now. */
1177*ef8d499eSDavid van Moolenbroek sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV);
1178*ef8d499eSDavid van Moolenbroek
1179*ef8d499eSDavid van Moolenbroek return ERR_OK;
1180*ef8d499eSDavid van Moolenbroek }
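/*
 * An illustrative sketch of the pre-tail pointer idiom used above, with
 * hypothetical 'ex_' names, excluded from compilation. By remembering a
 * pointer to whatever points to the tail node--either the queue's head field
 * or the penultimate node's 'next' field--we can attach to or replace the
 * tail without walking the chain. The empty-queue case, where the pre-tail
 * pointer would be NULL, is handled separately above.
 */
#if 0
struct ex_node {
	struct ex_node *next;
};

static struct ex_node **
ex_append(struct ex_node ** pre_tailp, struct ex_node * pnew)
{
	struct ex_node *ptail;

	/* 'pnew' is assumed to have a NULL 'next' pointer itself. */
	ptail = *pre_tailp;		/* the current tail node */
	assert(ptail != NULL && ptail->next == NULL);

	ptail->next = pnew;		/* attach the new tail node */

	return &ptail->next;		/* the new pre-tail pointer */
}
#endif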
1181*ef8d499eSDavid van Moolenbroek
1182*ef8d499eSDavid van Moolenbroek /*
1183*ef8d499eSDavid van Moolenbroek * Callback from lwIP. The PCB corresponding to the socket identified by 'arg'
1184*ef8d499eSDavid van Moolenbroek * has been closed by lwIP, with the reason specified in 'err': either the
1185*ef8d499eSDavid van Moolenbroek * connection has been aborted locally (ERR_ABRT), it has been reset by the
1186*ef8d499eSDavid van Moolenbroek * remote end (ERR_RST), or it is closed due to state transitions (ERR_CLSD).
1187*ef8d499eSDavid van Moolenbroek */
1188*ef8d499eSDavid van Moolenbroek static void
1189*ef8d499eSDavid van Moolenbroek tcpsock_event_err(void * arg, err_t err)
1190*ef8d499eSDavid van Moolenbroek {
1191*ef8d499eSDavid van Moolenbroek struct tcpsock *tcp = (struct tcpsock *)arg;
1192*ef8d499eSDavid van Moolenbroek int r;
1193*ef8d499eSDavid van Moolenbroek
1194*ef8d499eSDavid van Moolenbroek assert(tcp != NULL);
1195*ef8d499eSDavid van Moolenbroek assert(tcp->tcp_pcb != NULL);
1196*ef8d499eSDavid van Moolenbroek assert(err != ERR_OK);
1197*ef8d499eSDavid van Moolenbroek
1198*ef8d499eSDavid van Moolenbroek /* The original PCB is now gone, or will be shortly. */
1199*ef8d499eSDavid van Moolenbroek tcp->tcp_pcb = NULL;
1200*ef8d499eSDavid van Moolenbroek
1201*ef8d499eSDavid van Moolenbroek /*
1202*ef8d499eSDavid van Moolenbroek * Clean up the socket. As a result it may be freed, in which case we
1203*ef8d499eSDavid van Moolenbroek * must not touch it anymore. No need to return ERR_ABRT from here, as
1204*ef8d499eSDavid van Moolenbroek * the PCB has been aborted already.
1205*ef8d499eSDavid van Moolenbroek */
1206*ef8d499eSDavid van Moolenbroek if (tcpsock_cleanup(tcp, TRUE /*may_free*/))
1207*ef8d499eSDavid van Moolenbroek return;
1208*ef8d499eSDavid van Moolenbroek
1209*ef8d499eSDavid van Moolenbroek if (err == ERR_CLSD) {
1210*ef8d499eSDavid van Moolenbroek /*
1211*ef8d499eSDavid van Moolenbroek * We may get here if the socket is shut down for writing and
1212*ef8d499eSDavid van Moolenbroek * we already received a FIN from the remote side, thus putting
1213*ef8d499eSDavid van Moolenbroek * the socket in LAST_ACK state, and we receive that last
1214*ef8d499eSDavid van Moolenbroek * acknowledgment. There is nothing more we need to do.
1215*ef8d499eSDavid van Moolenbroek *
1216*ef8d499eSDavid van Moolenbroek * We will never get here in the other case that ERR_CLSD is
1217*ef8d499eSDavid van Moolenbroek * raised, which is when the socket is reset because of
1218*ef8d499eSDavid van Moolenbroek * unacknowledged data while closing: we handle the
1219*ef8d499eSDavid van Moolenbroek * reset-on-ACK case ourselves in tcpsock_close(), and the
1220*ef8d499eSDavid van Moolenbroek * socket is in closing state after that.
1221*ef8d499eSDavid van Moolenbroek */
1222*ef8d499eSDavid van Moolenbroek assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR));
1223*ef8d499eSDavid van Moolenbroek assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);
1224*ef8d499eSDavid van Moolenbroek } else {
1225*ef8d499eSDavid van Moolenbroek /*
1226*ef8d499eSDavid van Moolenbroek * Anything else should be an error directly from lwIP;
1227*ef8d499eSDavid van Moolenbroek * currently either ERR_ABRT or ERR_RST. Convert it to a
1228*ef8d499eSDavid van Moolenbroek * regular error and set it on the socket. Doing so will also
1229*ef8d499eSDavid van Moolenbroek * raise the appropriate events.
1230*ef8d499eSDavid van Moolenbroek */
1231*ef8d499eSDavid van Moolenbroek /*
1232*ef8d499eSDavid van Moolenbroek * Unfortunately, lwIP does not report accurate errors even
1233*ef8d499eSDavid van Moolenbroek * when it could. We convert some errors to reflect the most
1234*ef8d499eSDavid van Moolenbroek * likely cause more accurately.
1235*ef8d499eSDavid van Moolenbroek *
1236*ef8d499eSDavid van Moolenbroek * TODO: fix lwIP in this regard..
1237*ef8d499eSDavid van Moolenbroek */
1238*ef8d499eSDavid van Moolenbroek r = util_convert_err(err);
1239*ef8d499eSDavid van Moolenbroek
1240*ef8d499eSDavid van Moolenbroek if (tcpsock_get_flags(tcp) & TCPF_CONNECTING) {
1241*ef8d499eSDavid van Moolenbroek switch (err) {
1242*ef8d499eSDavid van Moolenbroek case ERR_ABRT: r = ETIMEDOUT; break;
1243*ef8d499eSDavid van Moolenbroek case ERR_RST: r = ECONNREFUSED; break;
1244*ef8d499eSDavid van Moolenbroek }
1245*ef8d499eSDavid van Moolenbroek }
1246*ef8d499eSDavid van Moolenbroek
1247*ef8d499eSDavid van Moolenbroek sockevent_set_error(tcpsock_get_sock(tcp), r);
1248*ef8d499eSDavid van Moolenbroek }
1249*ef8d499eSDavid van Moolenbroek }
1250*ef8d499eSDavid van Moolenbroek
1251*ef8d499eSDavid van Moolenbroek /*
1252*ef8d499eSDavid van Moolenbroek * Callback from lwIP. Perform regular checks on a TCP socket. This function
1253*ef8d499eSDavid van Moolenbroek * is called once per five seconds on connected sockets, and twice per second on
1254*ef8d499eSDavid van Moolenbroek * closing sockets.
1255*ef8d499eSDavid van Moolenbroek */
1256*ef8d499eSDavid van Moolenbroek static err_t
1257*ef8d499eSDavid van Moolenbroek tcpsock_event_poll(void * arg, struct tcp_pcb * pcb __unused)
1258*ef8d499eSDavid van Moolenbroek {
1259*ef8d499eSDavid van Moolenbroek struct tcpsock *tcp = (struct tcpsock *)arg;
1260*ef8d499eSDavid van Moolenbroek err_t err;
1261*ef8d499eSDavid van Moolenbroek int r;
1262*ef8d499eSDavid van Moolenbroek
1263*ef8d499eSDavid van Moolenbroek assert(tcp != NULL);
1264*ef8d499eSDavid van Moolenbroek assert(pcb == tcp->tcp_pcb);
1265*ef8d499eSDavid van Moolenbroek
1266*ef8d499eSDavid van Moolenbroek /*
1267*ef8d499eSDavid van Moolenbroek * If we ended up running out of buffers earlier, try resuming any send
1268*ef8d499eSDavid van Moolenbroek * requests now, both for enqueuing TCP data with lwIP and for user
1269*ef8d499eSDavid van Moolenbroek * requests.
1270*ef8d499eSDavid van Moolenbroek */
1271*ef8d499eSDavid van Moolenbroek if (tcpsock_get_flags(tcp) & (TCPF_FULL | TCPF_OOM)) {
1272*ef8d499eSDavid van Moolenbroek tcpsock_clear_flag(tcp, TCPF_FULL);
1273*ef8d499eSDavid van Moolenbroek tcpsock_clear_flag(tcp, TCPF_OOM);
1274*ef8d499eSDavid van Moolenbroek
1275*ef8d499eSDavid van Moolenbroek /* See if we can enqueue more data with lwIP. */
1276*ef8d499eSDavid van Moolenbroek if (tcpsock_pcb_enqueue(tcp)) {
1277*ef8d499eSDavid van Moolenbroek /* In some cases, we can now close the PCB. */
1278*ef8d499eSDavid van Moolenbroek if (tcpsock_may_close(tcp)) {
1279*ef8d499eSDavid van Moolenbroek (void)tcpsock_finish_close(tcp);
1280*ef8d499eSDavid van Moolenbroek /*
1281*ef8d499eSDavid van Moolenbroek * The PCB is definitely gone here, and the
1282*ef8d499eSDavid van Moolenbroek * entire socket object may be gone now too.
1283*ef8d499eSDavid van Moolenbroek * Do not touch either anymore!
1284*ef8d499eSDavid van Moolenbroek */
1285*ef8d499eSDavid van Moolenbroek
1286*ef8d499eSDavid van Moolenbroek return ERR_OK;
1287*ef8d499eSDavid van Moolenbroek }
1288*ef8d499eSDavid van Moolenbroek
1289*ef8d499eSDavid van Moolenbroek /*
1290*ef8d499eSDavid van Moolenbroek * If actually sending the data fails, the PCB will be
1291*ef8d499eSDavid van Moolenbroek * gone, and the socket object may be gone as well. Do
1292*ef8d499eSDavid van Moolenbroek * not touch either anymore in that case!
1293*ef8d499eSDavid van Moolenbroek */
1294*ef8d499eSDavid van Moolenbroek if (tcpsock_pcb_send(tcp, TRUE /*raise_error*/) != OK)
1295*ef8d499eSDavid van Moolenbroek return ERR_ABRT;
1296*ef8d499eSDavid van Moolenbroek }
1297*ef8d499eSDavid van Moolenbroek
1298*ef8d499eSDavid van Moolenbroek /*
1299*ef8d499eSDavid van Moolenbroek * If we ran out of buffers earlier, it may be possible to take
1300*ef8d499eSDavid van Moolenbroek * in more data from a user process now, even if we did not
1301*ef8d499eSDavid van Moolenbroek * manage to enqueue any more pending data with lwIP.
1302*ef8d499eSDavid van Moolenbroek */
1303*ef8d499eSDavid van Moolenbroek sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND);
1304*ef8d499eSDavid van Moolenbroek
1305*ef8d499eSDavid van Moolenbroek assert(tcp->tcp_pcb != NULL);
1306*ef8d499eSDavid van Moolenbroek } else if (tcp->tcp_snd.ts_unsent != NULL &&
1307*ef8d499eSDavid van Moolenbroek tcp->tcp_snd.ts_unsent_off < tcp->tcp_snd.ts_unsent->len) {
1308*ef8d499eSDavid van Moolenbroek /*
1309*ef8d499eSDavid van Moolenbroek * If the send buffer is full, we will no longer call
1310*ef8d499eSDavid van Moolenbroek * tcp_output(), which means we may also miss out on fatal
1311*ef8d499eSDavid van Moolenbroek * errors that would otherwise kill the connection (e.g., no
1312*ef8d499eSDavid van Moolenbroek * route). As a result, the connection may erroneously
1313*ef8d499eSDavid van Moolenbroek * continue to exist for a long time. To avoid this, we call
1314*ef8d499eSDavid van Moolenbroek * tcp_output() every once in a while as long as there is still
1315*ef8d499eSDavid van Moolenbroek * unsent data.
1316*ef8d499eSDavid van Moolenbroek */
1317*ef8d499eSDavid van Moolenbroek err = tcp_output(tcp->tcp_pcb);
1318*ef8d499eSDavid van Moolenbroek
1319*ef8d499eSDavid van Moolenbroek if (err != ERR_OK && err != ERR_MEM) {
1320*ef8d499eSDavid van Moolenbroek tcpsock_pcb_abort(tcp);
1321*ef8d499eSDavid van Moolenbroek
1322*ef8d499eSDavid van Moolenbroek if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) {
1323*ef8d499eSDavid van Moolenbroek r = util_convert_err(err);
1324*ef8d499eSDavid van Moolenbroek
1325*ef8d499eSDavid van Moolenbroek sockevent_set_error(tcpsock_get_sock(tcp), r);
1326*ef8d499eSDavid van Moolenbroek }
1327*ef8d499eSDavid van Moolenbroek /* Otherwise do not touch the socket object anymore! */
1328*ef8d499eSDavid van Moolenbroek
1329*ef8d499eSDavid van Moolenbroek return ERR_ABRT;
1330*ef8d499eSDavid van Moolenbroek }
1331*ef8d499eSDavid van Moolenbroek }
1332*ef8d499eSDavid van Moolenbroek
1333*ef8d499eSDavid van Moolenbroek /*
1334*ef8d499eSDavid van Moolenbroek * If we are closing the socket, and we sent a FIN, see if the FIN got
1335*ef8d499eSDavid van Moolenbroek * acknowledged. If so, finish closing the socket. Unfortunately, we
1336*ef8d499eSDavid van Moolenbroek * can perform this check by polling only. TODO: change lwIP..
1337*ef8d499eSDavid van Moolenbroek */
1338*ef8d499eSDavid van Moolenbroek if (sockevent_is_closing(tcpsock_get_sock(tcp)) &&
1339*ef8d499eSDavid van Moolenbroek (tcpsock_get_flags(tcp) & TCPF_SENT_FIN) &&
1340*ef8d499eSDavid van Moolenbroek tcp->tcp_pcb->unsent == NULL && tcp->tcp_pcb->unacked == NULL) {
1341*ef8d499eSDavid van Moolenbroek assert(tcp->tcp_snd.ts_len == 0);
1342*ef8d499eSDavid van Moolenbroek
1343*ef8d499eSDavid van Moolenbroek tcpsock_finish_close(tcp);
1344*ef8d499eSDavid van Moolenbroek }
1345*ef8d499eSDavid van Moolenbroek
1346*ef8d499eSDavid van Moolenbroek return ERR_OK;
1347*ef8d499eSDavid van Moolenbroek }
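/*
 * A note on the polling intervals mentioned above: lwIP's tcp_poll() takes
 * its interval argument in coarse TCP timer ticks, which occur twice per
 * second. An interval of 10 thus yields the five-second polling on
 * connected sockets, and an interval of 1 the twice-per-second polling on
 * closing sockets; TCP_POLL_REG_INTERVAL, as used below, presumably holds
 * such a tick count.
 */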
1348*ef8d499eSDavid van Moolenbroek
1349*ef8d499eSDavid van Moolenbroek /*
1350*ef8d499eSDavid van Moolenbroek * Bind a TCP socket to a local address.
1351*ef8d499eSDavid van Moolenbroek */
1352*ef8d499eSDavid van Moolenbroek static int
1353*ef8d499eSDavid van Moolenbroek tcpsock_bind(struct sock * sock, const struct sockaddr * addr,
1354*ef8d499eSDavid van Moolenbroek socklen_t addr_len, endpoint_t user_endpt)
1355*ef8d499eSDavid van Moolenbroek {
1356*ef8d499eSDavid van Moolenbroek struct tcpsock *tcp = (struct tcpsock *)sock;
1357*ef8d499eSDavid van Moolenbroek ip_addr_t ipaddr;
1358*ef8d499eSDavid van Moolenbroek uint16_t port;
1359*ef8d499eSDavid van Moolenbroek err_t err;
1360*ef8d499eSDavid van Moolenbroek int r;
1361*ef8d499eSDavid van Moolenbroek
1362*ef8d499eSDavid van Moolenbroek if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED)
1363*ef8d499eSDavid van Moolenbroek return EINVAL;
1364*ef8d499eSDavid van Moolenbroek
1365*ef8d499eSDavid van Moolenbroek if ((r = ipsock_get_src_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
1366*ef8d499eSDavid van Moolenbroek user_endpt, &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port,
1367*ef8d499eSDavid van Moolenbroek FALSE /*allow_mcast*/, &ipaddr, &port)) != OK)
1368*ef8d499eSDavid van Moolenbroek return r;
1369*ef8d499eSDavid van Moolenbroek
1370*ef8d499eSDavid van Moolenbroek err = tcp_bind(tcp->tcp_pcb, &ipaddr, port);
1371*ef8d499eSDavid van Moolenbroek
1372*ef8d499eSDavid van Moolenbroek return util_convert_err(err);
1373*ef8d499eSDavid van Moolenbroek }
1374*ef8d499eSDavid van Moolenbroek
1375*ef8d499eSDavid van Moolenbroek /*
1376*ef8d499eSDavid van Moolenbroek * Callback from lwIP. A new connection 'pcb' has arrived on the listening
1377*ef8d499eSDavid van Moolenbroek * socket identified by 'arg'. Note that 'pcb' may be NULL in the case that
1378*ef8d499eSDavid van Moolenbroek * lwIP could not accept the connection itself.
1379*ef8d499eSDavid van Moolenbroek */
1380*ef8d499eSDavid van Moolenbroek static err_t
1381*ef8d499eSDavid van Moolenbroek tcpsock_event_accept(void * arg, struct tcp_pcb * pcb, err_t err)
1382*ef8d499eSDavid van Moolenbroek {
1383*ef8d499eSDavid van Moolenbroek struct tcpsock *tcp = (struct tcpsock *)arg;
1384*ef8d499eSDavid van Moolenbroek
1385*ef8d499eSDavid van Moolenbroek assert(tcp != NULL);
1386*ef8d499eSDavid van Moolenbroek assert(tcpsock_is_listening(tcp));
1387*ef8d499eSDavid van Moolenbroek
1388*ef8d499eSDavid van Moolenbroek /*
1389*ef8d499eSDavid van Moolenbroek * If the given PCB is NULL, then lwIP ran out of memory allocating a
1390*ef8d499eSDavid van Moolenbroek * PCB for the new connection. There is nothing we can do with that
1391*ef8d499eSDavid van Moolenbroek * information. Also check 'err' just to make sure.
1392*ef8d499eSDavid van Moolenbroek */
1393*ef8d499eSDavid van Moolenbroek if (pcb == NULL || err != ERR_OK)
1394*ef8d499eSDavid van Moolenbroek return ERR_OK;
1395*ef8d499eSDavid van Moolenbroek
1396*ef8d499eSDavid van Moolenbroek /*
1397*ef8d499eSDavid van Moolenbroek * The TCP socket is the listening socket, but the PCB is for the
1398*ef8d499eSDavid van Moolenbroek * incoming connection.
1399*ef8d499eSDavid van Moolenbroek */
1400*ef8d499eSDavid van Moolenbroek if (tcpsock_clone(tcp, pcb) != OK) {
1401*ef8d499eSDavid van Moolenbroek /*
1402*ef8d499eSDavid van Moolenbroek * We could not allocate the resources necessary to accept the
1403*ef8d499eSDavid van Moolenbroek * connection. Abort it immediately.
1404*ef8d499eSDavid van Moolenbroek */
1405*ef8d499eSDavid van Moolenbroek tcp_abort(pcb);
1406*ef8d499eSDavid van Moolenbroek
1407*ef8d499eSDavid van Moolenbroek return ERR_ABRT;
1408*ef8d499eSDavid van Moolenbroek }
1409*ef8d499eSDavid van Moolenbroek
1410*ef8d499eSDavid van Moolenbroek /*
1411*ef8d499eSDavid van Moolenbroek * The connection has not yet been accepted, and thus should still be
1412*ef8d499eSDavid van Moolenbroek * considered on the listen queue.
1413*ef8d499eSDavid van Moolenbroek */
1414*ef8d499eSDavid van Moolenbroek tcp_backlog_delayed(pcb);
1415*ef8d499eSDavid van Moolenbroek
1416*ef8d499eSDavid van Moolenbroek /* Set the callback functions. */
1417*ef8d499eSDavid van Moolenbroek tcp_recv(pcb, tcpsock_event_recv);
1418*ef8d499eSDavid van Moolenbroek tcp_sent(pcb, tcpsock_event_sent);
1419*ef8d499eSDavid van Moolenbroek tcp_err(pcb, tcpsock_event_err);
1420*ef8d499eSDavid van Moolenbroek tcp_poll(pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);
1421*ef8d499eSDavid van Moolenbroek
1422*ef8d499eSDavid van Moolenbroek sockevent_raise(tcpsock_get_sock(tcp), SEV_ACCEPT);
1423*ef8d499eSDavid van Moolenbroek
1424*ef8d499eSDavid van Moolenbroek return ERR_OK;
1425*ef8d499eSDavid van Moolenbroek }
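/*
 * A note on the backlog calls: tcp_backlog_delayed() above and
 * tcp_backlog_accepted() in tcpsock_accept() form a pair in lwIP's API.
 * The former keeps the new connection counted against the listening PCB's
 * backlog for as long as it has not been accepted; the latter releases the
 * backlog slot once the application actually picks up the connection.
 */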
1426*ef8d499eSDavid van Moolenbroek
1427*ef8d499eSDavid van Moolenbroek /*
1428*ef8d499eSDavid van Moolenbroek * Put a TCP socket in listening mode.
1429*ef8d499eSDavid van Moolenbroek */
1430*ef8d499eSDavid van Moolenbroek static int
1431*ef8d499eSDavid van Moolenbroek tcpsock_listen(struct sock * sock, int backlog)
1432*ef8d499eSDavid van Moolenbroek {
1433*ef8d499eSDavid van Moolenbroek struct tcpsock *tcp = (struct tcpsock *)sock;
1434*ef8d499eSDavid van Moolenbroek struct tcp_pcb *pcb;
1435*ef8d499eSDavid van Moolenbroek err_t err;
1436*ef8d499eSDavid van Moolenbroek
1437*ef8d499eSDavid van Moolenbroek /* The maximum backlog value must fit in the PCB's 8-bit backlog field. */
1438*ef8d499eSDavid van Moolenbroek assert(SOMAXCONN <= UINT8_MAX);
1439*ef8d499eSDavid van Moolenbroek
1440*ef8d499eSDavid van Moolenbroek /*
1441*ef8d499eSDavid van Moolenbroek * Allow only CLOSED sockets to enter listening mode. If the socket
1442*ef8d499eSDavid van Moolenbroek * was already in listening mode, allow its backlog value to be
1443*ef8d499eSDavid van Moolenbroek * updated, even if it was shut down already (making this a no-op).
1444*ef8d499eSDavid van Moolenbroek */
1445*ef8d499eSDavid van Moolenbroek if (!tcpsock_is_listening(tcp) &&
1446*ef8d499eSDavid van Moolenbroek (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED))
1447*ef8d499eSDavid van Moolenbroek return EINVAL;
1448*ef8d499eSDavid van Moolenbroek
1449*ef8d499eSDavid van Moolenbroek /*
1450*ef8d499eSDavid van Moolenbroek * If the socket was not already in listening mode, put it in that mode
1451*ef8d499eSDavid van Moolenbroek * now. That involves switching PCBs as lwIP attempts to save memory
1452*ef8d499eSDavid van Moolenbroek * by replacing the original PCB with a smaller one. If the socket was
1453*ef8d499eSDavid van Moolenbroek * already in listening mode, simply update its backlog value--this has
1454*ef8d499eSDavid van Moolenbroek * no effect on the sockets already in the backlog.
1455*ef8d499eSDavid van Moolenbroek */
1456*ef8d499eSDavid van Moolenbroek if (!tcpsock_is_listening(tcp)) {
1457*ef8d499eSDavid van Moolenbroek assert(tcp->tcp_pcb != NULL);
1458*ef8d499eSDavid van Moolenbroek
1459*ef8d499eSDavid van Moolenbroek /*
1460*ef8d499eSDavid van Moolenbroek * If the socket has not been bound to a port yet, do that
1461*ef8d499eSDavid van Moolenbroek * first. This does mean that the listen call may fail with
1462*ef8d499eSDavid van Moolenbroek * side effects, but that is acceptable in this case.
1463*ef8d499eSDavid van Moolenbroek */
1464*ef8d499eSDavid van Moolenbroek if (tcp->tcp_pcb->local_port == 0) {
1465*ef8d499eSDavid van Moolenbroek err = tcp_bind(tcp->tcp_pcb, &tcp->tcp_pcb->local_ip,
1466*ef8d499eSDavid van Moolenbroek 0 /*port*/);
1467*ef8d499eSDavid van Moolenbroek
1468*ef8d499eSDavid van Moolenbroek if (err != ERR_OK)
1469*ef8d499eSDavid van Moolenbroek return util_convert_err(err);
1470*ef8d499eSDavid van Moolenbroek }
1471*ef8d499eSDavid van Moolenbroek
1472*ef8d499eSDavid van Moolenbroek /*
1473*ef8d499eSDavid van Moolenbroek * Clear the argument on the PCB that is about to be replaced,
1474*ef8d499eSDavid van Moolenbroek * because if we do not, once the PCB is reused (which does not
1475*ef8d499eSDavid van Moolenbroek * clear the argument), we might get weird events. Do this
1476*ef8d499eSDavid van Moolenbroek * before the tcp_listen() call, because we should no longer
1477*ef8d499eSDavid van Moolenbroek * access the old PCB afterwards (even if we can).
1478*ef8d499eSDavid van Moolenbroek */
1479*ef8d499eSDavid van Moolenbroek tcp_arg(tcp->tcp_pcb, NULL);
1480*ef8d499eSDavid van Moolenbroek
1481*ef8d499eSDavid van Moolenbroek pcb = tcp_listen_with_backlog_and_err(tcp->tcp_pcb, backlog,
1482*ef8d499eSDavid van Moolenbroek &err);
1483*ef8d499eSDavid van Moolenbroek
1484*ef8d499eSDavid van Moolenbroek if (pcb == NULL) {
1485*ef8d499eSDavid van Moolenbroek tcp_arg(tcp->tcp_pcb, tcp); /* oops, undo. */
1486*ef8d499eSDavid van Moolenbroek
1487*ef8d499eSDavid van Moolenbroek return util_convert_err(err);
1488*ef8d499eSDavid van Moolenbroek }
1489*ef8d499eSDavid van Moolenbroek
1490*ef8d499eSDavid van Moolenbroek tcp_arg(pcb, tcp);
1491*ef8d499eSDavid van Moolenbroek tcp->tcp_pcb = pcb;
1492*ef8d499eSDavid van Moolenbroek
1493*ef8d499eSDavid van Moolenbroek tcp_accept(pcb, tcpsock_event_accept);
1494*ef8d499eSDavid van Moolenbroek
1495*ef8d499eSDavid van Moolenbroek /* Initialize the queue head for sockets pending acceptance. */
1496*ef8d499eSDavid van Moolenbroek TAILQ_INIT(&tcp->tcp_queue.tq_head);
1497*ef8d499eSDavid van Moolenbroek } else if (tcp->tcp_pcb != NULL)
1498*ef8d499eSDavid van Moolenbroek tcp_backlog_set(tcp->tcp_pcb, backlog);
1499*ef8d499eSDavid van Moolenbroek
1500*ef8d499eSDavid van Moolenbroek return OK;
1501*ef8d499eSDavid van Moolenbroek }
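/*
 * A condensed sketch of the listen idiom above, wrapped in a hypothetical
 * 'ex_' helper and excluded from compilation. The original PCB is consumed
 * and replaced by a smaller listening PCB, so the callback argument must be
 * cleared on the old PCB beforehand and set on the new PCB afterwards.
 */
#if 0
static struct tcp_pcb *
ex_make_listener(struct tcp_pcb * pcb, void * arg, uint8_t backlog)
{
	struct tcp_pcb *lpcb;
	err_t err;

	tcp_arg(pcb, NULL);		/* the old PCB may be reused later */

	lpcb = tcp_listen_with_backlog_and_err(pcb, backlog, &err);

	if (lpcb == NULL) {
		tcp_arg(pcb, arg);	/* on failure, the old PCB remains */

		return NULL;
	}

	tcp_arg(lpcb, arg);
	tcp_accept(lpcb, tcpsock_event_accept);

	return lpcb;
}
#endif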
1502*ef8d499eSDavid van Moolenbroek
1503*ef8d499eSDavid van Moolenbroek /*
1504*ef8d499eSDavid van Moolenbroek * Callback from lwIP. A socket connection attempt has succeeded. Note that
1505*ef8d499eSDavid van Moolenbroek * failed connection attempts trigger the tcpsock_event_err() callback instead.
1506*ef8d499eSDavid van Moolenbroek */
1507*ef8d499eSDavid van Moolenbroek static err_t
1508*ef8d499eSDavid van Moolenbroek tcpsock_event_connected(void * arg, struct tcp_pcb * pcb __unused, err_t err)
1509*ef8d499eSDavid van Moolenbroek {
1510*ef8d499eSDavid van Moolenbroek struct tcpsock *tcp = (struct tcpsock *)arg;
1511*ef8d499eSDavid van Moolenbroek
1512*ef8d499eSDavid van Moolenbroek assert(tcp != NULL);
1513*ef8d499eSDavid van Moolenbroek assert(pcb == tcp->tcp_pcb);
1514*ef8d499eSDavid van Moolenbroek assert(tcpsock_get_flags(tcp) & TCPF_CONNECTING);
1515*ef8d499eSDavid van Moolenbroek
1516*ef8d499eSDavid van Moolenbroek /*
1517*ef8d499eSDavid van Moolenbroek * If lwIP ever changes so that this callback is called for connect
1518*ef8d499eSDavid van Moolenbroek * failures as well, then we need to change the code here accordingly.
1519*ef8d499eSDavid van Moolenbroek */
1520*ef8d499eSDavid van Moolenbroek if (err != ERR_OK)
1521*ef8d499eSDavid van Moolenbroek panic("TCP connected event with error: %d", err);
1522*ef8d499eSDavid van Moolenbroek
1523*ef8d499eSDavid van Moolenbroek tcpsock_clear_flag(tcp, TCPF_CONNECTING);
1524*ef8d499eSDavid van Moolenbroek
1525*ef8d499eSDavid van Moolenbroek sockevent_raise(tcpsock_get_sock(tcp), SEV_CONNECT | SEV_SEND);
1526*ef8d499eSDavid van Moolenbroek
1527*ef8d499eSDavid van Moolenbroek return ERR_OK;
1528*ef8d499eSDavid van Moolenbroek }
1529*ef8d499eSDavid van Moolenbroek
1530*ef8d499eSDavid van Moolenbroek /*
1531*ef8d499eSDavid van Moolenbroek * Connect a TCP socket to a remote address.
1532*ef8d499eSDavid van Moolenbroek */
1533*ef8d499eSDavid van Moolenbroek static int
1534*ef8d499eSDavid van Moolenbroek tcpsock_connect(struct sock * sock, const struct sockaddr * addr,
1535*ef8d499eSDavid van Moolenbroek socklen_t addr_len, endpoint_t user_endpt)
1536*ef8d499eSDavid van Moolenbroek {
1537*ef8d499eSDavid van Moolenbroek struct tcpsock *tcp = (struct tcpsock *)sock;
1538*ef8d499eSDavid van Moolenbroek ip_addr_t dst_addr;
1539*ef8d499eSDavid van Moolenbroek uint16_t dst_port;
1540*ef8d499eSDavid van Moolenbroek err_t err;
1541*ef8d499eSDavid van Moolenbroek int r;
1542*ef8d499eSDavid van Moolenbroek
1543*ef8d499eSDavid van Moolenbroek /*
1544*ef8d499eSDavid van Moolenbroek * Listening sockets may not have a PCB, so we use higher-level flags
1545*ef8d499eSDavid van Moolenbroek * to throw the correct error code for those instead.
1546*ef8d499eSDavid van Moolenbroek */
1547*ef8d499eSDavid van Moolenbroek if (tcpsock_is_listening(tcp))
1548*ef8d499eSDavid van Moolenbroek return EOPNOTSUPP;
1549*ef8d499eSDavid van Moolenbroek
1550*ef8d499eSDavid van Moolenbroek /*
1551*ef8d499eSDavid van Moolenbroek * If there is no longer any PCB, we obviously cannot perform the
1552*ef8d499eSDavid van Moolenbroek * connection, but POSIX is not clear on which error to return. We
1553*ef8d499eSDavid van Moolenbroek * copy NetBSD's.
1554*ef8d499eSDavid van Moolenbroek */
1555*ef8d499eSDavid van Moolenbroek if (tcp->tcp_pcb == NULL)
1556*ef8d499eSDavid van Moolenbroek return EINVAL;
1557*ef8d499eSDavid van Moolenbroek
1558*ef8d499eSDavid van Moolenbroek /*
1559*ef8d499eSDavid van Moolenbroek * The only state from which a connection can be initiated is CLOSED.
1560*ef8d499eSDavid van Moolenbroek * Some of the other states require distinct error codes, though.
1561*ef8d499eSDavid van Moolenbroek */
1562*ef8d499eSDavid van Moolenbroek switch (tcp->tcp_pcb->state) {
1563*ef8d499eSDavid van Moolenbroek case CLOSED:
1564*ef8d499eSDavid van Moolenbroek break;
1565*ef8d499eSDavid van Moolenbroek case SYN_SENT:
1566*ef8d499eSDavid van Moolenbroek return EALREADY;
1567*ef8d499eSDavid van Moolenbroek case LISTEN:
1568*ef8d499eSDavid van Moolenbroek assert(0); /* we just checked.. */
1569*ef8d499eSDavid van Moolenbroek default:
1570*ef8d499eSDavid van Moolenbroek return EISCONN;
1571*ef8d499eSDavid van Moolenbroek }
1572*ef8d499eSDavid van Moolenbroek
1573*ef8d499eSDavid van Moolenbroek /*
1574*ef8d499eSDavid van Moolenbroek * Get the destination address, and attempt to start connecting. If
1575*ef8d499eSDavid van Moolenbroek * the socket was not bound before, or it was bound to a port only,
1576*ef8d499eSDavid van Moolenbroek * then lwIP will select a source address for us. We cannot do this
1577*ef8d499eSDavid van Moolenbroek * ourselves even if we wanted to: it is impossible to re-bind a TCP
1578*ef8d499eSDavid van Moolenbroek * PCB in the case it was previously bound to a port only.
1579*ef8d499eSDavid van Moolenbroek */
1580*ef8d499eSDavid van Moolenbroek if ((r = ipsock_get_dst_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
1581*ef8d499eSDavid van Moolenbroek &tcp->tcp_pcb->local_ip, &dst_addr, &dst_port)) != OK)
1582*ef8d499eSDavid van Moolenbroek return r;
1583*ef8d499eSDavid van Moolenbroek
1584*ef8d499eSDavid van Moolenbroek err = tcp_connect(tcp->tcp_pcb, &dst_addr, dst_port,
1585*ef8d499eSDavid van Moolenbroek tcpsock_event_connected);
1586*ef8d499eSDavid van Moolenbroek
1587*ef8d499eSDavid van Moolenbroek /*
1588*ef8d499eSDavid van Moolenbroek * Note that various tcp_connect() error cases will leave the PCB with
1589*ef8d499eSDavid van Moolenbroek * a newly set local and remote IP address anyway. We should be
1590*ef8d499eSDavid van Moolenbroek * careful not to rely on the addresses being as they were before.
1591*ef8d499eSDavid van Moolenbroek */
1592*ef8d499eSDavid van Moolenbroek if (err != ERR_OK)
1593*ef8d499eSDavid van Moolenbroek return util_convert_err(err);
1594*ef8d499eSDavid van Moolenbroek
1595*ef8d499eSDavid van Moolenbroek /* Set the other callback functions. */
1596*ef8d499eSDavid van Moolenbroek tcp_recv(tcp->tcp_pcb, tcpsock_event_recv);
1597*ef8d499eSDavid van Moolenbroek tcp_sent(tcp->tcp_pcb, tcpsock_event_sent);
1598*ef8d499eSDavid van Moolenbroek tcp_err(tcp->tcp_pcb, tcpsock_event_err);
1599*ef8d499eSDavid van Moolenbroek tcp_poll(tcp->tcp_pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);
1600*ef8d499eSDavid van Moolenbroek
1601*ef8d499eSDavid van Moolenbroek /*
1602*ef8d499eSDavid van Moolenbroek * Set a flag so that we can correct lwIP's error codes in case the
1603*ef8d499eSDavid van Moolenbroek * connection fails.
1604*ef8d499eSDavid van Moolenbroek */
1605*ef8d499eSDavid van Moolenbroek tcpsock_set_flag(tcp, TCPF_CONNECTING);
1606*ef8d499eSDavid van Moolenbroek
1607*ef8d499eSDavid van Moolenbroek return SUSPEND;
1608*ef8d499eSDavid van Moolenbroek }
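/*
 * A note on the SUSPEND return value above: tcp_connect() merely initiates
 * the handshake by sending a SYN. lwIP reports the outcome asynchronously,
 * through tcpsock_event_connected() on success and tcpsock_event_err() on
 * failure, so the calling process remains suspended until one of those
 * callbacks raises a socket event.
 */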
1609*ef8d499eSDavid van Moolenbroek
1610*ef8d499eSDavid van Moolenbroek /*
1611*ef8d499eSDavid van Moolenbroek * Test whether any new connections are pending on a listening TCP socket.
1612*ef8d499eSDavid van Moolenbroek */
1613*ef8d499eSDavid van Moolenbroek static int
1614*ef8d499eSDavid van Moolenbroek tcpsock_test_accept(struct sock * sock)
1615*ef8d499eSDavid van Moolenbroek {
1616*ef8d499eSDavid van Moolenbroek struct tcpsock *tcp = (struct tcpsock *)sock;
1617*ef8d499eSDavid van Moolenbroek
1618*ef8d499eSDavid van Moolenbroek /* Is this socket in listening mode at all? */
1619*ef8d499eSDavid van Moolenbroek if (!tcpsock_is_listening(tcp))
1620*ef8d499eSDavid van Moolenbroek return EINVAL;
1621*ef8d499eSDavid van Moolenbroek
1622*ef8d499eSDavid van Moolenbroek /* Are there any connections to accept right now? */
1623*ef8d499eSDavid van Moolenbroek if (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head))
1624*ef8d499eSDavid van Moolenbroek return OK;
1625*ef8d499eSDavid van Moolenbroek
1626*ef8d499eSDavid van Moolenbroek /* If the socket has been shut down, we return ECONNABORTED. */
1627*ef8d499eSDavid van Moolenbroek if (tcp->tcp_pcb == NULL)
1628*ef8d499eSDavid van Moolenbroek return ECONNABORTED;
1629*ef8d499eSDavid van Moolenbroek
1630*ef8d499eSDavid van Moolenbroek /* Otherwise, wait for a new connection first. */
1631*ef8d499eSDavid van Moolenbroek return SUSPEND;
1632*ef8d499eSDavid van Moolenbroek }
1633*ef8d499eSDavid van Moolenbroek
1634*ef8d499eSDavid van Moolenbroek /*
1635*ef8d499eSDavid van Moolenbroek * Accept a connection on a listening TCP socket, creating a new TCP socket.
1636*ef8d499eSDavid van Moolenbroek */
1637*ef8d499eSDavid van Moolenbroek static sockid_t
1638*ef8d499eSDavid van Moolenbroek tcpsock_accept(struct sock * sock, struct sockaddr * addr,
1639*ef8d499eSDavid van Moolenbroek socklen_t * addr_len, endpoint_t user_endpt __unused,
1640*ef8d499eSDavid van Moolenbroek struct sock ** newsockp)
1641*ef8d499eSDavid van Moolenbroek {
1642*ef8d499eSDavid van Moolenbroek struct tcpsock *listener = (struct tcpsock *)sock;
1643*ef8d499eSDavid van Moolenbroek struct tcpsock *tcp;
1644*ef8d499eSDavid van Moolenbroek int r;
1645*ef8d499eSDavid van Moolenbroek
1646*ef8d499eSDavid van Moolenbroek if ((r = tcpsock_test_accept(sock)) != OK)
1647*ef8d499eSDavid van Moolenbroek return r;
1648*ef8d499eSDavid van Moolenbroek /* Below, we must not assume that the listener has a PCB. */
1649*ef8d499eSDavid van Moolenbroek
1650*ef8d499eSDavid van Moolenbroek tcp = TAILQ_FIRST(&listener->tcp_queue.tq_head);
1651*ef8d499eSDavid van Moolenbroek assert(tcp->tcp_listener == listener);
1652*ef8d499eSDavid van Moolenbroek assert(tcp->tcp_pcb != NULL);
1653*ef8d499eSDavid van Moolenbroek
1654*ef8d499eSDavid van Moolenbroek TAILQ_REMOVE(&listener->tcp_queue.tq_head, tcp, tcp_queue.tq_next);
1655*ef8d499eSDavid van Moolenbroek tcp->tcp_listener = NULL;
1656*ef8d499eSDavid van Moolenbroek
1657*ef8d499eSDavid van Moolenbroek tcp_backlog_accepted(tcp->tcp_pcb);
1658*ef8d499eSDavid van Moolenbroek
1659*ef8d499eSDavid van Moolenbroek ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
1660*ef8d499eSDavid van Moolenbroek &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port);
1661*ef8d499eSDavid van Moolenbroek
1662*ef8d499eSDavid van Moolenbroek /*
1663*ef8d499eSDavid van Moolenbroek * Set 'newsockp' to NULL so that libsockevent knows we already cloned
1664*ef8d499eSDavid van Moolenbroek * the socket, and it must not be reinitialized anymore.
1665*ef8d499eSDavid van Moolenbroek */
1666*ef8d499eSDavid van Moolenbroek *newsockp = NULL;
1667*ef8d499eSDavid van Moolenbroek return tcpsock_get_id(tcp);
1668*ef8d499eSDavid van Moolenbroek }
1669*ef8d499eSDavid van Moolenbroek
1670*ef8d499eSDavid van Moolenbroek /*
1671*ef8d499eSDavid van Moolenbroek * Perform preliminary checks on a send request.
1672*ef8d499eSDavid van Moolenbroek */
1673*ef8d499eSDavid van Moolenbroek static int
1674*ef8d499eSDavid van Moolenbroek tcpsock_pre_send(struct sock * sock, size_t len __unused,
1675*ef8d499eSDavid van Moolenbroek socklen_t ctl_len __unused, const struct sockaddr * addr __unused,
1676*ef8d499eSDavid van Moolenbroek socklen_t addr_len __unused, endpoint_t user_endpt __unused, int flags)
1677*ef8d499eSDavid van Moolenbroek {
1678*ef8d499eSDavid van Moolenbroek
1679*ef8d499eSDavid van Moolenbroek /*
1680*ef8d499eSDavid van Moolenbroek * Reject calls with unknown flags. Since libsockevent strips out the
1681*ef8d499eSDavid van Moolenbroek * flags it handles itself here, we only have to test for the ones we
1682*ef8d499eSDavid van Moolenbroek * cannot handle. Currently, there are no send flags that we support.
1683*ef8d499eSDavid van Moolenbroek */
1684*ef8d499eSDavid van Moolenbroek if (flags != 0)
1685*ef8d499eSDavid van Moolenbroek return EOPNOTSUPP;
1686*ef8d499eSDavid van Moolenbroek
1687*ef8d499eSDavid van Moolenbroek return OK;
1688*ef8d499eSDavid van Moolenbroek }
1689*ef8d499eSDavid van Moolenbroek
1690*ef8d499eSDavid van Moolenbroek /*
1691*ef8d499eSDavid van Moolenbroek * Test whether the given number of data bytes can be sent on a TCP socket.
1692*ef8d499eSDavid van Moolenbroek */
1693*ef8d499eSDavid van Moolenbroek static int
1694*ef8d499eSDavid van Moolenbroek tcpsock_test_send(struct sock * sock, size_t min)
1695*ef8d499eSDavid van Moolenbroek {
1696*ef8d499eSDavid van Moolenbroek struct tcpsock *tcp = (struct tcpsock *)sock;
1697*ef8d499eSDavid van Moolenbroek size_t sndbuf;
1698*ef8d499eSDavid van Moolenbroek
1699*ef8d499eSDavid van Moolenbroek if (tcp->tcp_pcb == NULL)
1700*ef8d499eSDavid van Moolenbroek return EPIPE;
1701*ef8d499eSDavid van Moolenbroek
1702*ef8d499eSDavid van Moolenbroek switch (tcp->tcp_pcb->state) {
1703*ef8d499eSDavid van Moolenbroek case CLOSED: /* new */
1704*ef8d499eSDavid van Moolenbroek case LISTEN: /* listening */
1705*ef8d499eSDavid van Moolenbroek return ENOTCONN;
1706*ef8d499eSDavid van Moolenbroek case SYN_SENT: /* connecting */
1707*ef8d499eSDavid van Moolenbroek case SYN_RCVD: /* simultaneous open, maybe someday? */
1708*ef8d499eSDavid van Moolenbroek return SUSPEND;
1709*ef8d499eSDavid van Moolenbroek case ESTABLISHED: /* connected */
1710*ef8d499eSDavid van Moolenbroek case CLOSE_WAIT: /* closed remotely */
1711*ef8d499eSDavid van Moolenbroek break;
1712*ef8d499eSDavid van Moolenbroek default: /* shut down locally */
1713*ef8d499eSDavid van Moolenbroek assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR));
1714*ef8d499eSDavid van Moolenbroek return EPIPE;
1715*ef8d499eSDavid van Moolenbroek }
1716*ef8d499eSDavid van Moolenbroek
1717*ef8d499eSDavid van Moolenbroek sndbuf = tcpsock_get_sndbuf(tcp);
1718*ef8d499eSDavid van Moolenbroek if (min > sndbuf)
1719*ef8d499eSDavid van Moolenbroek min = sndbuf;
1720*ef8d499eSDavid van Moolenbroek
1721*ef8d499eSDavid van Moolenbroek if (tcp->tcp_snd.ts_len + min > sndbuf)
1722*ef8d499eSDavid van Moolenbroek return SUSPEND;
1723*ef8d499eSDavid van Moolenbroek else
1724*ef8d499eSDavid van Moolenbroek return OK;
1725*ef8d499eSDavid van Moolenbroek }
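/*
 * A worked example of the clamping above, with assumed numbers: with a send
 * buffer of 32768 bytes of which 30000 are in use, a call with a low water
 * mark of 8192 suspends, because only 2768 bytes of space are free. A low
 * water mark exceeding the send buffer size is first clamped to 32768, so
 * that such a call suspends until the send queue is completely empty,
 * rather than suspending forever.
 */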
1726*ef8d499eSDavid van Moolenbroek
1727*ef8d499eSDavid van Moolenbroek /*
1728*ef8d499eSDavid van Moolenbroek * Send data on a TCP socket.
1729*ef8d499eSDavid van Moolenbroek */
1730*ef8d499eSDavid van Moolenbroek static int
1731*ef8d499eSDavid van Moolenbroek tcpsock_send(struct sock * sock, const struct sockdriver_data * data,
1732*ef8d499eSDavid van Moolenbroek size_t len, size_t * offp, const struct sockdriver_data * ctl __unused,
1733*ef8d499eSDavid van Moolenbroek socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
1734*ef8d499eSDavid van Moolenbroek const struct sockaddr * addr __unused, socklen_t addr_len __unused,
1735*ef8d499eSDavid van Moolenbroek endpoint_t user_endpt __unused, int flags __unused, size_t min)
1736*ef8d499eSDavid van Moolenbroek {
1737*ef8d499eSDavid van Moolenbroek struct tcpsock *tcp = (struct tcpsock *)sock;
1738*ef8d499eSDavid van Moolenbroek struct pbuf *ptail, *pfirst, *pnext, *plast;
1739*ef8d499eSDavid van Moolenbroek size_t off, tail_off, chunk, left, sndbuf;
1740*ef8d499eSDavid van Moolenbroek int r;
1741*ef8d499eSDavid van Moolenbroek
1742*ef8d499eSDavid van Moolenbroek if ((r = tcpsock_test_send(sock, min)) != OK)
1743*ef8d499eSDavid van Moolenbroek return r;
1744*ef8d499eSDavid van Moolenbroek
1745*ef8d499eSDavid van Moolenbroek if (len == 0)
1746*ef8d499eSDavid van Moolenbroek return OK; /* nothing to do */

	sndbuf = tcpsock_get_sndbuf(tcp);
	if (min > sndbuf)
		min = sndbuf;
	assert(min > 0);

	assert(sndbuf > tcp->tcp_snd.ts_len);
	left = sndbuf - tcp->tcp_snd.ts_len;
	if (left > len)
		left = len;

	/*
	 * First see if we can fit any more data in the current tail buffer.
	 * If so, we set 'ptail' to point to it and 'tail_off' to the previous
	 * length of the tail buffer, while optimistically extending it to
	 * include the new data. If not, we set them to NULL/0.
	 */
	if ((ptail = tcp->tcp_snd.ts_tail) != NULL &&
	    ptail->len < ptail->tot_len) {
		assert(ptail->len > 0);
		tail_off = (size_t)ptail->len;

		/*
		 * Optimistically extend this tail buffer to include whatever
		 * fits in it. This is needed for util_copy_data().
		 */
		assert(ptail->tot_len > ptail->len);
		off = (size_t)ptail->tot_len - (size_t)ptail->len;
		if (off > left)
			off = left;
		ptail->len += off;
	} else {
		ptail = NULL;
		tail_off = 0;
		off = 0;
	}

	/*
	 * Then, if there is more to send, allocate new buffers as needed. If
	 * we run out of memory, work with whatever we did manage to grab.
	 */
	pfirst = NULL;
	plast = NULL;
	while (off < left) {
		if (tcpsock_sendbufs >= TCP_MAX_SENDBUFS ||
		    (pnext = tcpsock_alloc_buf()) == NULL) {
			/*
			 * Chances are that we will end up suspending this send
			 * request because of being out of buffers. We try to
			 * resume such requests from the polling function.
			 */
			tcpsock_set_flag(tcp, TCPF_OOM);

			break;
		}

		tcpsock_sendbufs++;

		if (pfirst == NULL)
			pfirst = pnext;
		else
			plast->next = pnext;
		plast = pnext;

		chunk = (size_t)pnext->tot_len;
		if (chunk > left - off)
			chunk = left - off;
		pnext->len = chunk;
		off += chunk;
	}

	/*
	 * Copy in the data and continue, unless we did not manage to find
	 * enough space to even meet the low send watermark, in which case we
	 * undo any allocation and suspend the call until later.
	 */
	if (off >= min) {
		/*
		 * Optimistically attach the new buffers to the tail, also for
		 * util_copy_data(). We undo all this if the copy fails.
		 */
		if (ptail != NULL) {
			ptail->next = pfirst;

			pnext = ptail;
		} else
			pnext = pfirst;

		assert(pnext != NULL);

		r = util_copy_data(data, off, *offp, pnext, tail_off,
		    TRUE /*copy_in*/);
	} else
		r = SUSPEND;

	if (r != OK) {
		/* Undo the modifications made so far. */
		while (pfirst != NULL) {
			pnext = pfirst->next;

			assert(tcpsock_sendbufs > 0);
			tcpsock_sendbufs--;

			tcpsock_free_buf(pfirst);

			pfirst = pnext;
		}

		if (ptail != NULL) {
			ptail->next = NULL;

			ptail->len = tail_off;
		}

		return r;
	}

	/* Attach the new buffers, if any, to the buffer tail. */
	if (pfirst != NULL) {
		if ((ptail = tcp->tcp_snd.ts_tail) != NULL) {
			assert(ptail->len == ptail->tot_len);

			/*
			 * Due to our earlier optimistic modifications, this
			 * may or may not be redundant.
			 */
			ptail->next = pfirst;
		}

		assert(plast != NULL);
		tcp->tcp_snd.ts_tail = plast;

		if (tcp->tcp_snd.ts_head == NULL) {
			tcp->tcp_snd.ts_head = pfirst;
			assert(tcp->tcp_snd.ts_head_off == 0);
		}
		if (tcp->tcp_snd.ts_unsent == NULL) {
			tcp->tcp_snd.ts_unsent = pfirst;
			assert(tcp->tcp_snd.ts_unsent_off == 0);
		}
	}

	tcp->tcp_snd.ts_len += off;

	/*
	 * See if we can send any of the data we just enqueued. The socket is
	 * still open as we are still processing a call from userland on it;
	 * this saves us from having to deal with the cases that the following
	 * calls end up freeing the socket object.
	 */
	if (tcpsock_pcb_enqueue(tcp) &&
	    (r = tcpsock_pcb_send(tcp, FALSE /*raise_error*/)) != OK) {
		/*
		 * That did not go well. Return the error immediately if we
		 * had not made any progress earlier. Otherwise, return our
		 * partial progress and leave the error to be picked up later.
		 */
		if (*offp > 0) {
			sockevent_set_error(tcpsock_get_sock(tcp), r);

			return OK;
		} else
			return r;
	}

	*offp += off;
	return (off < len) ? SUSPEND : OK;
}
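
/*
 * For illustration only (not part of this module): the partial-progress
 * behavior above is what makes a standard blocking send loop in userland
 * work as expected, with any deferred error surfacing on the next call. A
 * minimal sketch, assuming a connected socket descriptor 'fd' and a buffer
 * 'buf' of 'len' bytes:
 *
 *	size_t done = 0;
 *	while (done < len) {
 *		ssize_t n = send(fd, buf + done, len - done, 0);
 *		if (n < 0)
 *			break;		// errno holds the (deferred) error
 *		done += (size_t)n;
 *	}
 */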

/*
 * Perform preliminary checks on a receive request.
 */
static int
tcpsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused,
	int flags)
{

	/*
	 * Reject calls with unknown flags. Since libsockevent strips out the
	 * flags it handles itself here, we only have to test for ones we
	 * cannot handle.
	 */
	if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0)
		return EOPNOTSUPP;

	return OK;
}

/*
 * Return TRUE if receive calls may wait for more data to come in on the
 * connection, or FALSE if we already know that that is not going to happen.
 */
static int
tcpsock_may_wait(struct tcpsock * tcp)
{

	return (tcp->tcp_pcb != NULL &&
	    !(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN));
}

/*
 * Test whether data can be received on a TCP socket, and if so, how many bytes
 * of data.
 */
static int
tcpsock_test_recv(struct sock * sock, size_t min, size_t * size)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	int may_wait;

	/* If there is no connection and never was one, refuse the call. */
	if (tcp->tcp_pcb != NULL && (tcp->tcp_pcb->state == CLOSED ||
	    tcp->tcp_pcb->state == LISTEN))
		return ENOTCONN;

	/*
	 * If we are certain that no more data will come in later, ignore the
	 * low receive watermark. Otherwise, bound it to the size of the
	 * receive buffer, or receive calls may block forever.
	 */
	if (!(may_wait = tcpsock_may_wait(tcp)))
		min = 1;
	else if (min > tcpsock_get_rcvbuf(tcp))
		min = tcpsock_get_rcvbuf(tcp);

	if (tcp->tcp_rcv.tr_len >= min) {
		if (size != NULL)
			*size = tcp->tcp_rcv.tr_len;

		return OK;
	}

	return (may_wait) ? SUSPEND : SOCKEVENT_EOF;
}
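
/*
 * A worked example of the watermark logic above, with made-up numbers: if
 * a caller sets SO_RCVLOWAT to 128 KB while the receive buffer is 32 KB,
 * the minimum is clamped to 32 KB so that the call can eventually be
 * satisfied. Once a FIN has been received, even a single pending byte (or
 * none at all, yielding EOF) completes the call regardless of the
 * watermark.
 */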

/*
 * Receive data on a TCP socket.
 */
static int
tcpsock_recv(struct sock * sock, const struct sockdriver_data * data,
	size_t len, size_t * offp, const struct sockdriver_data * ctl __unused,
	socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
	struct sockaddr * addr __unused, socklen_t * addr_len __unused,
	endpoint_t user_endpt __unused, int flags, size_t min,
	int * rflags __unused)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct pbuf *ptail;
	size_t off, left;
	int r;

	/* See if we can receive at all, and if so, how much at most. */
	if ((r = tcpsock_test_recv(sock, min, NULL)) != OK)
		return r;

	if (len == 0)
		return OK;	/* nothing to do */

	off = tcp->tcp_rcv.tr_len;
	if (off > len)
		off = len;

	assert(tcp->tcp_rcv.tr_head != NULL);
	assert(tcp->tcp_rcv.tr_head_off < tcp->tcp_rcv.tr_head->len);

	/* Copy out the data to the caller. */
	if ((r = util_copy_data(data, off, *offp, tcp->tcp_rcv.tr_head,
	    tcp->tcp_rcv.tr_head_off, FALSE /*copy_in*/)) != OK)
		return r;

	/* Unless peeking, remove the data from the receive queue. */
	if (!(flags & MSG_PEEK)) {
		left = off;

		/* Dequeue and free as many entire buffers as possible. */
		while ((ptail = tcp->tcp_rcv.tr_head) != NULL &&
		    left >= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off) {
			left -= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off;

			tcp->tcp_rcv.tr_head = ptail->next;
			tcp->tcp_rcv.tr_head_off = 0;

			if (tcp->tcp_rcv.tr_head == NULL)
				tcp->tcp_rcv.tr_pre_tailp = NULL;
			else if (tcp->tcp_rcv.tr_pre_tailp == &ptail->next)
				tcp->tcp_rcv.tr_pre_tailp =
				    &tcp->tcp_rcv.tr_head;

			assert(tcpsock_recvbufs > 0);
			tcpsock_recvbufs--;

			tcpsock_free_buf(ptail);
		}

		/*
		 * If only part of the (new) head buffer is consumed, adjust
		 * the saved offset into that buffer.
		 */
		if (left > 0) {
			assert(tcp->tcp_rcv.tr_head != NULL);
			assert((size_t)tcp->tcp_rcv.tr_head->len -
			    tcp->tcp_rcv.tr_head_off > left);

			tcp->tcp_rcv.tr_head_off += left;
		}

		tcp->tcp_rcv.tr_len -= off;

		if (tcp->tcp_rcv.tr_head != NULL) {
			assert(tcp->tcp_rcv.tr_pre_tailp != NULL);
			assert(tcp->tcp_rcv.tr_len > 0);
		} else {
			assert(tcp->tcp_rcv.tr_pre_tailp == NULL);
			assert(tcp->tcp_rcv.tr_len == 0);
		}

		/*
		 * The receive buffer has shrunk, so there may now be space to
		 * receive more data.
		 */
		if (tcp->tcp_pcb != NULL)
			tcpsock_ack_recv(tcp);
	} else
		flags &= ~MSG_WAITALL;	/* for the check below */

	/* Advance the current copy position, and see if we are done. */
	*offp += off;
	if ((flags & MSG_WAITALL) && off < len && tcpsock_may_wait(tcp))
		return SUSPEND;
	else
		return OK;
}
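
/*
 * For illustration only: the MSG_PEEK handling above means that a peek
 * followed by a regular receive returns the same bytes twice, since a peek
 * leaves the receive queue untouched. A minimal userland sketch, assuming
 * a connected socket descriptor 'fd':
 *
 *	char hdr[4];
 *	(void)recv(fd, hdr, sizeof(hdr), MSG_PEEK);	// inspect, keep queued
 *	(void)recv(fd, hdr, sizeof(hdr), 0);		// now consume it
 */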

/*
 * Update the set of flag-type socket options on a TCP socket.
 */
static void
tcpsock_setsockmask(struct sock * sock, unsigned int mask)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	if (tcp->tcp_pcb == NULL)
		return;

	if (mask & SO_REUSEADDR)
		ip_set_option(tcp->tcp_pcb, SOF_REUSEADDR);
	else
		ip_reset_option(tcp->tcp_pcb, SOF_REUSEADDR);

	if (mask & SO_KEEPALIVE)
		ip_set_option(tcp->tcp_pcb, SOF_KEEPALIVE);
	else
		ip_reset_option(tcp->tcp_pcb, SOF_KEEPALIVE);
}
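
/*
 * For illustration only: these flags are toggled from userland through
 * setsockopt(2). A minimal sketch, assuming a socket descriptor 'fd':
 *
 *	int on = 1;
 *	(void)setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
 */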

/*
 * Prepare a helper structure for IP-level option processing.
 */
static void
tcpsock_get_ipopts(struct tcpsock * tcp, struct ipopts * ipopts)
{

	ipopts->local_ip = &tcp->tcp_pcb->local_ip;
	ipopts->remote_ip = &tcp->tcp_pcb->remote_ip;
	ipopts->tos = &tcp->tcp_pcb->tos;
	ipopts->ttl = &tcp->tcp_pcb->ttl;
	ipopts->sndmin = TCP_SNDBUF_MIN;
	ipopts->sndmax = TCP_SNDBUF_MAX;
	ipopts->rcvmin = TCP_RCVBUF_MIN;
	ipopts->rcvmax = TCP_RCVBUF_MAX;
}

/*
 * Set socket options on a TCP socket.
 */
static int
tcpsock_setsockopt(struct sock * sock, int level, int name,
	const struct sockdriver_data * data, socklen_t len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct ipopts ipopts;
	uint32_t uval;
	int r, val;

	if (tcp->tcp_pcb == NULL)
		return ECONNRESET;

	/* Handle TCP-level options. */
	switch (level) {
	case IPPROTO_IPV6:
		switch (name) {
		case IPV6_RECVTCLASS:
			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			/*
			 * This option is not supported for TCP sockets; it
			 * would not even make sense. However, named(8)
			 * insists on trying to set it anyway. We accept the
			 * request but ignore the value, not even returning
			 * what was set through getsockopt(2).
			 */
			return OK;

		case IPV6_FAITH:
			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			/*
			 * This option is not supported at all, but to save
			 * ourselves from having to remember the current state
			 * for getsockopt(2), we also refuse to enable it.
			 */
			if (val != 0)
				return EINVAL;

			return OK;
		}

		break;

	case IPPROTO_TCP:
		switch (name) {
		case TCP_NODELAY:
			/*
			 * lwIP's listening TCP PCBs do not have this field.
			 * If this ever becomes an issue, we can create our own
			 * shadow flag and do the inheritance ourselves.
			 */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			if (val)
				tcp_nagle_disable(tcp->tcp_pcb);
			else
				tcp_nagle_enable(tcp->tcp_pcb);

			return OK;

		case TCP_KEEPIDLE:
		case TCP_KEEPINTVL:
			/*
			 * lwIP's listening TCP PCBs do not have these fields.
			 */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			if (val == 0)
				return EINVAL;

			/*
			 * The given value is unsigned, but lwIP stores the
			 * value in milliseconds in a uint32_t field, so we
			 * have to limit large values to whatever fits in the
			 * field anyway.
			 */
			if (val < 0 || (uint32_t)val > UINT32_MAX / 1000)
				uval = UINT32_MAX;
			else
				uval = (uint32_t)val * 1000;
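
			/*
			 * Worked example of the clamping above: a typical
			 * TCP_KEEPIDLE of 7200 seconds becomes 7200000 ms,
			 * while anything above UINT32_MAX / 1000 (about
			 * 4294967 seconds) saturates at UINT32_MAX ms.
			 */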
			if (name == TCP_KEEPIDLE)
				tcp->tcp_pcb->keep_idle = uval;
			else
				tcp->tcp_pcb->keep_intvl = uval;

			return OK;

		case TCP_KEEPCNT:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			if (val == 0)
				return EINVAL;

			tcp->tcp_pcb->keep_cnt = (uint32_t)val;

			return OK;
		}

		return EOPNOTSUPP;
	}

	/* Handle all other options at the IP level. */
	tcpsock_get_ipopts(tcp, &ipopts);

	return ipsock_setsockopt(tcpsock_get_ipsock(tcp), level, name, data,
	    len, &ipopts);
}

/*
 * Retrieve socket options on a TCP socket.
 */
static int
tcpsock_getsockopt(struct sock * sock, int level, int name,
	const struct sockdriver_data * data, socklen_t * len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct ipopts ipopts;
	int val;

	if (tcp->tcp_pcb == NULL)
		return ECONNRESET;

	/* Handle TCP-level options. */
	switch (level) {
	case IPPROTO_IPV6:
		switch (name) {
		case IPV6_RECVTCLASS:
		case IPV6_FAITH:
			val = 0;

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);
		}

		break;

	case IPPROTO_TCP:
		switch (name) {
		case TCP_NODELAY:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = tcp_nagle_disabled(tcp->tcp_pcb);

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_MAXSEG:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			/* This option is read-only at this time. */
			val = tcp->tcp_pcb->mss;

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_KEEPIDLE:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = (int)(tcp->tcp_pcb->keep_idle / 1000);

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_KEEPINTVL:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = (int)(tcp->tcp_pcb->keep_intvl / 1000);

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_KEEPCNT:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = (int)tcp->tcp_pcb->keep_cnt;

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);
		}

		return EOPNOTSUPP;
	}

	/* Handle all other options at the IP level. */
	tcpsock_get_ipopts(tcp, &ipopts);

	return ipsock_getsockopt(tcpsock_get_ipsock(tcp), level, name, data,
	    len, &ipopts);
}
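
/*
 * For illustration only: a userland round trip through the TCP-level
 * options above, assuming a connected socket descriptor 'fd':
 *
 *	int mss, on = 1;
 *	socklen_t optlen = sizeof(mss);
 *	(void)setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
 *	(void)getsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, &optlen);
 */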

/*
 * Retrieve the local socket address of a TCP socket.
 */
static int
tcpsock_getsockname(struct sock * sock, struct sockaddr * addr,
	socklen_t * addr_len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	if (tcp->tcp_pcb == NULL)
		return EINVAL;

	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port);

	return OK;
}

/*
 * Retrieve the remote socket address of a TCP socket.
 */
static int
tcpsock_getpeername(struct sock * sock, struct sockaddr * addr,
	socklen_t * addr_len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED ||
	    tcp->tcp_pcb->state == LISTEN || tcp->tcp_pcb->state == SYN_SENT)
		return ENOTCONN;

	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port);

	return OK;
}

/*
 * Perform a TCP half-close on a TCP socket. This operation may not complete
 * immediately due to memory conditions, in which case it will be completed at
 * a later time.
 */
static void
tcpsock_send_fin(struct tcpsock * tcp)
{

	sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_WR);

	/*
	 * Attempt to send the FIN. If a fatal error occurs as a result, raise
	 * it as an asynchronous error, because this function's callers cannot
	 * do much with it. That happens to match the way these functions are
	 * used elsewhere. In any case, as a result, the PCB may be closed.
	 * However, we are never called from a situation where the socket is
	 * being closed here, so the socket object will not be freed either.
	 */
	if (tcpsock_pcb_enqueue(tcp)) {
		assert(!sockevent_is_closing(tcpsock_get_sock(tcp)));

		if (tcpsock_may_close(tcp))
			tcpsock_finish_close(tcp);
		else
			(void)tcpsock_pcb_send(tcp, TRUE /*raise_error*/);
	}
}

/*
 * Shut down a TCP socket for reading and/or writing.
 */
static int
tcpsock_shutdown(struct sock * sock, unsigned int mask)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	/*
	 * If the PCB is gone, we want to allow shutdowns for reading but not
	 * writing: shutting down for writing affects the PCB, shutting down
	 * for reading does not. Also, if the PCB is in CLOSED state, we would
	 * not know how to deal with subsequent operations after a shutdown
	 * for writing, so forbid such calls altogether.
	 */
	if ((tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED) &&
	    (mask & SFL_SHUT_WR))
		return ENOTCONN;

	/*
	 * Handle listening sockets as a special case. Shutting down a
	 * listening socket frees its PCB. Sockets pending on the accept queue
	 * may still be accepted, but after that, accept(2) will start
	 * returning ECONNABORTED. This feature allows multi-process server
	 * applications to shut down gracefully, supposedly.
	 */
	if (tcpsock_is_listening(tcp)) {
		if (tcp->tcp_pcb != NULL)
			tcpsock_pcb_close(tcp);

		return OK;
	}

	/*
	 * We control shutdown-for-reading locally, and intentionally do not
	 * tell lwIP about it: if we do that and also shut down for writing,
	 * the PCB may disappear (now or eventually), which is not what we
	 * want. Instead, we only tell lwIP to shut down for reading once we
	 * actually want to get rid of the PCB, using tcp_close(). In the
	 * meantime, if the socket is shut down for reading by the user, we
	 * simply discard received data as fast as we can--one out of a number
	 * of possible design choices there, and (reportedly) the one used by
	 * the BSDs.
	 */
	if (mask & SFL_SHUT_RD)
		(void)tcpsock_clear_recv(tcp, TRUE /*ack_data*/);

	/*
	 * Shutting down for writing a connecting socket simply closes its
	 * PCB. Closing a PCB in SYN_SENT state simply deallocates it, so this
	 * cannot fail. On the other hand, for connected sockets we want to
	 * send a FIN, which may fail due to memory shortage, in which case we
	 * have to try again later.
	 */
	if (mask & SFL_SHUT_WR) {
		if (tcp->tcp_pcb->state == SYN_SENT)
			tcpsock_pcb_close(tcp);
		else if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR))
			tcpsock_send_fin(tcp);
	}

	return OK;
}
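
/*
 * For illustration only: the write-side half-close above is what backs the
 * classic request/response idiom in userland. A minimal sketch, assuming a
 * connected socket descriptor 'fd' and buffers 'req' and 'buf':
 *
 *	ssize_t n;
 *	(void)send(fd, req, req_len, 0);
 *	(void)shutdown(fd, SHUT_WR);		// send FIN; reads stay open
 *	while ((n = recv(fd, buf, sizeof(buf), 0)) > 0)
 *		;				// drain until peer's FIN (EOF)
 */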

/*
 * Close a TCP socket. Complete the operation immediately if possible, or
 * otherwise initiate the closing process and complete it later, notifying
 * libsockevent about that as well. Depending on linger settings, this
 * function may be called twice on the same socket: the first time with the
 * 'force' flag cleared, and the second time with the 'force' flag set.
 */
static int
tcpsock_close(struct sock * sock, int force)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct tcpsock *queued;
	size_t rlen;

	assert(tcp->tcp_listener == NULL);

	/*
	 * If this is a listening socket, abort and clean up any and all
	 * connections on its listener queue. Note that the listening socket
	 * may or may not have a PCB at this point.
	 */
	if (tcpsock_is_listening(tcp)) {
		while (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head)) {
			queued = TAILQ_FIRST(&tcp->tcp_queue.tq_head);

			tcpsock_pcb_abort(queued);

			(void)tcpsock_cleanup(queued, TRUE /*may_free*/);
		}
	}

	/*
	 * Clear the receive queue, and make sure that we no longer add new
	 * data to it. The latter is relevant only for the case that we end up
	 * returning SUSPEND below. Remember whether there were bytes left,
	 * because we should reset the connection if there were.
	 */
	rlen = tcpsock_clear_recv(tcp, FALSE /*ack_data*/);

	sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_RD);

	/*
	 * If the socket is connected, perform a graceful shutdown, unless 1)
	 * we are asked to force-close the socket, or 2) if the local side has
	 * not consumed all data, as per RFC 1122 Sec.4.2.2.13. Normally lwIP
	 * would take care of the second point, but we may have data in our
	 * receive buffer of which lwIP is not aware.
	 *
	 * Implementing proper linger support is somewhat difficult with lwIP.
	 * In particular, we cannot reliably wait for our FIN to be ACK'ed by
	 * the other side in all cases:
	 *
	 * - the lwIP TCP transition from states CLOSING to TIME_WAIT does not
	 *   trigger any event and once in the TIME_WAIT state, the poll event
	 *   no longer triggers either;
	 * - the lwIP TCP transition from states FIN_WAIT_1 and FIN_WAIT_2 to
	 *   TIME_WAIT will trigger a receive event, but it is not clear
	 *   whether we can reliably check that our FIN was ACK'ed from there.
	 *
	 * That means we have to compromise. Instead of the proper approach,
	 * we complete our side of the close operation whenever:
	 *
	 * 1. all of our data was acknowledged, AND,
	 * 2. our FIN was sent, AND,
	 * 3a. our FIN was acknowledged, OR,
	 * 3b. we received a FIN from the other side.
	 *
	 * With the addition of rule 3b, we do not run into the above
	 * reliability problems, but we may return from SO_LINGER-blocked
	 * close calls too early and thus give callers a false impression of
	 * success. TODO: if lwIP ever gets improved on this point, the code
	 * in this module should be rewritten to make use of the improvements.
	 *
	 * The set of rules is basically the same as for closing the PCB early
	 * as per tcpsock_may_close(), except with the check for our FIN being
	 * acknowledged. Unfortunately only the FIN_WAIT_2, TIME_WAIT, and
	 * (reentered) CLOSED TCP states guarantee that there are no
	 * unacknowledged data segments anymore, so we may have to wait for
	 * reaching any one of these before we can actually finish closing the
	 * socket with tcp_close().
	 *
	 * In addition, lwIP does not tell us when our FIN gets acknowledged,
	 * so we have to use polling and direct access to lwIP's PCB fields
	 * instead, just like lwIP's BSD API does. There is no other way.
	 * Also, we may not even be able to send the FIN right away, in which
	 * case we must defer that until later.
	 */
	if (tcp->tcp_pcb != NULL) {
		switch (tcp->tcp_pcb->state) {
		case CLOSE_WAIT:
		case CLOSING:
		case LAST_ACK:
			assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);

			/* FALLTHROUGH */
		case SYN_RCVD:
		case ESTABLISHED:
		case FIN_WAIT_1:
			/* First check if we should abort the connection. */
			if (force || rlen > 0)
				break;

			/*
			 * If we have not sent a FIN yet, try sending it now;
			 * if all other conditions are met for closing the
			 * socket, successful FIN transmission will complete
			 * the close. Otherwise, perform the close check
			 * explicitly.
			 */
			if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR))
				tcpsock_send_fin(tcp);
			else if (tcpsock_may_close(tcp))
				tcpsock_pcb_close(tcp);

			/*
			 * If at this point the PCB is gone, we managed to
			 * close the connection immediately, and the socket
			 * has already been cleaned up by now. This may occur
			 * if there is no unacknowledged data and we already
			 * received a FIN earlier on.
			 */
			if (tcp->tcp_pcb == NULL)
				return OK;

			/*
			 * Complete the close operation at a later time.
			 * Adjust the polling interval, so that we can detect
			 * completion of the close as quickly as possible.
			 */
			tcp_poll(tcp->tcp_pcb, tcpsock_event_poll,
			    TCP_POLL_CLOSE_INTERVAL);

			return SUSPEND;

		default:
			/*
			 * The connection is either not yet established, or
			 * already in a state where we can close it right now.
			 */
			tcpsock_pcb_close(tcp);
		}
	}

	/*
	 * Abort the connection if the PCB is still around, and clean up the
	 * socket. We cannot let tcpsock_cleanup() free the socket object yet,
	 * because we are still in the callback from libsockevent, and the
	 * latter cannot handle the socket object being freed from here.
	 */
	if (tcp->tcp_pcb != NULL)
		tcpsock_pcb_abort(tcp);

	(void)tcpsock_cleanup(tcp, FALSE /*may_free*/);

	return OK;
}

/*
 * Free up a closed TCP socket.
 */
static void
tcpsock_free(struct sock * sock)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	assert(tcp->tcp_pcb == NULL);
	assert(tcp->tcp_snd.ts_len == 0);
	assert(tcp->tcp_snd.ts_head == NULL);
	assert(tcp->tcp_rcv.tr_len == 0);
	assert(tcp->tcp_rcv.tr_head == NULL);

	TAILQ_INSERT_HEAD(&tcp_freelist, tcp, tcp_queue.tq_next);
}
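
/*
 * Illustrative sketch (not compiled): freed sockets are pushed onto the head
 * of tcp_freelist above, so the allocation side--implemented earlier in this
 * module--can take them off the head again using the standard <sys/queue.h>
 * TAILQ macros, roughly as follows.  The helper name is hypothetical.
 */
#if 0
static struct tcpsock *
example_tcpsock_alloc(void)
{
	struct tcpsock *tcp;

	if ((tcp = TAILQ_FIRST(&tcp_freelist)) == NULL)
		return NULL;		/* no free sockets left */

	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);

	return tcp;
}
#endif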

/* This table maps TCP states from lwIP numbers to NetBSD numbers. */
static const struct {
	int tsm_tstate;
	int tsm_sostate;
} tcpsock_statemap[] = {
	[CLOSED]	= { TCPS_CLOSED,	SS_ISDISCONNECTED },
	[LISTEN]	= { TCPS_LISTEN,	0 },
	[SYN_SENT]	= { TCPS_SYN_SENT,	SS_ISCONNECTING },
	[SYN_RCVD]	= { TCPS_SYN_RECEIVED,	SS_ISCONNECTING },
	[ESTABLISHED]	= { TCPS_ESTABLISHED,	SS_ISCONNECTED },
	[FIN_WAIT_1]	= { TCPS_FIN_WAIT_1,	SS_ISDISCONNECTING },
	[FIN_WAIT_2]	= { TCPS_FIN_WAIT_2,	SS_ISDISCONNECTING },
	[CLOSE_WAIT]	= { TCPS_CLOSE_WAIT,	SS_ISCONNECTED },
	[CLOSING]	= { TCPS_CLOSING,	SS_ISDISCONNECTING },
	[LAST_ACK]	= { TCPS_LAST_ACK,	SS_ISDISCONNECTING },
	[TIME_WAIT]	= { TCPS_TIME_WAIT,	SS_ISDISCONNECTED },
};
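
/*
 * Illustrative sketch (not compiled): a bounds-checked lookup of the state
 * map.  tcpsock_get_info() below performs this lookup inline; the helper
 * name and its fallback value are hypothetical.
 */
#if 0
static int
example_tstate_for(enum tcp_state state)
{

	if ((unsigned int)state < __arraycount(tcpsock_statemap))
		return tcpsock_statemap[state].tsm_tstate;

	return TCPS_CLOSED;	/* treat unknown states as closed */
}
#endif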

/*
 * Fill the given kinfo_pcb sysctl(7) structure with information about the TCP
 * PCB identified by the given pointer.
 */
static void
tcpsock_get_info(struct kinfo_pcb * ki, const void * ptr)
{
	const struct tcp_pcb *pcb = (const struct tcp_pcb *)ptr;
	struct tcpsock *tcp;

	/*
	 * Not all TCP PCBs have an associated tcpsock structure.  We take
	 * care to clear the callback argument of PCBs on any of the TCP
	 * lists, so we can use that callback argument to determine whether
	 * there is an associated tcpsock structure.  There is one exception:
	 * PCBs for incoming connections that have not yet been fully
	 * established (i.e., still in SYN_RCVD state).  These carry the
	 * callback argument of their listening socket (which itself may
	 * already have been deallocated at this point), but must not be
	 * considered associated with the listening socket's tcpsock
	 * structure.
	 */
	if (pcb->callback_arg != NULL && pcb->state != SYN_RCVD) {
		tcp = (struct tcpsock *)pcb->callback_arg;
		assert(tcp >= tcp_array &&
		    tcp < &tcp_array[__arraycount(tcp_array)]);

		/* TODO: change this so that sockstat(1) may work one day. */
		ki->ki_sockaddr = (uint64_t)(uintptr_t)tcpsock_get_sock(tcp);
	} else {
		/* No tcpsock.  Could also be in TIME_WAIT state etc. */
		tcp = NULL;

		ki->ki_sostate = SS_NOFDREF;
	}

	ki->ki_type = SOCK_STREAM;

	if ((unsigned int)pcb->state < __arraycount(tcpsock_statemap)) {
		ki->ki_tstate = tcpsock_statemap[pcb->state].tsm_tstate;
		/* TODO: this needs work, but does anything rely on it? */
		ki->ki_sostate |= tcpsock_statemap[pcb->state].tsm_sostate;
	}

	/* Careful with the LISTEN state here (see below). */
	ipsock_get_info(ki, &pcb->local_ip, pcb->local_port,
	    &pcb->remote_ip, (pcb->state != LISTEN) ? pcb->remote_port : 0);

	/*
	 * The PCBs for listening sockets are actually smaller.  Thus, for
	 * listening sockets, do not attempt to access any of the fields
	 * beyond those provided in the smaller structure.
	 */
	if (pcb->state == LISTEN) {
		assert(tcp != NULL);
		ki->ki_refs =
		    (uint64_t)(uintptr_t)TAILQ_FIRST(&tcp->tcp_queue.tq_head);
	} else {
		if (tcp_nagle_disabled(pcb))
			ki->ki_tflags |= NETBSD_TF_NODELAY;

		if (tcp != NULL) {
			ki->ki_rcvq = tcp->tcp_rcv.tr_len;
			ki->ki_sndq = tcp->tcp_snd.ts_len;

			if (tcp->tcp_listener != NULL)
				ki->ki_nextref = (uint64_t)(uintptr_t)
				    TAILQ_NEXT(tcp, tcp_queue.tq_next);
		}
	}
}

/*
 * Given either NULL or a previously returned TCP PCB pointer, return the
 * first or next TCP PCB pointer, or NULL if there are no more.  The current
 * implementation supports only one iteration at a time.
 */
static const void *
tcpsock_enum(const void * last)
{
	static struct {
		unsigned int i;
		const struct tcp_pcb *pcb;
	} iter;

	if (last != NULL && (iter.pcb = iter.pcb->next) != NULL)
		return (const void *)iter.pcb;

	for (iter.i = (last != NULL) ? iter.i + 1 : 0;
	    iter.i < __arraycount(tcp_pcb_lists); iter.i++) {
		if ((iter.pcb = *tcp_pcb_lists[iter.i]) != NULL)
			return (const void *)iter.pcb;
	}

	return NULL;
}
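
/*
 * Illustrative sketch (not compiled): walking all TCP PCBs with the cursor
 * iterator above, roughly the way util_pcblist() drives the enumeration.
 * Because the iterator keeps its position in static storage, at most one
 * such walk may be in progress at any time.  The helper name is
 * hypothetical.
 */
#if 0
static void
example_walk_tcp_pcbs(void)
{
	struct kinfo_pcb ki;
	const void *pcb;

	for (pcb = tcpsock_enum(NULL); pcb != NULL;
	    pcb = tcpsock_enum(pcb)) {
		/* Zero the record first; get_info fills only some fields. */
		memset(&ki, 0, sizeof(ki));

		tcpsock_get_info(&ki, pcb);
	}
}
#endif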

/*
 * Obtain the list of TCP protocol control blocks, for sysctl(7).
 */
static ssize_t
tcpsock_pcblist(struct rmib_call * call, struct rmib_node * node __unused,
	struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
{

	return util_pcblist(call, oldp, tcpsock_enum, tcpsock_get_info);
}

static const struct sockevent_ops tcpsock_ops = {
	.sop_bind		= tcpsock_bind,
	.sop_listen		= tcpsock_listen,
	.sop_connect		= tcpsock_connect,
	.sop_accept		= tcpsock_accept,
	.sop_test_accept	= tcpsock_test_accept,
	.sop_pre_send		= tcpsock_pre_send,
	.sop_send		= tcpsock_send,
	.sop_test_send		= tcpsock_test_send,
	.sop_pre_recv		= tcpsock_pre_recv,
	.sop_recv		= tcpsock_recv,
	.sop_test_recv		= tcpsock_test_recv,
	.sop_ioctl		= ifconf_ioctl,
	.sop_setsockmask	= tcpsock_setsockmask,
	.sop_setsockopt		= tcpsock_setsockopt,
	.sop_getsockopt		= tcpsock_getsockopt,
	.sop_getsockname	= tcpsock_getsockname,
	.sop_getpeername	= tcpsock_getpeername,
	.sop_shutdown		= tcpsock_shutdown,
	.sop_close		= tcpsock_close,
	.sop_free		= tcpsock_free
};