xref: /openbsd-src/sys/netinet/ip_divert.c (revision 92546b594805d4fd45df4c3157e04b6496e71647)
1 /*      $OpenBSD: ip_divert.c,v 1.99 2025/01/23 12:51:51 bluhm Exp $ */
2 
3 /*
4  * Copyright (c) 2009 Michele Marchetto <michele@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>
20 #include <sys/systm.h>
21 #include <sys/mbuf.h>
22 #include <sys/protosw.h>
23 #include <sys/socket.h>
24 #include <sys/socketvar.h>
25 #include <sys/sysctl.h>
26 
27 #include <net/if.h>
28 #include <net/route.h>
29 #include <net/if_var.h>
30 #include <net/netisr.h>
31 
32 #include <netinet/in.h>
33 #include <netinet/in_var.h>
34 #include <netinet/ip.h>
35 #include <netinet/ip_var.h>
36 #include <netinet/in_pcb.h>
37 #include <netinet/ip_divert.h>
38 #include <netinet/tcp.h>
39 #include <netinet/udp.h>
40 #include <netinet/ip_icmp.h>
41 
42 #include <net/pfvar.h>
43 
44 /*
45  * Locks used to protect data:
46  *	a	atomic
47  */
48 
49 struct	inpcbtable	divbtable;
50 struct	cpumem		*divcounters;
51 
52 #ifndef DIVERT_SENDSPACE
53 #define DIVERT_SENDSPACE	(65536 + 100)
54 #endif
55 u_int   divert_sendspace = DIVERT_SENDSPACE;	/* [a] */
56 #ifndef DIVERT_RECVSPACE
57 #define DIVERT_RECVSPACE	(65536 + 100)
58 #endif
59 u_int   divert_recvspace = DIVERT_RECVSPACE;	/* [a] */
60 
61 #ifndef DIVERTHASHSIZE
62 #define DIVERTHASHSIZE	128
63 #endif
64 
65 const struct sysctl_bounded_args divertctl_vars[] = {
66 	{ DIVERTCTL_RECVSPACE, &divert_recvspace, 0, INT_MAX },
67 	{ DIVERTCTL_SENDSPACE, &divert_sendspace, 0, INT_MAX },
68 };
69 
70 const struct pr_usrreqs divert_usrreqs = {
71 	.pru_attach	= divert_attach,
72 	.pru_detach	= divert_detach,
73 	.pru_bind	= divert_bind,
74 	.pru_shutdown	= divert_shutdown,
75 	.pru_send	= divert_send,
76 	.pru_control	= in_control,
77 	.pru_sockaddr	= in_sockaddr,
78 	.pru_peeraddr	= in_peeraddr,
79 };
80 
81 int divbhashsize = DIVERTHASHSIZE;
82 
83 int	divert_output(struct inpcb *, struct mbuf *, struct mbuf *,
84 	    struct mbuf *);
85 void
86 divert_init(void)
87 {
88 	in_pcbinit(&divbtable, divbhashsize);
89 	divcounters = counters_alloc(divs_ncounters);
90 }
91 
92 int
93 divert_output(struct inpcb *inp, struct mbuf *m, struct mbuf *nam,
94     struct mbuf *control)
95 {
96 	struct sockaddr_in *sin;
97 	int error, min_hdrlen, off, dir;
98 	struct ip *ip;
99 
100 	m_freem(control);
101 
102 	if ((error = in_nam2sin(nam, &sin)))
103 		goto fail;
104 
105 	if (m->m_pkthdr.len > IP_MAXPACKET) {
106 		error = EMSGSIZE;
107 		goto fail;
108 	}
109 
110 	m = rip_chkhdr(m, NULL);
111 	if (m == NULL) {
112 		error = EINVAL;
113 		goto fail;
114 	}
115 
116 	ip = mtod(m, struct ip *);
117 	off = ip->ip_hl << 2;
118 
119 	dir = (sin->sin_addr.s_addr == INADDR_ANY ? PF_OUT : PF_IN);
120 
121 	switch (ip->ip_p) {
122 	case IPPROTO_TCP:
123 		min_hdrlen = sizeof(struct tcphdr);
124 		m->m_pkthdr.csum_flags |= M_TCP_CSUM_OUT;
125 		break;
126 	case IPPROTO_UDP:
127 		min_hdrlen = sizeof(struct udphdr);
128 		m->m_pkthdr.csum_flags |= M_UDP_CSUM_OUT;
129 		break;
130 	case IPPROTO_ICMP:
131 		min_hdrlen = ICMP_MINLEN;
132 		m->m_pkthdr.csum_flags |= M_ICMP_CSUM_OUT;
133 		break;
134 	default:
135 		min_hdrlen = 0;
136 		break;
137 	}
138 	if (min_hdrlen && m->m_pkthdr.len < off + min_hdrlen) {
139 		error = EINVAL;
140 		goto fail;
141 	}
142 
143 	m->m_pkthdr.pf.flags |= PF_TAG_DIVERTED_PACKET;
144 
145 	if (dir == PF_IN) {
146 		struct rtentry *rt;
147 		struct ifnet *ifp;
148 
149 		rt = rtalloc(sintosa(sin), 0, inp->inp_rtableid);
150 		if (!rtisvalid(rt) || !ISSET(rt->rt_flags, RTF_LOCAL)) {
151 			rtfree(rt);
152 			error = EADDRNOTAVAIL;
153 			goto fail;
154 		}
155 		m->m_pkthdr.ph_ifidx = rt->rt_ifidx;
156 		rtfree(rt);
157 
158 		/*
159 		 * Recalculate IP and protocol checksums for the inbound packet
160 		 * since the userspace application may have modified the packet
161 		 * prior to reinjection.
162 		 */
163 		in_hdr_cksum_out(m, NULL);
164 		in_proto_cksum_out(m, NULL);
165 
166 		ifp = if_get(m->m_pkthdr.ph_ifidx);
167 		if (ifp == NULL) {
168 			error = ENETDOWN;
169 			goto fail;
170 		}
171 		ipv4_input(ifp, m);
172 		if_put(ifp);
173 	} else {
174 		m->m_pkthdr.ph_rtableid = inp->inp_rtableid;
175 
176 		error = ip_output(m, NULL, &inp->inp_route,
177 		    IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL, 0);
178 	}
179 
180 	divstat_inc(divs_opackets);
181 	return (error);
182 
183 fail:
184 	m_freem(m);
185 	divstat_inc(divs_errors);
186 	return (error);
187 }
188 
189 void
190 divert_packet(struct mbuf *m, int dir, u_int16_t divert_port)
191 {
192 	struct inpcb *inp = NULL;
193 	struct socket *so;
194 	struct sockaddr_in sin;
195 
196 	divstat_inc(divs_ipackets);
197 
198 	if (m->m_len < sizeof(struct ip) &&
199 	    (m = m_pullup(m, sizeof(struct ip))) == NULL) {
200 		divstat_inc(divs_errors);
201 		goto bad;
202 	}
203 
204 	mtx_enter(&divbtable.inpt_mtx);
205 	TAILQ_FOREACH(inp, &divbtable.inpt_queue, inp_queue) {
206 		if (inp->inp_lport != divert_port)
207 			continue;
208 		in_pcbref(inp);
209 		break;
210 	}
211 	mtx_leave(&divbtable.inpt_mtx);
212 	if (inp == NULL) {
213 		divstat_inc(divs_noport);
214 		goto bad;
215 	}
216 
217 	memset(&sin, 0, sizeof(sin));
218 	sin.sin_family = AF_INET;
219 	sin.sin_len = sizeof(sin);
220 
221 	if (dir == PF_IN) {
222 		struct ifaddr *ifa;
223 		struct ifnet *ifp;
224 
225 		ifp = if_get(m->m_pkthdr.ph_ifidx);
226 		if (ifp == NULL) {
227 			divstat_inc(divs_errors);
228 			goto bad;
229 		}
230 		TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
231 			if (ifa->ifa_addr->sa_family != AF_INET)
232 				continue;
233 			sin.sin_addr = satosin(ifa->ifa_addr)->sin_addr;
234 			break;
235 		}
236 		if_put(ifp);
237 	} else {
238 		/*
239 		 * Calculate IP and protocol checksums for outbound packet
240 		 * diverted to userland.  pf rule diverts before cksum offload.
241 		 */
242 		in_hdr_cksum_out(m, NULL);
243 		in_proto_cksum_out(m, NULL);
244 	}
245 
246 	so = inp->inp_socket;
247 	mtx_enter(&so->so_rcv.sb_mtx);
248 	if (sbappendaddr(so, &so->so_rcv, sintosa(&sin), m, NULL) == 0) {
249 		mtx_leave(&so->so_rcv.sb_mtx);
250 		divstat_inc(divs_fullsock);
251 		goto bad;
252 	}
253 	mtx_leave(&so->so_rcv.sb_mtx);
254 	sorwakeup(so);
255 
256 	in_pcbunref(inp);
257 	return;
258 
259  bad:
260 	if (inp != NULL)
261 		in_pcbunref(inp);
262 	m_freem(m);
263 }
264 
265 int
266 divert_attach(struct socket *so, int proto, int wait)
267 {
268 	int error;
269 
270 	if (so->so_pcb != NULL)
271 		return EINVAL;
272 	if ((so->so_state & SS_PRIV) == 0)
273 		return EACCES;
274 
275 	error = soreserve(so, atomic_load_int(&divert_sendspace),
276 	    atomic_load_int(&divert_recvspace));
277 	if (error)
278 		return error;
279 	error = in_pcballoc(so, &divbtable, wait);
280 	if (error)
281 		return error;
282 
283 	sotoinpcb(so)->inp_flags |= INP_HDRINCL;
284 	return (0);
285 }
286 
287 int
288 divert_detach(struct socket *so)
289 {
290 	struct inpcb *inp = sotoinpcb(so);
291 
292 	soassertlocked(so);
293 
294 	if (inp == NULL)
295 		return (EINVAL);
296 
297 	in_pcbdetach(inp);
298 	return (0);
299 }
300 
/*
 * Bind a divert socket by passing its PCB, the address mbuf and the
 * calling process to in_pcbbind().  The socket must be locked.
 */
int
divert_bind(struct socket *so, struct mbuf *addr, struct proc *p)
{
	soassertlocked(so);

	return (in_pcbbind(sotoinpcb(so), addr, p));
}
309 
/*
 * Shut down the sending side of a divert socket via socantsendmore().
 * The socket must be locked.  Always returns 0.
 */
int
divert_shutdown(struct socket *so)
{
	soassertlocked(so);

	socantsendmore(so);

	return 0;
}
317 
/*
 * Send handler for divert sockets: forwards the data, address and
 * control mbufs to divert_output() together with the socket's PCB.
 * The socket must be locked.
 */
int
divert_send(struct socket *so, struct mbuf *m, struct mbuf *addr,
    struct mbuf *control)
{
	soassertlocked(so);

	return divert_output(sotoinpcb(so), m, addr, control);
}
327 
328 int
329 divert_sysctl_divstat(void *oldp, size_t *oldlenp, void *newp)
330 {
331 	uint64_t counters[divs_ncounters];
332 	struct divstat divstat;
333 	u_long *words = (u_long *)&divstat;
334 	int i;
335 
336 	CTASSERT(sizeof(divstat) == (nitems(counters) * sizeof(u_long)));
337 	memset(&divstat, 0, sizeof divstat);
338 	counters_read(divcounters, counters, nitems(counters), NULL);
339 
340 	for (i = 0; i < nitems(counters); i++)
341 		words[i] = (u_long)counters[i];
342 
343 	return (sysctl_rdstruct(oldp, oldlenp, newp,
344 	    &divstat, sizeof(divstat)));
345 }
346 
347 /*
348  * Sysctl for divert variables.
349  */
350 int
351 divert_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
352     size_t newlen)
353 {
354 	/* All sysctl names at this level are terminal. */
355 	if (namelen != 1)
356 		return (ENOTDIR);
357 
358 	switch (name[0]) {
359 	case DIVERTCTL_STATS:
360 		return (divert_sysctl_divstat(oldp, oldlenp, newp));
361 	default:
362 		return (sysctl_bounded_arr(divertctl_vars,
363 		    nitems(divertctl_vars), name, namelen, oldp, oldlenp,
364 		    newp, newlen));
365 	}
366 	/* NOTREACHED */
367 }
368