1 /* $OpenBSD: ip_divert.c,v 1.99 2025/01/23 12:51:51 bluhm Exp $ */ 2 3 /* 4 * Copyright (c) 2009 Michele Marchetto <michele@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/param.h> 20 #include <sys/systm.h> 21 #include <sys/mbuf.h> 22 #include <sys/protosw.h> 23 #include <sys/socket.h> 24 #include <sys/socketvar.h> 25 #include <sys/sysctl.h> 26 27 #include <net/if.h> 28 #include <net/route.h> 29 #include <net/if_var.h> 30 #include <net/netisr.h> 31 32 #include <netinet/in.h> 33 #include <netinet/in_var.h> 34 #include <netinet/ip.h> 35 #include <netinet/ip_var.h> 36 #include <netinet/in_pcb.h> 37 #include <netinet/ip_divert.h> 38 #include <netinet/tcp.h> 39 #include <netinet/udp.h> 40 #include <netinet/ip_icmp.h> 41 42 #include <net/pfvar.h> 43 44 /* 45 * Locks used to protect data: 46 * a atomic 47 */ 48 49 struct inpcbtable divbtable; 50 struct cpumem *divcounters; 51 52 #ifndef DIVERT_SENDSPACE 53 #define DIVERT_SENDSPACE (65536 + 100) 54 #endif 55 u_int divert_sendspace = DIVERT_SENDSPACE; /* [a] */ 56 #ifndef DIVERT_RECVSPACE 57 #define DIVERT_RECVSPACE (65536 + 100) 58 #endif 59 u_int divert_recvspace = DIVERT_RECVSPACE; /* [a] */ 60 61 #ifndef DIVERTHASHSIZE 62 #define DIVERTHASHSIZE 128 63 #endif 64 65 const struct sysctl_bounded_args divertctl_vars[] = { 66 { DIVERTCTL_RECVSPACE, &divert_recvspace, 0, INT_MAX }, 67 { DIVERTCTL_SENDSPACE, &divert_sendspace, 0, INT_MAX }, 68 }; 69 70 const struct pr_usrreqs divert_usrreqs = { 71 .pru_attach = divert_attach, 72 .pru_detach = divert_detach, 73 .pru_bind = divert_bind, 74 .pru_shutdown = divert_shutdown, 75 .pru_send = divert_send, 76 .pru_control = in_control, 77 .pru_sockaddr = in_sockaddr, 78 .pru_peeraddr = in_peeraddr, 79 }; 80 81 int divbhashsize = DIVERTHASHSIZE; 82 83 int divert_output(struct inpcb *, struct mbuf *, struct mbuf *, 84 struct mbuf *); 85 void 86 divert_init(void) 87 { 88 in_pcbinit(&divbtable, divbhashsize); 89 divcounters = counters_alloc(divs_ncounters); 90 } 91 92 int 93 divert_output(struct inpcb *inp, struct mbuf *m, struct mbuf *nam, 94 struct mbuf *control) 95 { 96 struct sockaddr_in *sin; 97 int error, min_hdrlen, off, dir; 98 struct ip *ip; 99 100 m_freem(control); 101 102 if ((error = in_nam2sin(nam, &sin))) 103 goto fail; 104 105 if (m->m_pkthdr.len > IP_MAXPACKET) { 106 error = EMSGSIZE; 107 goto fail; 108 } 109 110 m = rip_chkhdr(m, NULL); 111 if (m == NULL) { 112 error = EINVAL; 113 goto fail; 114 } 115 116 ip = mtod(m, struct ip *); 117 off = ip->ip_hl << 2; 118 119 dir = (sin->sin_addr.s_addr == INADDR_ANY ? PF_OUT : PF_IN); 120 121 switch (ip->ip_p) { 122 case IPPROTO_TCP: 123 min_hdrlen = sizeof(struct tcphdr); 124 m->m_pkthdr.csum_flags |= M_TCP_CSUM_OUT; 125 break; 126 case IPPROTO_UDP: 127 min_hdrlen = sizeof(struct udphdr); 128 m->m_pkthdr.csum_flags |= M_UDP_CSUM_OUT; 129 break; 130 case IPPROTO_ICMP: 131 min_hdrlen = ICMP_MINLEN; 132 m->m_pkthdr.csum_flags |= M_ICMP_CSUM_OUT; 133 break; 134 default: 135 min_hdrlen = 0; 136 break; 137 } 138 if (min_hdrlen && m->m_pkthdr.len < off + min_hdrlen) { 139 error = EINVAL; 140 goto fail; 141 } 142 143 m->m_pkthdr.pf.flags |= PF_TAG_DIVERTED_PACKET; 144 145 if (dir == PF_IN) { 146 struct rtentry *rt; 147 struct ifnet *ifp; 148 149 rt = rtalloc(sintosa(sin), 0, inp->inp_rtableid); 150 if (!rtisvalid(rt) || !ISSET(rt->rt_flags, RTF_LOCAL)) { 151 rtfree(rt); 152 error = EADDRNOTAVAIL; 153 goto fail; 154 } 155 m->m_pkthdr.ph_ifidx = rt->rt_ifidx; 156 rtfree(rt); 157 158 /* 159 * Recalculate IP and protocol checksums for the inbound packet 160 * since the userspace application may have modified the packet 161 * prior to reinjection. 162 */ 163 in_hdr_cksum_out(m, NULL); 164 in_proto_cksum_out(m, NULL); 165 166 ifp = if_get(m->m_pkthdr.ph_ifidx); 167 if (ifp == NULL) { 168 error = ENETDOWN; 169 goto fail; 170 } 171 ipv4_input(ifp, m); 172 if_put(ifp); 173 } else { 174 m->m_pkthdr.ph_rtableid = inp->inp_rtableid; 175 176 error = ip_output(m, NULL, &inp->inp_route, 177 IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL, 0); 178 } 179 180 divstat_inc(divs_opackets); 181 return (error); 182 183 fail: 184 m_freem(m); 185 divstat_inc(divs_errors); 186 return (error); 187 } 188 189 void 190 divert_packet(struct mbuf *m, int dir, u_int16_t divert_port) 191 { 192 struct inpcb *inp = NULL; 193 struct socket *so; 194 struct sockaddr_in sin; 195 196 divstat_inc(divs_ipackets); 197 198 if (m->m_len < sizeof(struct ip) && 199 (m = m_pullup(m, sizeof(struct ip))) == NULL) { 200 divstat_inc(divs_errors); 201 goto bad; 202 } 203 204 mtx_enter(&divbtable.inpt_mtx); 205 TAILQ_FOREACH(inp, &divbtable.inpt_queue, inp_queue) { 206 if (inp->inp_lport != divert_port) 207 continue; 208 in_pcbref(inp); 209 break; 210 } 211 mtx_leave(&divbtable.inpt_mtx); 212 if (inp == NULL) { 213 divstat_inc(divs_noport); 214 goto bad; 215 } 216 217 memset(&sin, 0, sizeof(sin)); 218 sin.sin_family = AF_INET; 219 sin.sin_len = sizeof(sin); 220 221 if (dir == PF_IN) { 222 struct ifaddr *ifa; 223 struct ifnet *ifp; 224 225 ifp = if_get(m->m_pkthdr.ph_ifidx); 226 if (ifp == NULL) { 227 divstat_inc(divs_errors); 228 goto bad; 229 } 230 TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { 231 if (ifa->ifa_addr->sa_family != AF_INET) 232 continue; 233 sin.sin_addr = satosin(ifa->ifa_addr)->sin_addr; 234 break; 235 } 236 if_put(ifp); 237 } else { 238 /* 239 * Calculate IP and protocol checksums for outbound packet 240 * diverted to userland. pf rule diverts before cksum offload. 241 */ 242 in_hdr_cksum_out(m, NULL); 243 in_proto_cksum_out(m, NULL); 244 } 245 246 so = inp->inp_socket; 247 mtx_enter(&so->so_rcv.sb_mtx); 248 if (sbappendaddr(so, &so->so_rcv, sintosa(&sin), m, NULL) == 0) { 249 mtx_leave(&so->so_rcv.sb_mtx); 250 divstat_inc(divs_fullsock); 251 goto bad; 252 } 253 mtx_leave(&so->so_rcv.sb_mtx); 254 sorwakeup(so); 255 256 in_pcbunref(inp); 257 return; 258 259 bad: 260 if (inp != NULL) 261 in_pcbunref(inp); 262 m_freem(m); 263 } 264 265 int 266 divert_attach(struct socket *so, int proto, int wait) 267 { 268 int error; 269 270 if (so->so_pcb != NULL) 271 return EINVAL; 272 if ((so->so_state & SS_PRIV) == 0) 273 return EACCES; 274 275 error = soreserve(so, atomic_load_int(&divert_sendspace), 276 atomic_load_int(&divert_recvspace)); 277 if (error) 278 return error; 279 error = in_pcballoc(so, &divbtable, wait); 280 if (error) 281 return error; 282 283 sotoinpcb(so)->inp_flags |= INP_HDRINCL; 284 return (0); 285 } 286 287 int 288 divert_detach(struct socket *so) 289 { 290 struct inpcb *inp = sotoinpcb(so); 291 292 soassertlocked(so); 293 294 if (inp == NULL) 295 return (EINVAL); 296 297 in_pcbdetach(inp); 298 return (0); 299 } 300 301 int 302 divert_bind(struct socket *so, struct mbuf *addr, struct proc *p) 303 { 304 struct inpcb *inp = sotoinpcb(so); 305 306 soassertlocked(so); 307 return in_pcbbind(inp, addr, p); 308 } 309 310 int 311 divert_shutdown(struct socket *so) 312 { 313 soassertlocked(so); 314 socantsendmore(so); 315 return (0); 316 } 317 318 int 319 divert_send(struct socket *so, struct mbuf *m, struct mbuf *addr, 320 struct mbuf *control) 321 { 322 struct inpcb *inp = sotoinpcb(so); 323 324 soassertlocked(so); 325 return (divert_output(inp, m, addr, control)); 326 } 327 328 int 329 divert_sysctl_divstat(void *oldp, size_t *oldlenp, void *newp) 330 { 331 uint64_t counters[divs_ncounters]; 332 struct divstat divstat; 333 u_long *words = (u_long *)&divstat; 334 int i; 335 336 CTASSERT(sizeof(divstat) == (nitems(counters) * sizeof(u_long))); 337 memset(&divstat, 0, sizeof divstat); 338 counters_read(divcounters, counters, nitems(counters), NULL); 339 340 for (i = 0; i < nitems(counters); i++) 341 words[i] = (u_long)counters[i]; 342 343 return (sysctl_rdstruct(oldp, oldlenp, newp, 344 &divstat, sizeof(divstat))); 345 } 346 347 /* 348 * Sysctl for divert variables. 349 */ 350 int 351 divert_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, 352 size_t newlen) 353 { 354 /* All sysctl names at this level are terminal. */ 355 if (namelen != 1) 356 return (ENOTDIR); 357 358 switch (name[0]) { 359 case DIVERTCTL_STATS: 360 return (divert_sysctl_divstat(oldp, oldlenp, newp)); 361 default: 362 return (sysctl_bounded_arr(divertctl_vars, 363 nitems(divertctl_vars), name, namelen, oldp, oldlenp, 364 newp, newlen)); 365 } 366 /* NOTREACHED */ 367 } 368