/* $OpenBSD: raw_ip.c,v 1.163 2025/01/01 13:44:22 bluhm Exp $ */
/* $NetBSD: raw_ip.c,v 1.25 1996/02/18 18:58:33 christos Exp $ */

/*
 * Copyright (c) 1982, 1986, 1988, 1993
 * The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *    This product includes software developed by the University of
 *    California, Berkeley and its contributors.
 *    This product includes software developed at the Information
 *    Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/socketvar.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_mroute.h>
#include <netinet/ip_var.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_icmp.h>

#include <net/pfvar.h>

#include "pf.h"

/*
 * PCB table holding the raw IP sockets of this file.  Its inpt_mtx
 * mutex is taken around updates of a PCB's local/foreign address and
 * while the table is walked on input.
 */
struct inpcbtable rawcbtable;

/*
 * Nominal space allocated to a raw ip socket.
 */
#define	RIPSNDQ		8192
#define	RIPRCVQ		8192

/*
 * Raw interface to IP protocol.
 */

/*
 * User request dispatch table for raw IP sockets.  The rip_* handlers
 * live in this file; control, sockaddr and peeraddr requests are served
 * by the generic in_* routines.
 */
const struct pr_usrreqs rip_usrreqs = {
	.pru_attach	= rip_attach,
	.pru_detach	= rip_detach,
	.pru_bind	= rip_bind,
	.pru_connect	= rip_connect,
	.pru_disconnect	= rip_disconnect,
	.pru_shutdown	= rip_shutdown,
	.pru_send	= rip_send,
	.pru_control	= in_control,
	.pru_sockaddr	= in_sockaddr,
	.pru_peeraddr	= in_peeraddr,
};

/* Queue one received packet on a single raw socket's receive buffer. */
void	rip_sbappend(struct inpcb *, struct mbuf *, struct ip *,
	    struct sockaddr_in *);

/*
 * Initialize raw connection block q.
 */
void
rip_init(void)
{
	/* Set up the table that every raw IP PCB is allocated from. */
	in_pcbinit(&rawcbtable, 1);
}

/*
 * Deliver an incoming IPv4 packet to every matching raw socket.
 *
 * mp points at the packet, whose data starts with the IP header.
 * offp and proto are not used here; af must be AF_INET (asserted).
 *
 * A PCB matches when it is still able to receive, lives in the same
 * rtable_l2() domain as the packet and, for each of protocol, local
 * address and foreign address that is non-zero in the PCB, agrees with
 * the packet.  The local address is compared against the packet's
 * destination, or against the pf divert-to address when the packet
 * carries PF_TAG_DIVERTED with type PF_DIVERT_TO.
 *
 * Every match but the last receives a copy made with m_copym(); the
 * last match receives the original mbuf.  If nothing matched, an ICMP
 * packet is freed, any other packet is handed to icmp_error() as
 * protocol unreachable, and ips_noproto / ips_delivered are adjusted.
 *
 * Always returns IPPROTO_DONE; the mbuf is never left with the caller.
 */
int
rip_input(struct mbuf **mp, int *offp, int proto, int af)
{
	struct mbuf *m = *mp;
	struct ip *ip = mtod(m, struct ip *);
	struct inpcb_iterator iter = { .inp_table = NULL };
	struct inpcb *inp, *last;
	struct in_addr *key;
	struct sockaddr_in ripsrc;

	KASSERT(af == AF_INET);

	/* Source address reported to the receiving sockets. */
	memset(&ripsrc, 0, sizeof(ripsrc));
	ripsrc.sin_family = AF_INET;
	ripsrc.sin_len = sizeof(ripsrc);
	ripsrc.sin_addr = ip->ip_src;

	/* Address that a PCB's bound local address has to equal. */
	key = &ip->ip_dst;
#if NPF > 0
	if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) {
		struct pf_divert *divert;

		divert = pf_find_divert(m);
		KASSERT(divert != NULL);
		switch (divert->type) {
		case PF_DIVERT_TO:
			/* Match on the divert address, not the packet's. */
			key = &divert->addr.v4;
			break;
		case PF_DIVERT_REPLY:
			break;
		default:
			panic("%s: unknown divert type %d, mbuf %p, divert %p",
			    __func__, divert->type, m, divert);
		}
	}
#endif
	mtx_enter(&rawcbtable.inpt_mtx);
	last = inp = NULL;
	while ((inp = in_pcb_iterator(&rawcbtable, inp, &iter)) != NULL) {
		KASSERT(!ISSET(inp->inp_flags, INP_IPV6));

		/*
		 * Packet must not be inserted after disconnected wakeup
		 * call. To avoid race, check again when holding receive
		 * buffer mutex.
		 */
		if (ISSET(READ_ONCE(inp->inp_socket->so_rcv.sb_state),
		    SS_CANTRCVMORE))
			continue;
		if (rtable_l2(inp->inp_rtableid) !=
		    rtable_l2(m->m_pkthdr.ph_rtableid))
			continue;

		/* A zero field in the PCB acts as a wildcard. */
		if (inp->inp_ip.ip_p && inp->inp_ip.ip_p != ip->ip_p)
			continue;
		if (inp->inp_laddr.s_addr &&
		    inp->inp_laddr.s_addr != key->s_addr)
			continue;
		if (inp->inp_faddr.s_addr &&
		    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
			continue;

		/*
		 * Delivery lags one match behind: the previous match gets
		 * a copy now, so the final match can take the original.
		 * The table mutex is dropped around the copy and append;
		 * "last" is kept alive by the reference taken below.
		 */
		if (last != NULL) {
			struct mbuf *n;

			mtx_leave(&rawcbtable.inpt_mtx);

			n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
			if (n != NULL)
				rip_sbappend(last, n, ip, &ripsrc);
			in_pcbunref(last);

			mtx_enter(&rawcbtable.inpt_mtx);
		}
		last = in_pcbref(inp);
	}
	mtx_leave(&rawcbtable.inpt_mtx);

	if (last == NULL) {
		struct counters_ref ref;
		uint64_t *counters;

		/* No listener: do not answer ICMP with ICMP. */
		if (ip->ip_p == IPPROTO_ICMP) {
			m_freem(m);
		} else {
			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL,
			    0, 0);
		}
		/* Count as "no protocol" and take back one "delivered". */
		counters = counters_enter(&ref, ipcounters);
		counters[ips_noproto]++;
		counters[ips_delivered]--;
		counters_leave(&ref, ipcounters);

		return IPPROTO_DONE;
	}

	/* The final match consumes the original mbuf. */
	rip_sbappend(last, m, ip, &ripsrc);
	in_pcbunref(last);

	return IPPROTO_DONE;
}

/*
 * Append packet m, together with source address ripsrc, to the receive
 * buffer of inp's socket.
 *
 * Control data is collected with ip_savecontrol() when the PCB has
 * INP_CONTROLOPTS set or the socket has SO_TIMESTAMP.  The append is
 * skipped when the receive buffer is marked SS_CANTRCVMORE.  If the
 * packet was not queued, m and the control mbuf are freed and
 * ips_noproto is incremented; otherwise the reader is woken up.
 */
void
rip_sbappend(struct inpcb *inp, struct mbuf *m, struct ip *ip,
    struct sockaddr_in *ripsrc)
{
	struct socket *so = inp->inp_socket;
	struct mbuf *opts = NULL;
	int ret = 0;

	if (inp->inp_flags & INP_CONTROLOPTS || so->so_options & SO_TIMESTAMP)
		ip_savecontrol(inp, &opts, ip, m);

	/* Re-check the receive state under the buffer mutex. */
	mtx_enter(&so->so_rcv.sb_mtx);
	if (!ISSET(inp->inp_socket->so_rcv.sb_state, SS_CANTRCVMORE))
		ret = sbappendaddr(so, &so->so_rcv, sintosa(ripsrc), m, opts);
	mtx_leave(&so->so_rcv.sb_mtx);

	if (ret == 0) {
		/* Not queued (or never attempted): drop it here. */
		m_freem(m);
		m_freem(opts);
		ipstat_inc(ips_noproto);
	} else
		sorwakeup(so);
}

/*
 * Generate IP header and pass
packet to ip_output. 259 * Tack on options user may have setup with control call. 260 */ 261 int 262 rip_output(struct mbuf *m, struct socket *so, struct sockaddr *dstaddr, 263 struct mbuf *control) 264 { 265 struct sockaddr_in *dst = satosin(dstaddr); 266 struct ip *ip; 267 struct inpcb *inp; 268 int flags, error; 269 270 inp = sotoinpcb(so); 271 flags = IP_ALLOWBROADCAST; 272 273 /* 274 * If the user handed us a complete IP packet, use it. 275 * Otherwise, allocate an mbuf for a header and fill it in. 276 */ 277 if ((inp->inp_flags & INP_HDRINCL) == 0) { 278 if ((m->m_pkthdr.len + sizeof(struct ip)) > IP_MAXPACKET) { 279 m_freem(m); 280 return (EMSGSIZE); 281 } 282 M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); 283 if (!m) 284 return (ENOBUFS); 285 ip = mtod(m, struct ip *); 286 ip->ip_tos = inp->inp_ip.ip_tos; 287 ip->ip_off = htons(0); 288 ip->ip_p = inp->inp_ip.ip_p; 289 ip->ip_len = htons(m->m_pkthdr.len); 290 ip->ip_src.s_addr = INADDR_ANY; 291 ip->ip_dst = dst->sin_addr; 292 ip->ip_ttl = inp->inp_ip.ip_ttl ? inp->inp_ip.ip_ttl : MAXTTL; 293 } else { 294 if (m->m_pkthdr.len > IP_MAXPACKET) { 295 m_freem(m); 296 return (EMSGSIZE); 297 } 298 299 m = rip_chkhdr(m, inp->inp_options); 300 if (m == NULL) 301 return (EINVAL); 302 303 ip = mtod(m, struct ip *); 304 if (ip->ip_id == 0) 305 ip->ip_id = htons(ip_randomid()); 306 dst->sin_addr = ip->ip_dst; 307 308 /* XXX prevent ip_output from overwriting header fields */ 309 flags |= IP_RAWOUTPUT; 310 ipstat_inc(ips_rawout); 311 } 312 313 if (ip->ip_src.s_addr == INADDR_ANY) { 314 error = in_pcbselsrc(&ip->ip_src, dst, inp); 315 if (error != 0) 316 return (error); 317 } 318 319 #ifdef INET6 320 /* 321 * A thought: Even though raw IP shouldn't be able to set IPv6 322 * multicast options, if it does, the last parameter to 323 * ip_output should be guarded against v6/v4 problems. 
324 */ 325 #endif 326 /* force routing table */ 327 m->m_pkthdr.ph_rtableid = inp->inp_rtableid; 328 329 #if NPF > 0 330 if (inp->inp_socket->so_state & SS_ISCONNECTED && 331 ip->ip_p != IPPROTO_ICMP) 332 pf_mbuf_link_inpcb(m, inp); 333 #endif 334 335 error = ip_output(m, inp->inp_options, &inp->inp_route, flags, 336 inp->inp_moptions, &inp->inp_seclevel, 0); 337 return (error); 338 } 339 340 struct mbuf * 341 rip_chkhdr(struct mbuf *m, struct mbuf *options) 342 { 343 struct ip *ip; 344 int hlen, opt, optlen, cnt; 345 u_char *cp; 346 347 if (m->m_pkthdr.len < sizeof(struct ip)) { 348 m_freem(m); 349 return NULL; 350 } 351 352 m = m_pullup(m, sizeof (struct ip)); 353 if (m == NULL) 354 return NULL; 355 356 ip = mtod(m, struct ip *); 357 hlen = ip->ip_hl << 2; 358 359 /* Don't allow packet length sizes that will crash. */ 360 if (hlen < sizeof (struct ip) || 361 ntohs(ip->ip_len) < hlen || 362 ntohs(ip->ip_len) != m->m_pkthdr.len) { 363 m_freem(m); 364 return NULL; 365 } 366 m = m_pullup(m, hlen); 367 if (m == NULL) 368 return NULL; 369 370 ip = mtod(m, struct ip *); 371 372 if (ip->ip_v != IPVERSION) { 373 m_freem(m); 374 return NULL; 375 } 376 377 /* 378 * Don't allow both user specified and setsockopt options. 379 * If options are present verify them. 380 */ 381 if (hlen != sizeof(struct ip)) { 382 if (options) { 383 m_freem(m); 384 return NULL; 385 } else { 386 cp = (u_char *)(ip + 1); 387 cnt = hlen - sizeof(struct ip); 388 for (; cnt > 0; cnt -= optlen, cp += optlen) { 389 opt = cp[IPOPT_OPTVAL]; 390 if (opt == IPOPT_EOL) 391 break; 392 if (opt == IPOPT_NOP) 393 optlen = 1; 394 else { 395 if (cnt < IPOPT_OLEN + sizeof(*cp)) { 396 m_freem(m); 397 return NULL; 398 } 399 optlen = cp[IPOPT_OLEN]; 400 if (optlen < IPOPT_OLEN + sizeof(*cp) || 401 optlen > cnt) { 402 m_freem(m); 403 return NULL; 404 } 405 } 406 } 407 } 408 } 409 410 return m; 411 } 412 413 /* 414 * Raw IP socket option processing. 
415 */ 416 int 417 rip_ctloutput(int op, struct socket *so, int level, int optname, 418 struct mbuf *m) 419 { 420 struct inpcb *inp = sotoinpcb(so); 421 int error; 422 423 if (level != IPPROTO_IP) 424 return (EINVAL); 425 426 switch (optname) { 427 428 case IP_HDRINCL: 429 error = 0; 430 if (op == PRCO_SETOPT) { 431 if (m == NULL || m->m_len < sizeof (int)) 432 error = EINVAL; 433 else if (*mtod(m, int *)) 434 inp->inp_flags |= INP_HDRINCL; 435 else 436 inp->inp_flags &= ~INP_HDRINCL; 437 } else { 438 m->m_len = sizeof(int); 439 *mtod(m, int *) = inp->inp_flags & INP_HDRINCL; 440 } 441 return (error); 442 443 case MRT_INIT: 444 case MRT_DONE: 445 case MRT_ADD_VIF: 446 case MRT_DEL_VIF: 447 case MRT_ADD_MFC: 448 case MRT_DEL_MFC: 449 case MRT_VERSION: 450 case MRT_ASSERT: 451 case MRT_API_SUPPORT: 452 case MRT_API_CONFIG: 453 #ifdef MROUTING 454 switch (op) { 455 case PRCO_SETOPT: 456 error = ip_mrouter_set(so, optname, m); 457 break; 458 case PRCO_GETOPT: 459 error = ip_mrouter_get(so, optname, m); 460 break; 461 default: 462 error = EINVAL; 463 break; 464 } 465 return (error); 466 #else 467 return (EOPNOTSUPP); 468 #endif 469 } 470 return (ip_ctloutput(op, so, level, optname, m)); 471 } 472 473 u_long rip_sendspace = RIPSNDQ; 474 u_long rip_recvspace = RIPRCVQ; 475 476 int 477 rip_attach(struct socket *so, int proto, int wait) 478 { 479 struct inpcb *inp; 480 int error; 481 482 if (so->so_pcb) 483 panic("rip_attach"); 484 if ((so->so_state & SS_PRIV) == 0) 485 return EACCES; 486 if (proto < 0 || proto >= IPPROTO_MAX) 487 return EPROTONOSUPPORT; 488 489 if ((error = soreserve(so, rip_sendspace, rip_recvspace))) 490 return error; 491 NET_ASSERT_LOCKED(); 492 if ((error = in_pcballoc(so, &rawcbtable, wait))) 493 return error; 494 inp = sotoinpcb(so); 495 inp->inp_ip.ip_p = proto; 496 return 0; 497 } 498 499 int 500 rip_detach(struct socket *so) 501 { 502 struct inpcb *inp = sotoinpcb(so); 503 504 soassertlocked(so); 505 506 if (inp == NULL) 507 return (EINVAL); 508 
509 #ifdef MROUTING 510 if (so == ip_mrouter[inp->inp_rtableid]) 511 ip_mrouter_done(so); 512 #endif 513 in_pcbdetach(inp); 514 515 return (0); 516 } 517 518 int 519 rip_bind(struct socket *so, struct mbuf *nam, struct proc *p) 520 { 521 struct inpcb *inp = sotoinpcb(so); 522 struct sockaddr_in *addr; 523 int error; 524 525 soassertlocked(so); 526 527 if ((error = in_nam2sin(nam, &addr))) 528 return (error); 529 530 if (!((so->so_options & SO_BINDANY) || 531 addr->sin_addr.s_addr == INADDR_ANY || 532 addr->sin_addr.s_addr == INADDR_BROADCAST || 533 in_broadcast(addr->sin_addr, inp->inp_rtableid) || 534 ifa_ifwithaddr(sintosa(addr), inp->inp_rtableid))) 535 return (EADDRNOTAVAIL); 536 537 mtx_enter(&rawcbtable.inpt_mtx); 538 inp->inp_laddr = addr->sin_addr; 539 mtx_leave(&rawcbtable.inpt_mtx); 540 541 return (0); 542 } 543 544 int 545 rip_connect(struct socket *so, struct mbuf *nam) 546 { 547 struct inpcb *inp = sotoinpcb(so); 548 struct sockaddr_in *addr; 549 int error; 550 551 soassertlocked(so); 552 553 if ((error = in_nam2sin(nam, &addr))) 554 return (error); 555 556 mtx_enter(&rawcbtable.inpt_mtx); 557 inp->inp_faddr = addr->sin_addr; 558 mtx_leave(&rawcbtable.inpt_mtx); 559 soisconnected(so); 560 561 return (0); 562 } 563 564 int 565 rip_disconnect(struct socket *so) 566 { 567 struct inpcb *inp = sotoinpcb(so); 568 569 soassertlocked(so); 570 571 if ((so->so_state & SS_ISCONNECTED) == 0) 572 return (ENOTCONN); 573 574 soisdisconnected(so); 575 mtx_enter(&rawcbtable.inpt_mtx); 576 inp->inp_faddr.s_addr = INADDR_ANY; 577 mtx_leave(&rawcbtable.inpt_mtx); 578 579 return (0); 580 } 581 582 int 583 rip_shutdown(struct socket *so) 584 { 585 /* 586 * Mark the connection as being incapable of further input. 
587 */ 588 589 soassertlocked(so); 590 socantsendmore(so); 591 592 return (0); 593 } 594 595 int 596 rip_send(struct socket *so, struct mbuf *m, struct mbuf *nam, 597 struct mbuf *control) 598 { 599 struct inpcb *inp = sotoinpcb(so); 600 struct sockaddr_in dst; 601 int error; 602 603 soassertlocked(so); 604 605 /* 606 * Ship a packet out. The appropriate raw output 607 * routine handles any massaging necessary. 608 */ 609 memset(&dst, 0, sizeof(dst)); 610 dst.sin_family = AF_INET; 611 dst.sin_len = sizeof(dst); 612 if (so->so_state & SS_ISCONNECTED) { 613 if (nam) { 614 error = EISCONN; 615 goto out; 616 } 617 dst.sin_addr = inp->inp_faddr; 618 } else { 619 struct sockaddr_in *addr; 620 621 if (nam == NULL) { 622 error = ENOTCONN; 623 goto out; 624 } 625 if ((error = in_nam2sin(nam, &addr))) 626 goto out; 627 dst.sin_addr = addr->sin_addr; 628 } 629 #ifdef IPSEC 630 /* XXX Find an IPsec TDB */ 631 #endif 632 error = rip_output(m, so, sintosa(&dst), NULL); 633 m = NULL; 634 635 out: 636 m_freem(control); 637 m_freem(m); 638 639 return (error); 640 } 641