1 /* $OpenBSD: raw_ip.c,v 1.160 2024/07/12 19:50:35 bluhm Exp $ */ 2 /* $NetBSD: raw_ip.c,v 1.25 1996/02/18 18:58:33 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/socketvar.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_mroute.h>
#include <netinet/ip_var.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_icmp.h>

#include <net/pfvar.h>

#include "pf.h"

/*
 * PCB table for raw IP sockets: rip_attach() allocates PCBs from it and
 * rip_input() walks its queue to find receivers.
 */
struct inpcbtable rawcbtable;

/*
 * Nominal space allocated to a raw ip socket.
 * These are the initial values of rip_sendspace and rip_recvspace.
 */
#define	RIPSNDQ		8192
#define	RIPRCVQ		8192

/*
 * Raw interface to IP protocol.
 */

/*
 * User request dispatch table for raw IP sockets.  Attach, detach, bind,
 * connect, disconnect, shutdown and send use the rip_*() handlers in this
 * file; control, sockaddr and peeraddr use the in_*() handlers.
 */
const struct pr_usrreqs rip_usrreqs = {
	.pru_attach	= rip_attach,
	.pru_detach	= rip_detach,
	.pru_bind	= rip_bind,
	.pru_connect	= rip_connect,
	.pru_disconnect	= rip_disconnect,
	.pru_shutdown	= rip_shutdown,
	.pru_send	= rip_send,
	.pru_control	= in_control,
	.pru_sockaddr	= in_sockaddr,
	.pru_peeraddr	= in_peeraddr,
};

/*
 * Initialize raw connection block q.
121 */ 122 void 123 rip_init(void) 124 { 125 in_pcbinit(&rawcbtable, 1); 126 } 127 128 int 129 rip_input(struct mbuf **mp, int *offp, int proto, int af) 130 { 131 struct mbuf *m = *mp; 132 struct ip *ip = mtod(m, struct ip *); 133 struct inpcb *inp; 134 SIMPLEQ_HEAD(, inpcb) inpcblist; 135 struct in_addr *key; 136 struct counters_ref ref; 137 uint64_t *counters; 138 struct sockaddr_in ripsrc; 139 140 KASSERT(af == AF_INET); 141 142 memset(&ripsrc, 0, sizeof(ripsrc)); 143 ripsrc.sin_family = AF_INET; 144 ripsrc.sin_len = sizeof(ripsrc); 145 ripsrc.sin_addr = ip->ip_src; 146 147 key = &ip->ip_dst; 148 #if NPF > 0 149 if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { 150 struct pf_divert *divert; 151 152 divert = pf_find_divert(m); 153 KASSERT(divert != NULL); 154 switch (divert->type) { 155 case PF_DIVERT_TO: 156 key = &divert->addr.v4; 157 break; 158 case PF_DIVERT_REPLY: 159 break; 160 default: 161 panic("%s: unknown divert type %d, mbuf %p, divert %p", 162 __func__, divert->type, m, divert); 163 } 164 } 165 #endif 166 SIMPLEQ_INIT(&inpcblist); 167 rw_enter_write(&rawcbtable.inpt_notify); 168 mtx_enter(&rawcbtable.inpt_mtx); 169 TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) { 170 KASSERT(!ISSET(inp->inp_flags, INP_IPV6)); 171 172 /* 173 * Packet must not be inserted after disconnected wakeup 174 * call. To avoid race, check again when holding receive 175 * buffer mutex. 
176 */ 177 if (ISSET(READ_ONCE(inp->inp_socket->so_rcv.sb_state), 178 SS_CANTRCVMORE)) 179 continue; 180 if (rtable_l2(inp->inp_rtableid) != 181 rtable_l2(m->m_pkthdr.ph_rtableid)) 182 continue; 183 184 if (inp->inp_ip.ip_p && inp->inp_ip.ip_p != ip->ip_p) 185 continue; 186 if (inp->inp_laddr.s_addr && 187 inp->inp_laddr.s_addr != key->s_addr) 188 continue; 189 if (inp->inp_faddr.s_addr && 190 inp->inp_faddr.s_addr != ip->ip_src.s_addr) 191 continue; 192 193 in_pcbref(inp); 194 SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify); 195 } 196 mtx_leave(&rawcbtable.inpt_mtx); 197 198 if (SIMPLEQ_EMPTY(&inpcblist)) { 199 rw_exit_write(&rawcbtable.inpt_notify); 200 201 if (ip->ip_p != IPPROTO_ICMP) 202 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 203 0, 0); 204 else 205 m_freem(m); 206 207 counters = counters_enter(&ref, ipcounters); 208 counters[ips_noproto]++; 209 counters[ips_delivered]--; 210 counters_leave(&ref, ipcounters); 211 212 return IPPROTO_DONE; 213 } 214 215 while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) { 216 struct mbuf *n, *opts = NULL; 217 218 SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify); 219 if (SIMPLEQ_EMPTY(&inpcblist)) 220 n = m; 221 else 222 n = m_copym(m, 0, M_COPYALL, M_NOWAIT); 223 if (n != NULL) { 224 struct socket *so = inp->inp_socket; 225 int ret = 0; 226 227 if (inp->inp_flags & INP_CONTROLOPTS || 228 so->so_options & SO_TIMESTAMP) 229 ip_savecontrol(inp, &opts, ip, n); 230 231 mtx_enter(&so->so_rcv.sb_mtx); 232 if (!ISSET(inp->inp_socket->so_rcv.sb_state, 233 SS_CANTRCVMORE)) { 234 ret = sbappendaddr(so, &so->so_rcv, 235 sintosa(&ripsrc), n, opts); 236 } 237 mtx_leave(&so->so_rcv.sb_mtx); 238 239 if (ret == 0) { 240 m_freem(n); 241 m_freem(opts); 242 ipstat_inc(ips_noproto); 243 } else 244 sorwakeup(so); 245 } 246 in_pcbunref(inp); 247 } 248 rw_exit_write(&rawcbtable.inpt_notify); 249 250 return IPPROTO_DONE; 251 } 252 253 /* 254 * Generate IP header and pass packet to ip_output. 
255 * Tack on options user may have setup with control call. 256 */ 257 int 258 rip_output(struct mbuf *m, struct socket *so, struct sockaddr *dstaddr, 259 struct mbuf *control) 260 { 261 struct sockaddr_in *dst = satosin(dstaddr); 262 struct ip *ip; 263 struct inpcb *inp; 264 int flags, error; 265 266 inp = sotoinpcb(so); 267 flags = IP_ALLOWBROADCAST; 268 269 /* 270 * If the user handed us a complete IP packet, use it. 271 * Otherwise, allocate an mbuf for a header and fill it in. 272 */ 273 if ((inp->inp_flags & INP_HDRINCL) == 0) { 274 if ((m->m_pkthdr.len + sizeof(struct ip)) > IP_MAXPACKET) { 275 m_freem(m); 276 return (EMSGSIZE); 277 } 278 M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); 279 if (!m) 280 return (ENOBUFS); 281 ip = mtod(m, struct ip *); 282 ip->ip_tos = inp->inp_ip.ip_tos; 283 ip->ip_off = htons(0); 284 ip->ip_p = inp->inp_ip.ip_p; 285 ip->ip_len = htons(m->m_pkthdr.len); 286 ip->ip_src.s_addr = INADDR_ANY; 287 ip->ip_dst = dst->sin_addr; 288 ip->ip_ttl = inp->inp_ip.ip_ttl ? inp->inp_ip.ip_ttl : MAXTTL; 289 } else { 290 if (m->m_pkthdr.len > IP_MAXPACKET) { 291 m_freem(m); 292 return (EMSGSIZE); 293 } 294 295 m = rip_chkhdr(m, inp->inp_options); 296 if (m == NULL) 297 return (EINVAL); 298 299 ip = mtod(m, struct ip *); 300 if (ip->ip_id == 0) 301 ip->ip_id = htons(ip_randomid()); 302 dst->sin_addr = ip->ip_dst; 303 304 /* XXX prevent ip_output from overwriting header fields */ 305 flags |= IP_RAWOUTPUT; 306 ipstat_inc(ips_rawout); 307 } 308 309 if (ip->ip_src.s_addr == INADDR_ANY) { 310 error = in_pcbselsrc(&ip->ip_src, dst, inp); 311 if (error != 0) 312 return (error); 313 } 314 315 #ifdef INET6 316 /* 317 * A thought: Even though raw IP shouldn't be able to set IPv6 318 * multicast options, if it does, the last parameter to 319 * ip_output should be guarded against v6/v4 problems. 
320 */ 321 #endif 322 /* force routing table */ 323 m->m_pkthdr.ph_rtableid = inp->inp_rtableid; 324 325 #if NPF > 0 326 if (inp->inp_socket->so_state & SS_ISCONNECTED && 327 ip->ip_p != IPPROTO_ICMP) 328 pf_mbuf_link_inpcb(m, inp); 329 #endif 330 331 error = ip_output(m, inp->inp_options, &inp->inp_route, flags, 332 inp->inp_moptions, &inp->inp_seclevel, 0); 333 return (error); 334 } 335 336 struct mbuf * 337 rip_chkhdr(struct mbuf *m, struct mbuf *options) 338 { 339 struct ip *ip; 340 int hlen, opt, optlen, cnt; 341 u_char *cp; 342 343 if (m->m_pkthdr.len < sizeof(struct ip)) { 344 m_freem(m); 345 return NULL; 346 } 347 348 m = m_pullup(m, sizeof (struct ip)); 349 if (m == NULL) 350 return NULL; 351 352 ip = mtod(m, struct ip *); 353 hlen = ip->ip_hl << 2; 354 355 /* Don't allow packet length sizes that will crash. */ 356 if (hlen < sizeof (struct ip) || 357 ntohs(ip->ip_len) < hlen || 358 ntohs(ip->ip_len) != m->m_pkthdr.len) { 359 m_freem(m); 360 return NULL; 361 } 362 m = m_pullup(m, hlen); 363 if (m == NULL) 364 return NULL; 365 366 ip = mtod(m, struct ip *); 367 368 if (ip->ip_v != IPVERSION) { 369 m_freem(m); 370 return NULL; 371 } 372 373 /* 374 * Don't allow both user specified and setsockopt options. 375 * If options are present verify them. 376 */ 377 if (hlen != sizeof(struct ip)) { 378 if (options) { 379 m_freem(m); 380 return NULL; 381 } else { 382 cp = (u_char *)(ip + 1); 383 cnt = hlen - sizeof(struct ip); 384 for (; cnt > 0; cnt -= optlen, cp += optlen) { 385 opt = cp[IPOPT_OPTVAL]; 386 if (opt == IPOPT_EOL) 387 break; 388 if (opt == IPOPT_NOP) 389 optlen = 1; 390 else { 391 if (cnt < IPOPT_OLEN + sizeof(*cp)) { 392 m_freem(m); 393 return NULL; 394 } 395 optlen = cp[IPOPT_OLEN]; 396 if (optlen < IPOPT_OLEN + sizeof(*cp) || 397 optlen > cnt) { 398 m_freem(m); 399 return NULL; 400 } 401 } 402 } 403 } 404 } 405 406 return m; 407 } 408 409 /* 410 * Raw IP socket option processing. 
411 */ 412 int 413 rip_ctloutput(int op, struct socket *so, int level, int optname, 414 struct mbuf *m) 415 { 416 struct inpcb *inp = sotoinpcb(so); 417 int error; 418 419 if (level != IPPROTO_IP) 420 return (EINVAL); 421 422 switch (optname) { 423 424 case IP_HDRINCL: 425 error = 0; 426 if (op == PRCO_SETOPT) { 427 if (m == NULL || m->m_len < sizeof (int)) 428 error = EINVAL; 429 else if (*mtod(m, int *)) 430 inp->inp_flags |= INP_HDRINCL; 431 else 432 inp->inp_flags &= ~INP_HDRINCL; 433 } else { 434 m->m_len = sizeof(int); 435 *mtod(m, int *) = inp->inp_flags & INP_HDRINCL; 436 } 437 return (error); 438 439 case MRT_INIT: 440 case MRT_DONE: 441 case MRT_ADD_VIF: 442 case MRT_DEL_VIF: 443 case MRT_ADD_MFC: 444 case MRT_DEL_MFC: 445 case MRT_VERSION: 446 case MRT_ASSERT: 447 case MRT_API_SUPPORT: 448 case MRT_API_CONFIG: 449 #ifdef MROUTING 450 switch (op) { 451 case PRCO_SETOPT: 452 error = ip_mrouter_set(so, optname, m); 453 break; 454 case PRCO_GETOPT: 455 error = ip_mrouter_get(so, optname, m); 456 break; 457 default: 458 error = EINVAL; 459 break; 460 } 461 return (error); 462 #else 463 return (EOPNOTSUPP); 464 #endif 465 } 466 return (ip_ctloutput(op, so, level, optname, m)); 467 } 468 469 u_long rip_sendspace = RIPSNDQ; 470 u_long rip_recvspace = RIPRCVQ; 471 472 int 473 rip_attach(struct socket *so, int proto, int wait) 474 { 475 struct inpcb *inp; 476 int error; 477 478 if (so->so_pcb) 479 panic("rip_attach"); 480 if ((so->so_state & SS_PRIV) == 0) 481 return EACCES; 482 if (proto < 0 || proto >= IPPROTO_MAX) 483 return EPROTONOSUPPORT; 484 485 if ((error = soreserve(so, rip_sendspace, rip_recvspace))) 486 return error; 487 NET_ASSERT_LOCKED(); 488 if ((error = in_pcballoc(so, &rawcbtable, wait))) 489 return error; 490 inp = sotoinpcb(so); 491 inp->inp_ip.ip_p = proto; 492 return 0; 493 } 494 495 int 496 rip_detach(struct socket *so) 497 { 498 struct inpcb *inp = sotoinpcb(so); 499 500 soassertlocked(so); 501 502 if (inp == NULL) 503 return (EINVAL); 504 
505 #ifdef MROUTING 506 if (so == ip_mrouter[inp->inp_rtableid]) 507 ip_mrouter_done(so); 508 #endif 509 in_pcbdetach(inp); 510 511 return (0); 512 } 513 514 int 515 rip_bind(struct socket *so, struct mbuf *nam, struct proc *p) 516 { 517 struct inpcb *inp = sotoinpcb(so); 518 struct sockaddr_in *addr; 519 int error; 520 521 soassertlocked(so); 522 523 if ((error = in_nam2sin(nam, &addr))) 524 return (error); 525 526 if (!((so->so_options & SO_BINDANY) || 527 addr->sin_addr.s_addr == INADDR_ANY || 528 addr->sin_addr.s_addr == INADDR_BROADCAST || 529 in_broadcast(addr->sin_addr, inp->inp_rtableid) || 530 ifa_ifwithaddr(sintosa(addr), inp->inp_rtableid))) 531 return (EADDRNOTAVAIL); 532 533 mtx_enter(&rawcbtable.inpt_mtx); 534 inp->inp_laddr = addr->sin_addr; 535 mtx_leave(&rawcbtable.inpt_mtx); 536 537 return (0); 538 } 539 540 int 541 rip_connect(struct socket *so, struct mbuf *nam) 542 { 543 struct inpcb *inp = sotoinpcb(so); 544 struct sockaddr_in *addr; 545 int error; 546 547 soassertlocked(so); 548 549 if ((error = in_nam2sin(nam, &addr))) 550 return (error); 551 552 mtx_enter(&rawcbtable.inpt_mtx); 553 inp->inp_faddr = addr->sin_addr; 554 mtx_leave(&rawcbtable.inpt_mtx); 555 soisconnected(so); 556 557 return (0); 558 } 559 560 int 561 rip_disconnect(struct socket *so) 562 { 563 struct inpcb *inp = sotoinpcb(so); 564 565 soassertlocked(so); 566 567 if ((so->so_state & SS_ISCONNECTED) == 0) 568 return (ENOTCONN); 569 570 soisdisconnected(so); 571 mtx_enter(&rawcbtable.inpt_mtx); 572 inp->inp_faddr.s_addr = INADDR_ANY; 573 mtx_leave(&rawcbtable.inpt_mtx); 574 575 return (0); 576 } 577 578 int 579 rip_shutdown(struct socket *so) 580 { 581 /* 582 * Mark the connection as being incapable of further input. 
583 */ 584 585 soassertlocked(so); 586 socantsendmore(so); 587 588 return (0); 589 } 590 591 int 592 rip_send(struct socket *so, struct mbuf *m, struct mbuf *nam, 593 struct mbuf *control) 594 { 595 struct inpcb *inp = sotoinpcb(so); 596 struct sockaddr_in dst; 597 int error; 598 599 soassertlocked(so); 600 601 /* 602 * Ship a packet out. The appropriate raw output 603 * routine handles any massaging necessary. 604 */ 605 memset(&dst, 0, sizeof(dst)); 606 dst.sin_family = AF_INET; 607 dst.sin_len = sizeof(dst); 608 if (so->so_state & SS_ISCONNECTED) { 609 if (nam) { 610 error = EISCONN; 611 goto out; 612 } 613 dst.sin_addr = inp->inp_faddr; 614 } else { 615 struct sockaddr_in *addr; 616 617 if (nam == NULL) { 618 error = ENOTCONN; 619 goto out; 620 } 621 if ((error = in_nam2sin(nam, &addr))) 622 goto out; 623 dst.sin_addr = addr->sin_addr; 624 } 625 #ifdef IPSEC 626 /* XXX Find an IPsec TDB */ 627 #endif 628 error = rip_output(m, so, sintosa(&dst), NULL); 629 m = NULL; 630 631 out: 632 m_freem(control); 633 m_freem(m); 634 635 return (error); 636 } 637