xref: /openbsd-src/sys/netinet/raw_ip.c (revision dcc91c2622318df8f66a9bca2d2864253df1bfc3)
1 /*	$OpenBSD: raw_ip.c,v 1.160 2024/07/12 19:50:35 bluhm Exp $	*/
2 /*	$NetBSD: raw_ip.c,v 1.25 1996/02/18 18:58:33 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
33  *
34  * NRL grants permission for redistribution and use in source and binary
35  * forms, with or without modification, of the software and documentation
36  * created at NRL provided that the following conditions are met:
37  *
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 3. All advertising materials mentioning features or use of this software
44  *    must display the following acknowledgements:
45  *	This product includes software developed by the University of
46  *	California, Berkeley and its contributors.
47  *	This product includes software developed at the Information
48  *	Technology Division, US Naval Research Laboratory.
49  * 4. Neither the name of the NRL nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
54  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
56  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
57  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
58  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
59  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
60  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
61  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
62  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64  *
65  * The views and conclusions contained in the software and documentation
66  * are those of the authors and should not be interpreted as representing
67  * official policies, either expressed or implied, of the US Naval
68  * Research Laboratory (NRL).
69  */
70 
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/mbuf.h>
74 #include <sys/socket.h>
75 #include <sys/protosw.h>
76 #include <sys/socketvar.h>
77 
78 #include <net/if.h>
79 #include <net/if_var.h>
80 #include <net/route.h>
81 
82 #include <netinet/in.h>
83 #include <netinet/ip.h>
84 #include <netinet/ip_mroute.h>
85 #include <netinet/ip_var.h>
86 #include <netinet/in_pcb.h>
87 #include <netinet/in_var.h>
88 #include <netinet/ip_icmp.h>
89 
90 #include <net/pfvar.h>
91 
92 #include "pf.h"
93 
94 struct inpcbtable rawcbtable;
95 
/*
 * Nominal buffer space, in bytes, reserved for a raw IP socket:
 * RIPSNDQ for the send side, RIPRCVQ for the receive side.
 * They are the initial values of rip_sendspace and rip_recvspace.
 */
#define	RIPSNDQ		(8 * 1024)
#define	RIPRCVQ		(8 * 1024)
101 
102 /*
103  * Raw interface to IP protocol.
104  */
105 
106 const struct pr_usrreqs rip_usrreqs = {
107 	.pru_attach	= rip_attach,
108 	.pru_detach	= rip_detach,
109 	.pru_bind	= rip_bind,
110 	.pru_connect	= rip_connect,
111 	.pru_disconnect	= rip_disconnect,
112 	.pru_shutdown	= rip_shutdown,
113 	.pru_send	= rip_send,
114 	.pru_control	= in_control,
115 	.pru_sockaddr	= in_sockaddr,
116 	.pru_peeraddr	= in_peeraddr,
117 };
118 
119 /*
120  * Initialize raw connection block q.
121  */
122 void
123 rip_init(void)
124 {
125 	in_pcbinit(&rawcbtable, 1);
126 }
127 
128 int
129 rip_input(struct mbuf **mp, int *offp, int proto, int af)
130 {
131 	struct mbuf *m = *mp;
132 	struct ip *ip = mtod(m, struct ip *);
133 	struct inpcb *inp;
134 	SIMPLEQ_HEAD(, inpcb) inpcblist;
135 	struct in_addr *key;
136 	struct counters_ref ref;
137 	uint64_t *counters;
138 	struct sockaddr_in ripsrc;
139 
140 	KASSERT(af == AF_INET);
141 
142 	memset(&ripsrc, 0, sizeof(ripsrc));
143 	ripsrc.sin_family = AF_INET;
144 	ripsrc.sin_len = sizeof(ripsrc);
145 	ripsrc.sin_addr = ip->ip_src;
146 
147 	key = &ip->ip_dst;
148 #if NPF > 0
149 	if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) {
150 		struct pf_divert *divert;
151 
152 		divert = pf_find_divert(m);
153 		KASSERT(divert != NULL);
154 		switch (divert->type) {
155 		case PF_DIVERT_TO:
156 			key = &divert->addr.v4;
157 			break;
158 		case PF_DIVERT_REPLY:
159 			break;
160 		default:
161 			panic("%s: unknown divert type %d, mbuf %p, divert %p",
162 			    __func__, divert->type, m, divert);
163 		}
164 	}
165 #endif
166 	SIMPLEQ_INIT(&inpcblist);
167 	rw_enter_write(&rawcbtable.inpt_notify);
168 	mtx_enter(&rawcbtable.inpt_mtx);
169 	TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) {
170 		KASSERT(!ISSET(inp->inp_flags, INP_IPV6));
171 
172 		/*
173 		 * Packet must not be inserted after disconnected wakeup
174 		 * call.  To avoid race, check again when holding receive
175 		 * buffer mutex.
176 		 */
177 		if (ISSET(READ_ONCE(inp->inp_socket->so_rcv.sb_state),
178 		    SS_CANTRCVMORE))
179 			continue;
180 		if (rtable_l2(inp->inp_rtableid) !=
181 		    rtable_l2(m->m_pkthdr.ph_rtableid))
182 			continue;
183 
184 		if (inp->inp_ip.ip_p && inp->inp_ip.ip_p != ip->ip_p)
185 			continue;
186 		if (inp->inp_laddr.s_addr &&
187 		    inp->inp_laddr.s_addr != key->s_addr)
188 			continue;
189 		if (inp->inp_faddr.s_addr &&
190 		    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
191 			continue;
192 
193 		in_pcbref(inp);
194 		SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
195 	}
196 	mtx_leave(&rawcbtable.inpt_mtx);
197 
198 	if (SIMPLEQ_EMPTY(&inpcblist)) {
199 		rw_exit_write(&rawcbtable.inpt_notify);
200 
201 		if (ip->ip_p != IPPROTO_ICMP)
202 			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL,
203 			    0, 0);
204 		else
205 			m_freem(m);
206 
207 		counters = counters_enter(&ref, ipcounters);
208 		counters[ips_noproto]++;
209 		counters[ips_delivered]--;
210 		counters_leave(&ref, ipcounters);
211 
212 		return IPPROTO_DONE;
213 	}
214 
215 	while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
216 		struct mbuf *n, *opts = NULL;
217 
218 		SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
219 		if (SIMPLEQ_EMPTY(&inpcblist))
220 			n = m;
221 		else
222 			n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
223 		if (n != NULL) {
224 			struct socket *so = inp->inp_socket;
225 			int ret = 0;
226 
227 			if (inp->inp_flags & INP_CONTROLOPTS ||
228 			    so->so_options & SO_TIMESTAMP)
229 				ip_savecontrol(inp, &opts, ip, n);
230 
231 			mtx_enter(&so->so_rcv.sb_mtx);
232 			if (!ISSET(inp->inp_socket->so_rcv.sb_state,
233 			    SS_CANTRCVMORE)) {
234 				ret = sbappendaddr(so, &so->so_rcv,
235 				    sintosa(&ripsrc), n, opts);
236 			}
237 			mtx_leave(&so->so_rcv.sb_mtx);
238 
239 			if (ret == 0) {
240 				m_freem(n);
241 				m_freem(opts);
242 				ipstat_inc(ips_noproto);
243 			} else
244 				sorwakeup(so);
245 		}
246 		in_pcbunref(inp);
247 	}
248 	rw_exit_write(&rawcbtable.inpt_notify);
249 
250 	return IPPROTO_DONE;
251 }
252 
253 /*
254  * Generate IP header and pass packet to ip_output.
255  * Tack on options user may have setup with control call.
256  */
257 int
258 rip_output(struct mbuf *m, struct socket *so, struct sockaddr *dstaddr,
259     struct mbuf *control)
260 {
261 	struct sockaddr_in *dst = satosin(dstaddr);
262 	struct ip *ip;
263 	struct inpcb *inp;
264 	int flags, error;
265 
266 	inp = sotoinpcb(so);
267 	flags = IP_ALLOWBROADCAST;
268 
269 	/*
270 	 * If the user handed us a complete IP packet, use it.
271 	 * Otherwise, allocate an mbuf for a header and fill it in.
272 	 */
273 	if ((inp->inp_flags & INP_HDRINCL) == 0) {
274 		if ((m->m_pkthdr.len + sizeof(struct ip)) > IP_MAXPACKET) {
275 			m_freem(m);
276 			return (EMSGSIZE);
277 		}
278 		M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
279 		if (!m)
280 			return (ENOBUFS);
281 		ip = mtod(m, struct ip *);
282 		ip->ip_tos = inp->inp_ip.ip_tos;
283 		ip->ip_off = htons(0);
284 		ip->ip_p = inp->inp_ip.ip_p;
285 		ip->ip_len = htons(m->m_pkthdr.len);
286 		ip->ip_src.s_addr = INADDR_ANY;
287 		ip->ip_dst = dst->sin_addr;
288 		ip->ip_ttl = inp->inp_ip.ip_ttl ? inp->inp_ip.ip_ttl : MAXTTL;
289 	} else {
290 		if (m->m_pkthdr.len > IP_MAXPACKET) {
291 			m_freem(m);
292 			return (EMSGSIZE);
293 		}
294 
295 		m = rip_chkhdr(m, inp->inp_options);
296 		if (m == NULL)
297 			return (EINVAL);
298 
299 		ip = mtod(m, struct ip *);
300 		if (ip->ip_id == 0)
301 			ip->ip_id = htons(ip_randomid());
302 		dst->sin_addr = ip->ip_dst;
303 
304 		/* XXX prevent ip_output from overwriting header fields */
305 		flags |= IP_RAWOUTPUT;
306 		ipstat_inc(ips_rawout);
307 	}
308 
309 	if (ip->ip_src.s_addr == INADDR_ANY) {
310 		error = in_pcbselsrc(&ip->ip_src, dst, inp);
311 		if (error != 0)
312 			return (error);
313 	}
314 
315 #ifdef INET6
316 	/*
317 	 * A thought:  Even though raw IP shouldn't be able to set IPv6
318 	 *             multicast options, if it does, the last parameter to
319 	 *             ip_output should be guarded against v6/v4 problems.
320 	 */
321 #endif
322 	/* force routing table */
323 	m->m_pkthdr.ph_rtableid = inp->inp_rtableid;
324 
325 #if NPF > 0
326 	if (inp->inp_socket->so_state & SS_ISCONNECTED &&
327 	    ip->ip_p != IPPROTO_ICMP)
328 		pf_mbuf_link_inpcb(m, inp);
329 #endif
330 
331 	error = ip_output(m, inp->inp_options, &inp->inp_route, flags,
332 	    inp->inp_moptions, &inp->inp_seclevel, 0);
333 	return (error);
334 }
335 
336 struct mbuf *
337 rip_chkhdr(struct mbuf *m, struct mbuf *options)
338 {
339 	struct ip *ip;
340 	int hlen, opt, optlen, cnt;
341 	u_char *cp;
342 
343 	if (m->m_pkthdr.len < sizeof(struct ip)) {
344 		m_freem(m);
345 		return NULL;
346 	}
347 
348 	m = m_pullup(m, sizeof (struct ip));
349 	if (m == NULL)
350 		return NULL;
351 
352 	ip = mtod(m, struct ip *);
353 	hlen = ip->ip_hl << 2;
354 
355 	/* Don't allow packet length sizes that will crash. */
356 	if (hlen < sizeof (struct ip) ||
357 	    ntohs(ip->ip_len) < hlen ||
358 	    ntohs(ip->ip_len) != m->m_pkthdr.len) {
359 		m_freem(m);
360 		return NULL;
361 	}
362 	m = m_pullup(m, hlen);
363 	if (m == NULL)
364 		return NULL;
365 
366 	ip = mtod(m, struct ip *);
367 
368 	if (ip->ip_v != IPVERSION) {
369 		m_freem(m);
370 		return NULL;
371 	}
372 
373 	/*
374 	 * Don't allow both user specified and setsockopt options.
375 	 * If options are present verify them.
376 	 */
377 	if (hlen != sizeof(struct ip)) {
378 		if (options) {
379 			m_freem(m);
380 			return NULL;
381 		} else {
382 			cp = (u_char *)(ip + 1);
383 			cnt = hlen - sizeof(struct ip);
384 			for (; cnt > 0; cnt -= optlen, cp += optlen) {
385 				opt = cp[IPOPT_OPTVAL];
386 				if (opt == IPOPT_EOL)
387 					break;
388 				if (opt == IPOPT_NOP)
389 					optlen = 1;
390 				else {
391 					if (cnt < IPOPT_OLEN + sizeof(*cp)) {
392 						m_freem(m);
393 						return NULL;
394 					}
395 					optlen = cp[IPOPT_OLEN];
396 					if (optlen < IPOPT_OLEN + sizeof(*cp) ||
397 					    optlen > cnt) {
398 						m_freem(m);
399 						return NULL;
400 					}
401 				}
402 			}
403 		}
404 	}
405 
406 	return m;
407 }
408 
409 /*
410  * Raw IP socket option processing.
411  */
412 int
413 rip_ctloutput(int op, struct socket *so, int level, int optname,
414     struct mbuf *m)
415 {
416 	struct inpcb *inp = sotoinpcb(so);
417 	int error;
418 
419 	if (level != IPPROTO_IP)
420 		return (EINVAL);
421 
422 	switch (optname) {
423 
424 	case IP_HDRINCL:
425 		error = 0;
426 		if (op == PRCO_SETOPT) {
427 			if (m == NULL || m->m_len < sizeof (int))
428 				error = EINVAL;
429 			else if (*mtod(m, int *))
430 				inp->inp_flags |= INP_HDRINCL;
431 			else
432 				inp->inp_flags &= ~INP_HDRINCL;
433 		} else {
434 			m->m_len = sizeof(int);
435 			*mtod(m, int *) = inp->inp_flags & INP_HDRINCL;
436 		}
437 		return (error);
438 
439 	case MRT_INIT:
440 	case MRT_DONE:
441 	case MRT_ADD_VIF:
442 	case MRT_DEL_VIF:
443 	case MRT_ADD_MFC:
444 	case MRT_DEL_MFC:
445 	case MRT_VERSION:
446 	case MRT_ASSERT:
447 	case MRT_API_SUPPORT:
448 	case MRT_API_CONFIG:
449 #ifdef MROUTING
450 		switch (op) {
451 		case PRCO_SETOPT:
452 			error = ip_mrouter_set(so, optname, m);
453 			break;
454 		case PRCO_GETOPT:
455 			error = ip_mrouter_get(so, optname, m);
456 			break;
457 		default:
458 			error = EINVAL;
459 			break;
460 		}
461 		return (error);
462 #else
463 		return (EOPNOTSUPP);
464 #endif
465 	}
466 	return (ip_ctloutput(op, so, level, optname, m));
467 }
468 
469 u_long	rip_sendspace = RIPSNDQ;
470 u_long	rip_recvspace = RIPRCVQ;
471 
472 int
473 rip_attach(struct socket *so, int proto, int wait)
474 {
475 	struct inpcb *inp;
476 	int error;
477 
478 	if (so->so_pcb)
479 		panic("rip_attach");
480 	if ((so->so_state & SS_PRIV) == 0)
481 		return EACCES;
482 	if (proto < 0 || proto >= IPPROTO_MAX)
483 		return EPROTONOSUPPORT;
484 
485 	if ((error = soreserve(so, rip_sendspace, rip_recvspace)))
486 		return error;
487 	NET_ASSERT_LOCKED();
488 	if ((error = in_pcballoc(so, &rawcbtable, wait)))
489 		return error;
490 	inp = sotoinpcb(so);
491 	inp->inp_ip.ip_p = proto;
492 	return 0;
493 }
494 
495 int
496 rip_detach(struct socket *so)
497 {
498 	struct inpcb *inp = sotoinpcb(so);
499 
500 	soassertlocked(so);
501 
502 	if (inp == NULL)
503 		return (EINVAL);
504 
505 #ifdef MROUTING
506 	if (so == ip_mrouter[inp->inp_rtableid])
507 		ip_mrouter_done(so);
508 #endif
509 	in_pcbdetach(inp);
510 
511 	return (0);
512 }
513 
514 int
515 rip_bind(struct socket *so, struct mbuf *nam, struct proc *p)
516 {
517 	struct inpcb *inp = sotoinpcb(so);
518 	struct sockaddr_in *addr;
519 	int error;
520 
521 	soassertlocked(so);
522 
523 	if ((error = in_nam2sin(nam, &addr)))
524 		return (error);
525 
526 	if (!((so->so_options & SO_BINDANY) ||
527 	    addr->sin_addr.s_addr == INADDR_ANY ||
528 	    addr->sin_addr.s_addr == INADDR_BROADCAST ||
529 	    in_broadcast(addr->sin_addr, inp->inp_rtableid) ||
530 	    ifa_ifwithaddr(sintosa(addr), inp->inp_rtableid)))
531 		return (EADDRNOTAVAIL);
532 
533 	mtx_enter(&rawcbtable.inpt_mtx);
534 	inp->inp_laddr = addr->sin_addr;
535 	mtx_leave(&rawcbtable.inpt_mtx);
536 
537 	return (0);
538 }
539 
540 int
541 rip_connect(struct socket *so, struct mbuf *nam)
542 {
543 	struct inpcb *inp = sotoinpcb(so);
544 	struct sockaddr_in *addr;
545 	int error;
546 
547 	soassertlocked(so);
548 
549 	if ((error = in_nam2sin(nam, &addr)))
550 		return (error);
551 
552 	mtx_enter(&rawcbtable.inpt_mtx);
553 	inp->inp_faddr = addr->sin_addr;
554 	mtx_leave(&rawcbtable.inpt_mtx);
555 	soisconnected(so);
556 
557 	return (0);
558 }
559 
560 int
561 rip_disconnect(struct socket *so)
562 {
563 	struct inpcb *inp = sotoinpcb(so);
564 
565 	soassertlocked(so);
566 
567 	if ((so->so_state & SS_ISCONNECTED) == 0)
568 		return (ENOTCONN);
569 
570 	soisdisconnected(so);
571 	mtx_enter(&rawcbtable.inpt_mtx);
572 	inp->inp_faddr.s_addr = INADDR_ANY;
573 	mtx_leave(&rawcbtable.inpt_mtx);
574 
575 	return (0);
576 }
577 
/*
 * Shutdown request for a raw IP socket.  The socket must be locked.
 * Always returns 0.
 */
int
rip_shutdown(struct socket *so)
{
	soassertlocked(so);

	/*
	 * Mark the socket as being incapable of sending further data.
	 */
	socantsendmore(so);

	return (0);
}
590 
591 int
592 rip_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
593     struct mbuf *control)
594 {
595 	struct inpcb *inp = sotoinpcb(so);
596 	struct sockaddr_in dst;
597 	int error;
598 
599 	soassertlocked(so);
600 
601 	/*
602 	 * Ship a packet out.  The appropriate raw output
603 	 * routine handles any massaging necessary.
604 	 */
605 	memset(&dst, 0, sizeof(dst));
606 	dst.sin_family = AF_INET;
607 	dst.sin_len = sizeof(dst);
608 	if (so->so_state & SS_ISCONNECTED) {
609 		if (nam) {
610 			error = EISCONN;
611 			goto out;
612 		}
613 		dst.sin_addr = inp->inp_faddr;
614 	} else {
615 		struct sockaddr_in *addr;
616 
617 		if (nam == NULL) {
618 			error = ENOTCONN;
619 			goto out;
620 		}
621 		if ((error = in_nam2sin(nam, &addr)))
622 			goto out;
623 		dst.sin_addr = addr->sin_addr;
624 	}
625 #ifdef IPSEC
626 	/* XXX Find an IPsec TDB */
627 #endif
628 	error = rip_output(m, so, sintosa(&dst), NULL);
629 	m = NULL;
630 
631 out:
632 	m_freem(control);
633 	m_freem(m);
634 
635 	return (error);
636 }
637