xref: /openbsd-src/sys/netinet/raw_ip.c (revision 68dd5bb1859285b71cb62a10bf107b8ad54064d9)
1 /*	$OpenBSD: raw_ip.c,v 1.154 2024/01/21 01:17:20 bluhm Exp $	*/
2 /*	$NetBSD: raw_ip.c,v 1.25 1996/02/18 18:58:33 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
33  *
34  * NRL grants permission for redistribution and use in source and binary
35  * forms, with or without modification, of the software and documentation
36  * created at NRL provided that the following conditions are met:
37  *
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 3. All advertising materials mentioning features or use of this software
44  *    must display the following acknowledgements:
45  *	This product includes software developed by the University of
46  *	California, Berkeley and its contributors.
47  *	This product includes software developed at the Information
48  *	Technology Division, US Naval Research Laboratory.
49  * 4. Neither the name of the NRL nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
54  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
56  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
57  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
58  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
59  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
60  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
61  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
62  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64  *
65  * The views and conclusions contained in the software and documentation
66  * are those of the authors and should not be interpreted as representing
67  * official policies, either expressed or implied, of the US Naval
68  * Research Laboratory (NRL).
69  */
70 
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/mbuf.h>
74 #include <sys/socket.h>
75 #include <sys/protosw.h>
76 #include <sys/socketvar.h>
77 
78 #include <net/if.h>
79 #include <net/if_var.h>
80 #include <net/route.h>
81 
82 #include <netinet/in.h>
83 #include <netinet/ip.h>
84 #include <netinet/ip_mroute.h>
85 #include <netinet/ip_var.h>
86 #include <netinet/in_pcb.h>
87 #include <netinet/in_var.h>
88 #include <netinet/ip_icmp.h>
89 
90 #include <net/pfvar.h>
91 
92 #include "pf.h"
93 
/*
 * Table of protocol control blocks for raw IPv4 sockets.  It is set up
 * by rip_init(), gains entries through in_pcballoc() in rip_attach(),
 * and its inpt_queue is walked by rip_input() to find receivers.
 */
struct inpcbtable rawcbtable;
95 
/*
 * Nominal space allocated to a raw ip socket.
 * These are the initial values of rip_sendspace and rip_recvspace,
 * which rip_attach() hands to soreserve() (presumably byte counts).
 */
#define	RIPSNDQ		8192
#define	RIPRCVQ		8192
101 
/*
 * Raw interface to IP protocol.
 *
 * User-request dispatch table for raw IPv4 sockets.  The rip_*
 * handlers are the functions defined in this file; the control,
 * sockaddr and peeraddr requests are served by in_control(),
 * in_sockaddr() and in_peeraddr(), which are defined elsewhere.
 */

const struct pr_usrreqs rip_usrreqs = {
	.pru_attach	= rip_attach,
	.pru_detach	= rip_detach,
	.pru_lock	= rip_lock,
	.pru_unlock	= rip_unlock,
	.pru_bind	= rip_bind,
	.pru_connect	= rip_connect,
	.pru_disconnect	= rip_disconnect,
	.pru_shutdown	= rip_shutdown,
	.pru_send	= rip_send,
	.pru_control	= in_control,
	.pru_sockaddr	= in_sockaddr,
	.pru_peeraddr	= in_peeraddr,
};
120 
/*
 * Initialize raw connection block q.
 *
 * Sets up rawcbtable through in_pcbinit().  The second argument (1) is
 * presumably the requested hash size -- TODO confirm against
 * in_pcbinit(); rip_input() finds receivers by walking the table's
 * queue rather than by hash lookup.
 */
void
rip_init(void)
{
	in_pcbinit(&rawcbtable, 1);
}
129 
/*
 * Validate a caller-supplied IP header (see the definition below).
 * Arguments are the packet and the socket's stored IP options.
 * Returns the possibly relocated mbuf, or NULL with the packet freed.
 */
struct mbuf	*rip_chkhdr(struct mbuf *, struct mbuf *);
131 
132 int
133 rip_input(struct mbuf **mp, int *offp, int proto, int af)
134 {
135 	struct mbuf *m = *mp;
136 	struct ip *ip = mtod(m, struct ip *);
137 	struct inpcb *inp;
138 	SIMPLEQ_HEAD(, inpcb) inpcblist;
139 	struct in_addr *key;
140 	struct counters_ref ref;
141 	uint64_t *counters;
142 	struct sockaddr_in ripsrc;
143 
144 	KASSERT(af == AF_INET);
145 
146 	memset(&ripsrc, 0, sizeof(ripsrc));
147 	ripsrc.sin_family = AF_INET;
148 	ripsrc.sin_len = sizeof(ripsrc);
149 	ripsrc.sin_addr = ip->ip_src;
150 
151 	key = &ip->ip_dst;
152 #if NPF > 0
153 	if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) {
154 		struct pf_divert *divert;
155 
156 		divert = pf_find_divert(m);
157 		KASSERT(divert != NULL);
158 		switch (divert->type) {
159 		case PF_DIVERT_TO:
160 			key = &divert->addr.v4;
161 			break;
162 		case PF_DIVERT_REPLY:
163 			break;
164 		default:
165 			panic("%s: unknown divert type %d, mbuf %p, divert %p",
166 			    __func__, divert->type, m, divert);
167 		}
168 	}
169 #endif
170 	SIMPLEQ_INIT(&inpcblist);
171 	rw_enter_write(&rawcbtable.inpt_notify);
172 	mtx_enter(&rawcbtable.inpt_mtx);
173 	TAILQ_FOREACH(inp, &rawcbtable.inpt_queue, inp_queue) {
174 		KASSERT(!ISSET(inp->inp_flags, INP_IPV6));
175 
176 		if (inp->inp_socket->so_rcv.sb_state & SS_CANTRCVMORE)
177 			continue;
178 		if (rtable_l2(inp->inp_rtableid) !=
179 		    rtable_l2(m->m_pkthdr.ph_rtableid))
180 			continue;
181 
182 		if (inp->inp_ip.ip_p && inp->inp_ip.ip_p != ip->ip_p)
183 			continue;
184 		if (inp->inp_laddr.s_addr &&
185 		    inp->inp_laddr.s_addr != key->s_addr)
186 			continue;
187 		if (inp->inp_faddr.s_addr &&
188 		    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
189 			continue;
190 
191 		in_pcbref(inp);
192 		SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
193 	}
194 	mtx_leave(&rawcbtable.inpt_mtx);
195 
196 	if (SIMPLEQ_EMPTY(&inpcblist)) {
197 		rw_exit_write(&rawcbtable.inpt_notify);
198 
199 		if (ip->ip_p != IPPROTO_ICMP)
200 			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL,
201 			    0, 0);
202 		else
203 			m_freem(m);
204 
205 		counters = counters_enter(&ref, ipcounters);
206 		counters[ips_noproto]++;
207 		counters[ips_delivered]--;
208 		counters_leave(&ref, ipcounters);
209 
210 		return IPPROTO_DONE;
211 	}
212 
213 	while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
214 		struct mbuf *n, *opts = NULL;
215 
216 		SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
217 		if (SIMPLEQ_EMPTY(&inpcblist))
218 			n = m;
219 		else
220 			n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
221 		if (n != NULL) {
222 			int ret;
223 
224 			if (inp->inp_flags & INP_CONTROLOPTS ||
225 			    inp->inp_socket->so_options & SO_TIMESTAMP)
226 				ip_savecontrol(inp, &opts, ip, n);
227 
228 			mtx_enter(&inp->inp_mtx);
229 			ret = sbappendaddr(inp->inp_socket,
230 			    &inp->inp_socket->so_rcv,
231 			    sintosa(&ripsrc), n, opts);
232 			mtx_leave(&inp->inp_mtx);
233 
234 			if (ret == 0) {
235 				/* should notify about lost packet */
236 				m_freem(n);
237 				m_freem(opts);
238 			} else
239 				sorwakeup(inp->inp_socket);
240 		}
241 		in_pcbunref(inp);
242 	}
243 	rw_exit_write(&rawcbtable.inpt_notify);
244 
245 	return IPPROTO_DONE;
246 }
247 
248 /*
249  * Generate IP header and pass packet to ip_output.
250  * Tack on options user may have setup with control call.
251  */
252 int
253 rip_output(struct mbuf *m, struct socket *so, struct sockaddr *dstaddr,
254     struct mbuf *control)
255 {
256 	struct sockaddr_in *dst = satosin(dstaddr);
257 	struct ip *ip;
258 	struct inpcb *inp;
259 	int flags, error;
260 
261 	inp = sotoinpcb(so);
262 	flags = IP_ALLOWBROADCAST;
263 
264 	/*
265 	 * If the user handed us a complete IP packet, use it.
266 	 * Otherwise, allocate an mbuf for a header and fill it in.
267 	 */
268 	if ((inp->inp_flags & INP_HDRINCL) == 0) {
269 		if ((m->m_pkthdr.len + sizeof(struct ip)) > IP_MAXPACKET) {
270 			m_freem(m);
271 			return (EMSGSIZE);
272 		}
273 		M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
274 		if (!m)
275 			return (ENOBUFS);
276 		ip = mtod(m, struct ip *);
277 		ip->ip_tos = inp->inp_ip.ip_tos;
278 		ip->ip_off = htons(0);
279 		ip->ip_p = inp->inp_ip.ip_p;
280 		ip->ip_len = htons(m->m_pkthdr.len);
281 		ip->ip_src.s_addr = INADDR_ANY;
282 		ip->ip_dst = dst->sin_addr;
283 		ip->ip_ttl = inp->inp_ip.ip_ttl ? inp->inp_ip.ip_ttl : MAXTTL;
284 	} else {
285 		if (m->m_pkthdr.len > IP_MAXPACKET) {
286 			m_freem(m);
287 			return (EMSGSIZE);
288 		}
289 
290 		m = rip_chkhdr(m, inp->inp_options);
291 		if (m == NULL)
292 			return (EINVAL);
293 
294 		ip = mtod(m, struct ip *);
295 		if (ip->ip_id == 0)
296 			ip->ip_id = htons(ip_randomid());
297 		dst->sin_addr = ip->ip_dst;
298 
299 		/* XXX prevent ip_output from overwriting header fields */
300 		flags |= IP_RAWOUTPUT;
301 		ipstat_inc(ips_rawout);
302 	}
303 
304 	if (ip->ip_src.s_addr == INADDR_ANY) {
305 		error = in_pcbselsrc(&ip->ip_src, dst, inp);
306 		if (error != 0)
307 			return (error);
308 	}
309 
310 #ifdef INET6
311 	/*
312 	 * A thought:  Even though raw IP shouldn't be able to set IPv6
313 	 *             multicast options, if it does, the last parameter to
314 	 *             ip_output should be guarded against v6/v4 problems.
315 	 */
316 #endif
317 	/* force routing table */
318 	m->m_pkthdr.ph_rtableid = inp->inp_rtableid;
319 
320 #if NPF > 0
321 	if (inp->inp_socket->so_state & SS_ISCONNECTED &&
322 	    ip->ip_p != IPPROTO_ICMP)
323 		pf_mbuf_link_inpcb(m, inp);
324 #endif
325 
326 	error = ip_output(m, inp->inp_options, &inp->inp_route, flags,
327 	    inp->inp_moptions, inp->inp_seclevel, 0);
328 	return (error);
329 }
330 
331 struct mbuf *
332 rip_chkhdr(struct mbuf *m, struct mbuf *options)
333 {
334 	struct ip *ip;
335 	int hlen, opt, optlen, cnt;
336 	u_char *cp;
337 
338 	if (m->m_pkthdr.len < sizeof(struct ip)) {
339 		m_freem(m);
340 		return NULL;
341 	}
342 
343 	m = m_pullup(m, sizeof (struct ip));
344 	if (m == NULL)
345 		return NULL;
346 
347 	ip = mtod(m, struct ip *);
348 	hlen = ip->ip_hl << 2;
349 
350 	/* Don't allow packet length sizes that will crash. */
351 	if (hlen < sizeof (struct ip) ||
352 	    ntohs(ip->ip_len) < hlen ||
353 	    ntohs(ip->ip_len) != m->m_pkthdr.len) {
354 		m_freem(m);
355 		return NULL;
356 	}
357 	m = m_pullup(m, hlen);
358 	if (m == NULL)
359 		return NULL;
360 
361 	ip = mtod(m, struct ip *);
362 
363 	if (ip->ip_v != IPVERSION) {
364 		m_freem(m);
365 		return NULL;
366 	}
367 
368 	/*
369 	 * Don't allow both user specified and setsockopt options.
370 	 * If options are present verify them.
371 	 */
372 	if (hlen != sizeof(struct ip)) {
373 		if (options) {
374 			m_freem(m);
375 			return NULL;
376 		} else {
377 			cp = (u_char *)(ip + 1);
378 			cnt = hlen - sizeof(struct ip);
379 			for (; cnt > 0; cnt -= optlen, cp += optlen) {
380 				opt = cp[IPOPT_OPTVAL];
381 				if (opt == IPOPT_EOL)
382 					break;
383 				if (opt == IPOPT_NOP)
384 					optlen = 1;
385 				else {
386 					if (cnt < IPOPT_OLEN + sizeof(*cp)) {
387 						m_freem(m);
388 						return NULL;
389 					}
390 					optlen = cp[IPOPT_OLEN];
391 					if (optlen < IPOPT_OLEN + sizeof(*cp) ||
392 					    optlen > cnt) {
393 						m_freem(m);
394 						return NULL;
395 					}
396 				}
397 			}
398 		}
399 	}
400 
401 	return m;
402 }
403 
404 /*
405  * Raw IP socket option processing.
406  */
407 int
408 rip_ctloutput(int op, struct socket *so, int level, int optname,
409     struct mbuf *m)
410 {
411 	struct inpcb *inp = sotoinpcb(so);
412 	int error;
413 
414 	if (level != IPPROTO_IP)
415 		return (EINVAL);
416 
417 	switch (optname) {
418 
419 	case IP_HDRINCL:
420 		error = 0;
421 		if (op == PRCO_SETOPT) {
422 			if (m == NULL || m->m_len < sizeof (int))
423 				error = EINVAL;
424 			else if (*mtod(m, int *))
425 				inp->inp_flags |= INP_HDRINCL;
426 			else
427 				inp->inp_flags &= ~INP_HDRINCL;
428 		} else {
429 			m->m_len = sizeof(int);
430 			*mtod(m, int *) = inp->inp_flags & INP_HDRINCL;
431 		}
432 		return (error);
433 
434 	case MRT_INIT:
435 	case MRT_DONE:
436 	case MRT_ADD_VIF:
437 	case MRT_DEL_VIF:
438 	case MRT_ADD_MFC:
439 	case MRT_DEL_MFC:
440 	case MRT_VERSION:
441 	case MRT_ASSERT:
442 	case MRT_API_SUPPORT:
443 	case MRT_API_CONFIG:
444 #ifdef MROUTING
445 		switch (op) {
446 		case PRCO_SETOPT:
447 			error = ip_mrouter_set(so, optname, m);
448 			break;
449 		case PRCO_GETOPT:
450 			error = ip_mrouter_get(so, optname, m);
451 			break;
452 		default:
453 			error = EINVAL;
454 			break;
455 		}
456 		return (error);
457 #else
458 		return (EOPNOTSUPP);
459 #endif
460 	}
461 	return (ip_ctloutput(op, so, level, optname, m));
462 }
463 
/*
 * Send and receive buffer sizes that rip_attach() passes to soreserve()
 * for each new raw IP socket; they start out as RIPSNDQ and RIPRCVQ.
 */
u_long	rip_sendspace = RIPSNDQ;
u_long	rip_recvspace = RIPRCVQ;
466 
467 int
468 rip_attach(struct socket *so, int proto, int wait)
469 {
470 	struct inpcb *inp;
471 	int error;
472 
473 	if (so->so_pcb)
474 		panic("rip_attach");
475 	if ((so->so_state & SS_PRIV) == 0)
476 		return EACCES;
477 	if (proto < 0 || proto >= IPPROTO_MAX)
478 		return EPROTONOSUPPORT;
479 
480 	if ((error = soreserve(so, rip_sendspace, rip_recvspace)))
481 		return error;
482 	NET_ASSERT_LOCKED();
483 	if ((error = in_pcballoc(so, &rawcbtable, wait)))
484 		return error;
485 	inp = sotoinpcb(so);
486 	inp->inp_ip.ip_p = proto;
487 	return 0;
488 }
489 
490 int
491 rip_detach(struct socket *so)
492 {
493 	struct inpcb *inp = sotoinpcb(so);
494 
495 	soassertlocked(so);
496 
497 	if (inp == NULL)
498 		return (EINVAL);
499 
500 #ifdef MROUTING
501 	if (so == ip_mrouter[inp->inp_rtableid])
502 		ip_mrouter_done(so);
503 #endif
504 	in_pcbdetach(inp);
505 
506 	return (0);
507 }
508 
509 void
510 rip_lock(struct socket *so)
511 {
512 	struct inpcb *inp = sotoinpcb(so);
513 
514 	NET_ASSERT_LOCKED();
515 	mtx_enter(&inp->inp_mtx);
516 }
517 
518 void
519 rip_unlock(struct socket *so)
520 {
521 	struct inpcb *inp = sotoinpcb(so);
522 
523 	NET_ASSERT_LOCKED();
524 	mtx_leave(&inp->inp_mtx);
525 }
526 
527 int
528 rip_bind(struct socket *so, struct mbuf *nam, struct proc *p)
529 {
530 	struct inpcb *inp = sotoinpcb(so);
531 	struct sockaddr_in *addr;
532 	int error;
533 
534 	soassertlocked(so);
535 
536 	if ((error = in_nam2sin(nam, &addr)))
537 		return (error);
538 
539 	if (!((so->so_options & SO_BINDANY) ||
540 	    addr->sin_addr.s_addr == INADDR_ANY ||
541 	    addr->sin_addr.s_addr == INADDR_BROADCAST ||
542 	    in_broadcast(addr->sin_addr, inp->inp_rtableid) ||
543 	    ifa_ifwithaddr(sintosa(addr), inp->inp_rtableid)))
544 		return (EADDRNOTAVAIL);
545 
546 	mtx_enter(&rawcbtable.inpt_mtx);
547 	inp->inp_laddr = addr->sin_addr;
548 	mtx_leave(&rawcbtable.inpt_mtx);
549 
550 	return (0);
551 }
552 
553 int
554 rip_connect(struct socket *so, struct mbuf *nam)
555 {
556 	struct inpcb *inp = sotoinpcb(so);
557 	struct sockaddr_in *addr;
558 	int error;
559 
560 	soassertlocked(so);
561 
562 	if ((error = in_nam2sin(nam, &addr)))
563 		return (error);
564 
565 	mtx_enter(&rawcbtable.inpt_mtx);
566 	inp->inp_faddr = addr->sin_addr;
567 	mtx_leave(&rawcbtable.inpt_mtx);
568 	soisconnected(so);
569 
570 	return (0);
571 }
572 
573 int
574 rip_disconnect(struct socket *so)
575 {
576 	struct inpcb *inp = sotoinpcb(so);
577 
578 	soassertlocked(so);
579 
580 	if ((so->so_state & SS_ISCONNECTED) == 0)
581 		return (ENOTCONN);
582 
583 	soisdisconnected(so);
584 	mtx_enter(&rawcbtable.inpt_mtx);
585 	inp->inp_faddr.s_addr = INADDR_ANY;
586 	mtx_leave(&rawcbtable.inpt_mtx);
587 
588 	return (0);
589 }
590 
/*
 * Shut down the sending side of a raw IP socket.
 * The socket must be locked.  Always returns 0.
 */
int
rip_shutdown(struct socket *so)
{
	/*
	 * Mark the connection as being incapable of further output.
	 */

	soassertlocked(so);
	socantsendmore(so);

	return (0);
}
603 
604 int
605 rip_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
606     struct mbuf *control)
607 {
608 	struct inpcb *inp = sotoinpcb(so);
609 	struct sockaddr_in dst;
610 	int error;
611 
612 	soassertlocked(so);
613 
614 	/*
615 	 * Ship a packet out.  The appropriate raw output
616 	 * routine handles any massaging necessary.
617 	 */
618 	memset(&dst, 0, sizeof(dst));
619 	dst.sin_family = AF_INET;
620 	dst.sin_len = sizeof(dst);
621 	if (so->so_state & SS_ISCONNECTED) {
622 		if (nam) {
623 			error = EISCONN;
624 			goto out;
625 		}
626 		dst.sin_addr = inp->inp_faddr;
627 	} else {
628 		struct sockaddr_in *addr;
629 
630 		if (nam == NULL) {
631 			error = ENOTCONN;
632 			goto out;
633 		}
634 		if ((error = in_nam2sin(nam, &addr)))
635 			goto out;
636 		dst.sin_addr = addr->sin_addr;
637 	}
638 #ifdef IPSEC
639 	/* XXX Find an IPsec TDB */
640 #endif
641 	error = rip_output(m, so, sintosa(&dst), NULL);
642 	m = NULL;
643 
644 out:
645 	m_freem(control);
646 	m_freem(m);
647 
648 	return (error);
649 }
650