xref: /csrg-svn/sys/netinet/ip_input.c (revision 6583)
1 /*	ip_input.c	1.41	82/04/24	*/
2 
3 #include "../h/param.h"
4 #include "../h/systm.h"
5 #include "../h/clock.h"
6 #include "../h/mbuf.h"
7 #include "../h/protosw.h"
8 #include "../h/socket.h"
9 #include "../net/in.h"
10 #include "../net/in_systm.h"
11 #include "../net/if.h"
12 #include "../net/ip.h"			/* belongs before in.h */
13 #include "../net/ip_var.h"
14 #include "../net/ip_icmp.h"
15 #include "../net/tcp.h"
16 #include <errno.h>
17 
18 u_char	ip_protox[IPPROTO_MAX];
19 int	ipqmaxlen = IFQ_MAXLEN;
20 struct	ifnet *ifinet;			/* first inet interface */
21 
22 /*
23  * IP initialization: fill in IP protocol switch table.
24  * All protocols not implemented in kernel go to raw IP protocol handler.
25  */
26 ip_init()
27 {
28 	register struct protosw *pr;
29 	register int i;
30 
31 COUNT(IP_INIT);
32 	pr = pffindproto(PF_INET, IPPROTO_RAW);
33 	if (pr == 0)
34 		panic("ip_init");
35 	for (i = 0; i < IPPROTO_MAX; i++)
36 		ip_protox[i] = pr - protosw;
37 	for (pr = protosw; pr <= protoswLAST; pr++)
38 		if (pr->pr_family == PF_INET &&
39 		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW)
40 			ip_protox[pr->pr_protocol] = pr - protosw;
41 	ipq.next = ipq.prev = &ipq;
42 	ip_id = time & 0xffff;
43 	ipintrq.ifq_maxlen = ipqmaxlen;
44 	ifinet = if_ifwithaf(AF_INET);
45 }
46 
47 u_char	ipcksum = 1;
48 struct	ip *ip_reass();
49 struct	sockaddr_in ipaddr = { AF_INET };
50 
51 /*
52  * Ip input routine.  Checksum and byte swap header.  If fragmented
53  * try to reassamble.  If complete and fragment queue exists, discard.
54  * Process options.  Pass to next level.
55  */
56 ipintr()
57 {
58 	register struct ip *ip;
59 	register struct mbuf *m;
60 	struct mbuf *m0, *mopt;
61 	register int i;
62 	register struct ipq *fp;
63 	int hlen, s;
64 
65 COUNT(IPINTR);
66 next:
67 	/*
68 	 * Get next datagram off input queue and get IP header
69 	 * in first mbuf.
70 	 */
71 	s = splimp();
72 	IF_DEQUEUE(&ipintrq, m);
73 	splx(s);
74 	if (m == 0)
75 		return;
76 	if ((m->m_off > MMAXOFF || m->m_len < sizeof (struct ip)) &&
77 	    (m = m_pullup(m, sizeof (struct ip))) == 0)
78 		return;
79 	ip = mtod(m, struct ip *);
80 	if ((hlen = ip->ip_hl << 2) > m->m_len) {
81 		if ((m = m_pullup(m, hlen)) == 0)
82 			return;
83 		ip = mtod(m, struct ip *);
84 	}
85 	if (ipcksum)
86 		if (ip->ip_sum = in_cksum(m, hlen)) {
87 			printf("ip_sum %x\n", ip->ip_sum);	/* XXX */
88 			ipstat.ips_badsum++;
89 			goto bad;
90 		}
91 
92 #if vax
93 	/*
94 	 * Convert fields to host representation.
95 	 */
96 	ip->ip_len = ntohs((u_short)ip->ip_len);
97 	ip->ip_id = ntohs(ip->ip_id);
98 	ip->ip_off = ntohs((u_short)ip->ip_off);
99 #endif
100 
101 	/*
102 	 * Check that the amount of data in the buffers
103 	 * is as at least much as the IP header would have us expect.
104 	 * Trim mbufs if longer than we expect.
105 	 * Drop packet if shorter than we expect.
106 	 */
107 	i = -ip->ip_len;
108 	m0 = m;
109 	for (;;) {
110 		i += m->m_len;
111 		if (m->m_next == 0)
112 			break;
113 		m = m->m_next;
114 	}
115 	if (i != 0) {
116 		if (i < 0) {
117 			ipstat.ips_tooshort++;
118 			goto bad;
119 		}
120 		if (i <= m->m_len)
121 			m->m_len -= i;
122 		else
123 			m_adj(m0, -i);
124 	}
125 	m = m0;
126 
127 	/*
128 	 * Process options and, if not destined for us,
129 	 * ship it on.  ip_dooptions returns 1 when an
130 	 * error was detected (causing an icmp message
131 	 * to be sent).
132 	 */
133 	if (hlen > sizeof (struct ip) && ip_dooptions(ip))
134 		goto next;
135 
136 	/*
137 	 * Fast check on the first internet
138 	 * interface in the list.
139 	 */
140 	if (ifinet) {
141 		struct sockaddr_in *sin;
142 
143 		sin = (struct sockaddr_in *)&ifinet->if_addr;
144 		if (sin->sin_addr.s_addr == ip->ip_dst.s_addr)
145 			goto ours;
146 		sin = (struct sockaddr_in *)&ifinet->if_broadaddr;
147 		if ((ifinet->if_flags & IFF_BROADCAST) &&
148 		    sin->sin_addr.s_addr == ip->ip_dst.s_addr)
149 			goto ours;
150 	}
151 	ipaddr.sin_addr = ip->ip_dst;
152 	if (if_ifwithaddr((struct sockaddr *)&ipaddr) == 0) {
153 		ip_forward(ip);
154 		goto next;
155 	}
156 
157 ours:
158 	/*
159 	 * Look for queue of fragments
160 	 * of this datagram.
161 	 */
162 	for (fp = ipq.next; fp != &ipq; fp = fp->next)
163 		if (ip->ip_id == fp->ipq_id &&
164 		    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
165 		    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
166 		    ip->ip_p == fp->ipq_p)
167 			goto found;
168 	fp = 0;
169 found:
170 
171 	/*
172 	 * Adjust ip_len to not reflect header,
173 	 * set ip_mff if more fragments are expected,
174 	 * convert offset of this to bytes.
175 	 */
176 	ip->ip_len -= hlen;
177 	((struct ipasfrag *)ip)->ipf_mff = 0;
178 	if (ip->ip_off & IP_MF)
179 		((struct ipasfrag *)ip)->ipf_mff = 1;
180 	ip->ip_off <<= 3;
181 
182 	/*
183 	 * If datagram marked as having more fragments
184 	 * or if this is not the first fragment,
185 	 * attempt reassembly; if it succeeds, proceed.
186 	 */
187 	if (((struct ipasfrag *)ip)->ipf_mff || ip->ip_off) {
188 		ip = ip_reass((struct ipasfrag *)ip, fp);
189 		if (ip == 0)
190 			goto next;
191 		hlen = ip->ip_hl << 2;
192 		m = dtom(ip);
193 	} else
194 		if (fp)
195 			(void) ip_freef(fp);
196 
197 	/*
198 	 * Switch out to protocol's input routine.
199 	 */
200 	(*protosw[ip_protox[ip->ip_p]].pr_input)(m);
201 	goto next;
202 bad:
203 	m_freem(m);
204 	goto next;
205 }
206 
207 /*
208  * Take incoming datagram fragment and try to
209  * reassemble it into whole datagram.  If a chain for
210  * reassembly of this datagram already exists, then it
211  * is given as fp; otherwise have to make a chain.
212  */
213 struct ip *
214 ip_reass(ip, fp)
215 	register struct ipasfrag *ip;
216 	register struct ipq *fp;
217 {
218 	register struct mbuf *m = dtom(ip);
219 	register struct ipasfrag *q;
220 	struct mbuf *t;
221 	int hlen = ip->ip_hl << 2;
222 	int i, next;
223 COUNT(IP_REASS);
224 
225 	/*
226 	 * Presence of header sizes in mbufs
227 	 * would confuse code below.
228 	 */
229 	m->m_off += hlen;
230 	m->m_len -= hlen;
231 
232 	/*
233 	 * If first fragment to arrive, create a reassembly queue.
234 	 */
235 	if (fp == 0) {
236 		if ((t = m_get(M_WAIT)) == NULL)
237 			goto dropfrag;
238 		t->m_off = MMINOFF;
239 		fp = mtod(t, struct ipq *);
240 		insque(fp, &ipq);
241 		fp->ipq_ttl = IPFRAGTTL;
242 		fp->ipq_p = ip->ip_p;
243 		fp->ipq_id = ip->ip_id;
244 		fp->ipq_next = fp->ipq_prev = (struct ipasfrag *)fp;
245 		fp->ipq_src = ((struct ip *)ip)->ip_src;
246 		fp->ipq_dst = ((struct ip *)ip)->ip_dst;
247 		q = (struct ipasfrag *)fp;
248 		goto insert;
249 	}
250 
251 	/*
252 	 * Find a segment which begins after this one does.
253 	 */
254 	for (q = fp->ipq_next; q != (struct ipasfrag *)fp; q = q->ipf_next)
255 		if (q->ip_off > ip->ip_off)
256 			break;
257 
258 	/*
259 	 * If there is a preceding segment, it may provide some of
260 	 * our data already.  If so, drop the data from the incoming
261 	 * segment.  If it provides all of our data, drop us.
262 	 */
263 	if (q->ipf_prev != (struct ipasfrag *)fp) {
264 		i = q->ipf_prev->ip_off + q->ipf_prev->ip_len - ip->ip_off;
265 		if (i > 0) {
266 			if (i >= ip->ip_len)
267 				goto dropfrag;
268 			m_adj(dtom(ip), i);
269 			ip->ip_off += i;
270 			ip->ip_len -= i;
271 		}
272 	}
273 
274 	/*
275 	 * While we overlap succeeding segments trim them or,
276 	 * if they are completely covered, dequeue them.
277 	 */
278 	while (q != (struct ipasfrag *)fp && ip->ip_off + ip->ip_len > q->ip_off) {
279 		i = (ip->ip_off + ip->ip_len) - q->ip_off;
280 		if (i < q->ip_len) {
281 			q->ip_len -= i;
282 			q->ip_off += i;
283 			m_adj(dtom(q), i);
284 			break;
285 		}
286 		q = q->ipf_next;
287 		m_freem(dtom(q->ipf_prev));
288 		ip_deq(q->ipf_prev);
289 	}
290 
291 insert:
292 	/*
293 	 * Stick new segment in its place;
294 	 * check for complete reassembly.
295 	 */
296 	ip_enq(ip, q->ipf_prev);
297 	next = 0;
298 	for (q = fp->ipq_next; q != (struct ipasfrag *)fp; q = q->ipf_next) {
299 		if (q->ip_off != next)
300 			return (0);
301 		next += q->ip_len;
302 	}
303 	if (q->ipf_prev->ipf_mff)
304 		return (0);
305 
306 	/*
307 	 * Reassembly is complete; concatenate fragments.
308 	 */
309 	q = fp->ipq_next;
310 	m = dtom(q);
311 	t = m->m_next;
312 	m->m_next = 0;
313 	m_cat(m, t);
314 	q = q->ipf_next;
315 	while (q != (struct ipasfrag *)fp) {
316 		t = dtom(q);
317 		q = q->ipf_next;
318 		m_cat(m, t);
319 	}
320 
321 	/*
322 	 * Create header for new ip packet by
323 	 * modifying header of first packet;
324 	 * dequeue and discard fragment reassembly header.
325 	 * Make header visible.
326 	 */
327 	ip = fp->ipq_next;
328 	ip->ip_len = next;
329 	((struct ip *)ip)->ip_src = fp->ipq_src;
330 	((struct ip *)ip)->ip_dst = fp->ipq_dst;
331 	remque(fp);
332 	(void) m_free(dtom(fp));
333 	m = dtom(ip);
334 	m->m_len += sizeof (struct ipasfrag);
335 	m->m_off -= sizeof (struct ipasfrag);
336 	return ((struct ip *)ip);
337 
338 dropfrag:
339 	m_freem(m);
340 	return (0);
341 }
342 
343 /*
344  * Free a fragment reassembly header and all
345  * associated datagrams.
346  */
347 struct ipq *
348 ip_freef(fp)
349 	struct ipq *fp;
350 {
351 	register struct ipasfrag *q;
352 	struct mbuf *m;
353 COUNT(IP_FREEF);
354 
355 	for (q = fp->ipq_next; q != (struct ipasfrag *)fp; q = q->ipf_next)
356 		m_freem(dtom(q));
357 	m = dtom(fp);
358 	fp = fp->next;
359 	remque(fp->prev);
360 	(void) m_free(m);
361 	return (fp);
362 }
363 
364 /*
365  * Put an ip fragment on a reassembly chain.
366  * Like insque, but pointers in middle of structure.
367  */
368 ip_enq(p, prev)
369 	register struct ipasfrag *p, *prev;
370 {
371 
372 COUNT(IP_ENQ);
373 	p->ipf_prev = prev;
374 	p->ipf_next = prev->ipf_next;
375 	prev->ipf_next->ipf_prev = p;
376 	prev->ipf_next = p;
377 }
378 
379 /*
380  * To ip_enq as remque is to insque.
381  */
382 ip_deq(p)
383 	register struct ipasfrag *p;
384 {
385 
386 COUNT(IP_DEQ);
387 	p->ipf_prev->ipf_next = p->ipf_next;
388 	p->ipf_next->ipf_prev = p->ipf_prev;
389 }
390 
391 /*
392  * IP timer processing;
393  * if a timer expires on a reassembly
394  * queue, discard it.
395  */
396 ip_slowtimo()
397 {
398 	register struct ipq *fp;
399 	int s = splnet();
400 
401 COUNT(IP_SLOWTIMO);
402 	fp = ipq.next;
403 	if (fp == 0) {
404 		splx(s);
405 		return;
406 	}
407 	while (fp != &ipq)
408 		if (--fp->ipq_ttl == 0)
409 			fp = ip_freef(fp);
410 		else
411 			fp = fp->next;
412 	splx(s);
413 }
414 
415 /*
416  * Drain off all datagram fragments.
417  */
418 ip_drain()
419 {
420 
421 COUNT(IP_DRAIN);
422 	while (ipq.next != &ipq)
423 		(void) ip_freef(ipq.next);
424 }
425 
426 /*
427  * Do option processing on a datagram,
428  * possibly discarding it if bad options
429  * are encountered.
430  */
431 ip_dooptions(ip)
432 	struct ip *ip;
433 {
434 	register u_char *cp;
435 	int opt, optlen, cnt, code, type;
436 	struct in_addr *sin;
437 	register struct ip_timestamp *ipt;
438 	register struct ifnet *ifp;
439 	struct in_addr t;
440 
441 COUNT(IP_DOOPTIONS);
442 	cp = (u_char *)(ip + 1);
443 	cnt = (ip->ip_hl << 2) - sizeof (struct ip);
444 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
445 		opt = cp[0];
446 		if (opt == IPOPT_EOL)
447 			break;
448 		if (opt == IPOPT_NOP)
449 			optlen = 1;
450 		else
451 			optlen = cp[1];
452 		switch (opt) {
453 
454 		default:
455 			break;
456 
457 		/*
458 		 * Source routing with record.
459 		 * Find interface with current destination address.
460 		 * If none on this machine then drop if strictly routed,
461 		 * or do nothing if loosely routed.
462 		 * Record interface address and bring up next address
463 		 * component.  If strictly routed make sure next
464 		 * address on directly accessible net.
465 		 */
466 		case IPOPT_LSRR:
467 			if (cp[2] < 4 || cp[2] > optlen - (sizeof (long) - 1))
468 				break;
469 			sin = (struct in_addr *)(cp + cp[2]);
470 			ipaddr.sin_addr = *sin;
471 			ifp = if_ifwithaddr((struct sockaddr *)&ipaddr);
472 			type = ICMP_UNREACH, code = ICMP_UNREACH_SRCFAIL;
473 			if (ifp == 0) {
474 				if (opt == IPOPT_SSRR)
475 					goto bad;
476 				break;
477 			}
478 			t = ip->ip_dst; ip->ip_dst = *sin; *sin = t;
479 			cp[2] += 4;
480 			if (cp[2] > optlen - (sizeof (long) - 1))
481 				break;
482 			ip->ip_dst = sin[1];
483 			if (opt == IPOPT_SSRR &&
484 			    if_ifonnetof(ip->ip_dst.s_net) == 0)
485 				goto bad;
486 			break;
487 
488 		case IPOPT_TS:
489 			code = cp - (u_char *)ip;
490 			type = ICMP_PARAMPROB;
491 			ipt = (struct ip_timestamp *)cp;
492 			if (ipt->ipt_len < 5)
493 				goto bad;
494 			if (ipt->ipt_ptr > ipt->ipt_len - sizeof (long)) {
495 				if (++ipt->ipt_oflw == 0)
496 					goto bad;
497 				break;
498 			}
499 			sin = (struct in_addr *)(cp+cp[2]);
500 			switch (ipt->ipt_flg) {
501 
502 			case IPOPT_TS_TSONLY:
503 				break;
504 
505 			case IPOPT_TS_TSANDADDR:
506 				if (ipt->ipt_ptr + 8 > ipt->ipt_len)
507 					goto bad;
508 				if (ifinet == 0)
509 					goto bad;	/* ??? */
510 				*sin++ = ((struct sockaddr_in *)&ifinet->if_addr)->sin_addr;
511 				break;
512 
513 			case IPOPT_TS_PRESPEC:
514 				ipaddr.sin_addr = *sin;
515 				if (!if_ifwithaddr((struct sockaddr *)&ipaddr))
516 					continue;
517 				if (ipt->ipt_ptr + 8 > ipt->ipt_len)
518 					goto bad;
519 				ipt->ipt_ptr += 4;
520 				break;
521 
522 			default:
523 				goto bad;
524 			}
525 			*(n_time *)sin = iptime();
526 			ipt->ipt_ptr += 4;
527 		}
528 	}
529 	return (0);
530 bad:
531 	icmp_error(ip, type, code);
532 	return (1);
533 }
534 
535 /*
536  * Strip out IP options, at higher
537  * level protocol in the kernel.
538  * Second argument is buffer to which options
539  * will be moved, and return value is their length.
540  */
541 ip_stripoptions(ip, mopt)
542 	struct ip *ip;
543 	struct mbuf *mopt;
544 {
545 	register int i;
546 	register struct mbuf *m;
547 	int olen;
548 COUNT(IP_STRIPOPTIONS);
549 
550 	olen = (ip->ip_hl<<2) - sizeof (struct ip);
551 	m = dtom(ip);
552 	ip++;
553 	if (mopt) {
554 		mopt->m_len = olen;
555 		mopt->m_off = MMINOFF;
556 		bcopy((caddr_t)ip, mtod(m, caddr_t), (unsigned)olen);
557 	}
558 	i = m->m_len - (sizeof (struct ip) + olen);
559 	bcopy((caddr_t)ip+olen, (caddr_t)ip, (unsigned)i);
560 	m->m_len -= olen;
561 }
562 
563 static u_char ctlerrmap[] = {
564 	ECONNABORTED,	ECONNABORTED,	0,		0,
565 	0,
566 #ifdef notdef
567 	EHOSTUNREACH,	EHOSTDOWN,	ENETUNREACH,	EHOSTUNREACH,
568 #else
569 	ENETUNREACH,	ENETUNREACH,	ENETUNREACH,	ENETUNREACH,
570 #endif
571 	ECONNREFUSED,	ECONNREFUSED,	EMSGSIZE,	0,
572 	0,		0,		0,		0
573 };
574 
575 ip_ctlinput(cmd, arg)
576 	int cmd;
577 	caddr_t arg;
578 {
579 	struct in_addr *sin;
580 	extern int tcp_abort(), udp_abort();
581 	extern struct inpcb tcb, udb;
582 
583 	if (cmd < 0 || cmd > PRC_NCMDS)
584 		return;
585 	if (ctlerrmap[cmd] == 0)
586 		return;		/* XXX */
587 	if (cmd == PRC_IFDOWN)
588 		sin = &((struct sockaddr_in *)arg)->sin_addr;
589 	else if (cmd == PRC_HOSTDEAD || cmd == PRC_HOSTUNREACH)
590 		sin = (struct in_addr *)arg;
591 	else
592 		sin = &((struct icmp *)arg)->icmp_ip.ip_dst;
593 	in_pcbnotify(&tcb, sin, ctlerrmap[cmd], tcp_abort);
594 	in_pcbnotify(&udb, sin, ctlerrmap[cmd], udp_abort);
595 }
596 
/* When nonzero, ip_forward() prints each datagram it is asked to forward. */
int	ipprintfs = 0;

/* When zero, ip_forward() refuses to forward and reports unreachable. */
int	ipforwarding = 1;
599 /*
600  * Forward a packet.  If some error occurs return the sender
601  * and icmp packet.  Note we can't always generate a meaningful
602  * icmp message because icmp doesn't have a large enough repetoire
603  * of codes and types.
604  */
605 ip_forward(ip)
606 	register struct ip *ip;
607 {
608 	register int error, type, code;
609 	struct mbuf *mopt;
610 
611 	if (ipprintfs)
612 		printf("forward: src %x dst %x ttl %x\n", ip->ip_src,
613 			ip->ip_dst, ip->ip_ttl);
614 	if (ipforwarding == 0) {
615 		/* can't tell difference between net and host */
616 		type = ICMP_UNREACH, code = ICMP_UNREACH_NET;
617 		goto sendicmp;
618 	}
619 	if (ip->ip_ttl < IPTTLDEC) {
620 		type = ICMP_TIMXCEED, code = ICMP_TIMXCEED_INTRANS;
621 		goto sendicmp;
622 	}
623 	ip->ip_ttl -= IPTTLDEC;
624 	mopt = m_get(M_DONTWAIT);
625 	if (mopt == 0) {
626 		m_freem(dtom(ip));
627 		return;
628 	}
629 	ip_stripoptions(ip, mopt);
630 
631 	/* last 0 here means no directed broadcast */
632 	if ((error = ip_output(dtom(ip), mopt, 0, 0)) == 0)
633 		return;
634 #ifdef notdef
635 	/*
636 	 * Want to generate a message, but lower
637 	 * layers assume they can free up a message
638 	 * in the event of an error.  This causes
639 	 * the call to icmp_error to work on ``freed''
640 	 * mbuf's, and worse.
641 	 */
642 	type = ICMP_UNREACH, code = 0;	/* need ``undefined'' */
643 	if (error == ENETUNREACH || error == ENETDOWN)
644 		code = ICMP_UNREACH_NET;
645 	else if (error == EMSGSIZE)
646 		code = ICMP_UNREACH_NEEDFRAG;
647 #else
648 	return;
649 #endif
650 sendicmp:
651 	icmp_error(ip, type, code);
652 }
653