xref: /dflybsd-src/sys/kern/uipc_mbuf.c (revision b5b0912b1891e95ccc48cad83f09239ccb7ffc16)
1 /*
2  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
4  *
5  * This code is derived from software contributed to The DragonFly Project
6  * by Jeffrey M. Hsu.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of The DragonFly Project nor the names of its
17  *    contributors may be used to endorse or promote products derived
18  *    from this software without specific, prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 /*
35  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
36  *
37  * License terms: all terms for the DragonFly license above plus the following:
38  *
39  * 4. All advertising materials mentioning features or use of this software
40  *    must display the following acknowledgement:
41  *
42  *	This product includes software developed by Jeffrey M. Hsu
43  *	for the DragonFly Project.
44  *
45  *    This requirement may be waived with permission from Jeffrey Hsu.
46  *    This requirement will sunset and may be removed on July 8 2005,
47  *    after which the standard DragonFly license (as shown above) will
48  *    apply.
49  */
50 
51 /*
52  * Copyright (c) 1982, 1986, 1988, 1991, 1993
53  *	The Regents of the University of California.  All rights reserved.
54  *
55  * Redistribution and use in source and binary forms, with or without
56  * modification, are permitted provided that the following conditions
57  * are met:
58  * 1. Redistributions of source code must retain the above copyright
59  *    notice, this list of conditions and the following disclaimer.
60  * 2. Redistributions in binary form must reproduce the above copyright
61  *    notice, this list of conditions and the following disclaimer in the
62  *    documentation and/or other materials provided with the distribution.
63  * 3. All advertising materials mentioning features or use of this software
64  *    must display the following acknowledgement:
65  *	This product includes software developed by the University of
66  *	California, Berkeley and its contributors.
67  * 4. Neither the name of the University nor the names of its contributors
68  *    may be used to endorse or promote products derived from this software
69  *    without specific prior written permission.
70  *
71  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
72  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
73  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
74  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
75  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
76  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
77  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
78  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
79  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
80  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
81  * SUCH DAMAGE.
82  *
83  * @(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
84  * $FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.51.2.24 2003/04/15 06:59:29 silby Exp $
85  * $DragonFly: src/sys/kern/uipc_mbuf.c,v 1.40 2005/05/31 14:11:43 joerg Exp $
86  */
87 
88 #include "opt_param.h"
89 #include "opt_mbuf_stress_test.h"
90 #include <sys/param.h>
91 #include <sys/systm.h>
92 #include <sys/malloc.h>
93 #include <sys/mbuf.h>
94 #include <sys/kernel.h>
95 #include <sys/sysctl.h>
96 #include <sys/domain.h>
97 #include <sys/protosw.h>
98 #include <sys/uio.h>
99 #include <sys/thread.h>
100 #include <sys/globaldata.h>
101 #include <sys/thread2.h>
102 
103 #include <vm/vm.h>
104 #include <vm/vm_kern.h>
105 #include <vm/vm_extern.h>
106 
107 #ifdef INVARIANTS
108 #include <machine/cpu.h>
109 #endif
110 
111 /*
112  * mbuf cluster meta-data
113  */
114 typedef struct mbcluster {
115 	struct mbcluster *mcl_next;
116 	int32_t	mcl_magic;
117 	int32_t	mcl_refs;
118 	void	*mcl_data;
119 } *mbcluster_t;
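
/*
 * mcl_next links free clusters on the mclfree list, mcl_refs counts the
 * mbufs currently referencing the cluster, mcl_data points to the
 * MCLBYTES-sized data buffer, and mcl_magic (MCL_MAGIC while the
 * structure is valid) is checked by assertions to catch corruption and
 * use-after-free.
 */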
120 
121 typedef struct mbuf *mbuf_t;
122 
123 #define MCL_MAGIC	0x6d62636c
124 
125 static void mbinit (void *);
126 SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbinit, NULL)
127 
128 static u_long	mbtypes[MT_NTYPES];
129 
130 struct mbstat mbstat;
131 int	max_linkhdr;
132 int	max_protohdr;
133 int	max_hdr;
134 int	max_datalen;
135 int	m_defragpackets;
136 int	m_defragbytes;
137 int	m_defraguseless;
138 int	m_defragfailure;
139 #ifdef MBUF_STRESS_TEST
140 int	m_defragrandomfailures;
141 #endif
142 
143 int	nmbclusters;
144 int	nmbufs;
145 u_int	m_mballoc_wid = 0;
146 u_int	m_clalloc_wid = 0;
147 
148 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
149 	   &max_linkhdr, 0, "");
150 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
151 	   &max_protohdr, 0, "");
152 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, "");
153 SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
154 	   &max_datalen, 0, "");
155 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW,
156 	   &mbuf_wait, 0, "");
157 SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RW, &mbstat, mbstat, "");
158 SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mbtypes, CTLFLAG_RD, mbtypes,
159 	   sizeof(mbtypes), "LU", "");
160 SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RW,
161 	   &nmbclusters, 0, "Maximum number of mbuf clusters available");
162 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RW, &nmbufs, 0,
163 	   "Maximum number of mbufs available");
164 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
165 	   &m_defragpackets, 0, "");
166 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
167 	   &m_defragbytes, 0, "");
168 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
169 	   &m_defraguseless, 0, "");
170 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
171 	   &m_defragfailure, 0, "");
172 #ifdef MBUF_STRESS_TEST
173 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
174 	   &m_defragrandomfailures, 0, "");
175 #endif
176 
177 static int mcl_pool_count;
178 static int mcl_pool_max = 20;
179 static int mcl_free_max = 1000;
180 static int mbuf_free_max = 5000;
181 
182 SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_pool_max, CTLFLAG_RW, &mcl_pool_max, 0,
183            "Maximum number of mbuf+cluster pairs in the free list");
184 SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_pool_count, CTLFLAG_RD, &mcl_pool_count, 0,
185            "Current number of mbuf+cluster pairs in the free list");
186 SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_free_max, CTLFLAG_RW, &mcl_free_max, 0,
187            "Maximum number of clusters on the free list");
188 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_free_max, CTLFLAG_RW, &mbuf_free_max, 0,
189            "Maximum number of mbufs on the free list");
190 
191 static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf");
192 static MALLOC_DEFINE(M_MBUFCL, "mbufcl", "mbufcl");
193 
194 static mbuf_t mmbfree;
195 static mbcluster_t mclfree;
196 static struct mbuf *mcl_pool;
197 
198 static void m_reclaim (void);
199 static int m_mballoc(int nmb, int how);
200 static int m_clalloc(int ncl, int how);
201 static struct mbuf *m_mballoc_wait(int caller, int type);
202 static void m_mclref(void *arg);
203 static void m_mclfree(void *arg);
204 
205 #ifndef NMBCLUSTERS
206 #define NMBCLUSTERS	(512 + maxusers * 16)
207 #endif
208 #ifndef NMBUFS
209 #define NMBUFS		(nmbclusters * 4)
210 #endif
211 
212 /*
213  * Perform sanity checks of tunables declared above.
214  */
215 static void
216 tunable_mbinit(void *dummy)
217 {
218 
219 	/*
220 	 * This has to be done before VM init.
221 	 */
222 	nmbclusters = NMBCLUSTERS;
223 	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
224 	nmbufs = NMBUFS;
225 	TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
226 	/* Sanity checks */
227 	if (nmbufs < nmbclusters * 2)
228 		nmbufs = nmbclusters * 2;
229 
230 	return;
231 }
232 SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
233 
234 /* "number of clusters of pages" */
235 #define NCL_INIT	1
236 
237 #define NMB_INIT	16
238 
239 /* ARGSUSED*/
240 static void
241 mbinit(void *dummy)
242 {
243 	mmbfree = NULL;
244 	mclfree = NULL;
245 	mbstat.m_msize = MSIZE;
246 	mbstat.m_mclbytes = MCLBYTES;
247 	mbstat.m_minclsize = MINCLSIZE;
248 	mbstat.m_mlen = MLEN;
249 	mbstat.m_mhlen = MHLEN;
250 
251 	crit_enter();
252 	if (m_mballoc(NMB_INIT, MB_DONTWAIT) == 0)
253 		goto bad;
254 #if MCLBYTES <= PAGE_SIZE
255 	if (m_clalloc(NCL_INIT, MB_DONTWAIT) == 0)
256 		goto bad;
257 #else
258 	/* It's OK to call contigmalloc in this context. */
259 	if (m_clalloc(16, MB_WAIT) == 0)
260 		goto bad;
261 #endif
262 	crit_exit();
263 	return;
264 bad:
265 	crit_exit();
266 	panic("mbinit");
267 }
268 
269 /*
270  * Allocate at least nmb mbufs and place on mbuf free list.
271  * Returns the number of mbufs successfully allocated, 0 if none.
272  *
273  * Must be called while in a critical section.
274  */
275 static int
276 m_mballoc(int nmb, int how)
277 {
278 	int i;
279 	struct mbuf *m;
280 
281 	/*
282 	 * If we've hit the mbuf limit, stop allocating (or trying to)
283 	 * in order to avoid exhausting kernel memory entirely.
284 	 */
285 	if ((nmb + mbstat.m_mbufs) > nmbufs)
286 		return (0);
287 
288 	/*
289 	 * Attempt to allocate the requested number of mbufs.  Terminate when
290 	 * an allocation fails, but if blocking is allowed allocate at least
291 	 * one.
292 	 */
293 	for (i = 0; i < nmb; ++i) {
294 		m = malloc(MSIZE, M_MBUF, M_NOWAIT|M_NULLOK|M_ZERO);
295 		if (m == NULL) {
296 			if (how == MB_WAIT) {
297 				mbstat.m_wait++;
298 				m = malloc(MSIZE, M_MBUF,
299 					    M_WAITOK|M_NULLOK|M_ZERO);
300 			}
301 			if (m == NULL)
302 				break;
303 		}
304 		m->m_next = mmbfree;
305 		mmbfree = m;
306 		++mbstat.m_mbufs;
307 		++mbtypes[MT_FREE];
308 		how = MB_DONTWAIT;
309 	}
310 	return(i);
311 }
312 
313 /*
314  * Once mbuf memory has been exhausted, and the allocation macros (or, in
315  * some cases, functions) were called with MB_WAIT, we must rely solely on
316  * reclaimed mbufs.  Here we wait up to a designated time (mbuf_wait) for
317  * an mbuf to be freed.
318  */
319 static struct mbuf *
320 m_mballoc_wait(int caller, int type)
321 {
322 	struct mbuf *m;
323 
324 	crit_enter();
325 	m_mballoc_wid++;
326 	if ((tsleep(&m_mballoc_wid, 0, "mballc", mbuf_wait)) == EWOULDBLOCK)
327 		m_mballoc_wid--;
328 	crit_exit();
329 
330 	/*
331 	 * Now that we (think) we've got something, we will redo an
332 	 * MGET, but avoid getting into another instance of m_mballoc_wait().
333 	 * XXX: We retry the fetch _even_ if the sleep timed out.  This is left
334 	 *      this way, purposely, in the [unlikely] case that an mbuf was
335 	 *      freed but the sleep was not awakened in time.
336 	 */
337 	m = NULL;
338 	switch (caller) {
339 	case MGET_C:
340 		MGET(m, MB_DONTWAIT, type);
341 		break;
342 	case MGETHDR_C:
343 		MGETHDR(m, MB_DONTWAIT, type);
344 		break;
345 	default:
346 		panic("m_mballoc_wait: invalid caller (%d)", caller);
347 	}
348 
349 	crit_enter();
350 	if (m != NULL) {		/* We waited and got something... */
351 		mbstat.m_wait++;
352 		/* Wake up another if we have more free. */
353 		if (mmbfree != NULL)
354 			MMBWAKEUP();
355 	}
356 	crit_exit();
357 	return (m);
358 }
359 
360 #if MCLBYTES > PAGE_SIZE
361 static int i_want_my_mcl;
362 
363 static void
364 kproc_mclalloc(void)
365 {
366 	int status;
367 
368 	crit_enter();
369 	for (;;) {
370 		tsleep(&i_want_my_mcl, 0, "mclalloc", 0);
371 
372 		while (i_want_my_mcl > 0) {
373 			if (m_clalloc(1, MB_WAIT) == 0)
374 				printf("m_clalloc failed even in thread context!\n");
375 			--i_want_my_mcl;
376 		}
377 	}
378 	/* not reached */
379 	crit_exit();
380 }
381 
382 static struct thread *mclallocthread;
383 static struct kproc_desc mclalloc_kp = {
384 	"mclalloc",
385 	kproc_mclalloc,
386 	&mclallocthread
387 };
388 SYSINIT(mclallocthread, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
389 	   &mclalloc_kp);
390 #endif
391 
392 /*
393  * Allocate at least nmb mbuf clusters and place on mbuf free list.
394  * Returns the number of mbuf clusters successfully allocated, 0 if none.
395  *
396  * Must be called while in a critical section.
397  */
398 static int
399 m_clalloc(int ncl, int how)
400 {
401 	static int last_report;
402 	mbcluster_t mcl;
403 	void *data;
404 	int i;
405 
406 	/*
407 	 * If we've hit the mbuf cluster limit, stop allocating (or trying to).
408 	 */
409 	if ((ncl + mbstat.m_clusters) > nmbclusters)
410 		ncl = 0;
411 
412 	/*
413 	 * Attempt to allocate the requested number of mbuf clusters.
414 	 * Terminate when an allocation fails, but if blocking is allowed
415 	 * allocate at least one.
416 	 *
417 	 * We need to allocate two structures for each cluster... a
418 	 * ref counting / governing structure and the actual data.  MCLBYTES
419 	 * should be a power of 2 which means that the slab allocator will
420 	 * return a buffer that does not cross a page boundary.
421 	 */
422 	for (i = 0; i < ncl; ++i) {
423 		/*
424 		 * Meta structure
425 		 */
426 		mcl = malloc(sizeof(*mcl), M_MBUFCL, M_NOWAIT|M_NULLOK|M_ZERO);
427 		if (mcl == NULL) {
428 			if (how == MB_WAIT) {
429 				mbstat.m_wait++;
430 				mcl = malloc(sizeof(*mcl),
431 					    M_MBUFCL, M_WAITOK|M_NULLOK|M_ZERO);
432 			}
433 			if (mcl == NULL)
434 				break;
435 		}
436 
437 		/*
438 		 * Physically contiguous data buffer.
439 		 */
440 #if MCLBYTES > PAGE_SIZE
441 		if (how != MB_WAIT) {
442 			i_want_my_mcl += ncl - i;
443 			wakeup(&i_want_my_mcl);
444 			mbstat.m_wait++;
445 			data = NULL;
446 		} else {
447 			data = contigmalloc_map(MCLBYTES, M_MBUFCL,
448 				M_WAITOK, 0ul, ~0ul, PAGE_SIZE, 0, kernel_map);
449 		}
450 #else
451 		data = malloc(MCLBYTES, M_MBUFCL, M_NOWAIT|M_NULLOK);
452 		if (data == NULL) {
453 			if (how == MB_WAIT) {
454 				mbstat.m_wait++;
455 				data = malloc(MCLBYTES, M_MBUFCL,
456 						M_WAITOK|M_NULLOK);
457 			}
458 		}
459 #endif
460 		if (data == NULL) {
461 			free(mcl, M_MBUFCL);
462 			break;
463 		}
464 		mcl->mcl_next = mclfree;
465 		mcl->mcl_data = data;
466 		mcl->mcl_magic = MCL_MAGIC;
467 		mcl->mcl_refs = 0;
468 		mclfree = mcl;
469 		++mbstat.m_clfree;
470 		++mbstat.m_clusters;
471 		how = MB_DONTWAIT;
472 	}
473 
474 	/*
475 	 * If we could not allocate any, report the failure no more often than
476 	 * once a second.
477 	 */
478 	if (i == 0) {
479 		mbstat.m_drops++;
480 		if (ticks < last_report || (ticks - last_report) >= hz) {
481 			last_report = ticks;
482 			printf("All mbuf clusters exhausted, please see tuning(7).\n");
483 		}
484 	}
485 	return (i);
486 }
487 
488 /*
489  * Once cluster memory has been exhausted and the allocation is called with
490  * MB_WAIT, we rely on the mclfree list.  If nothing is free, we will
491  * sleep for a designated amount of time (mbuf_wait) or until we're woken up
492  * due to sudden mbuf cluster availability.
493  *
494  * Must be called while in a critical section.
495  */
496 static void
497 m_clalloc_wait(void)
498 {
499 	/* If in interrupt context, and INVARIANTS, maintain sanity and die. */
500 	KASSERT(mycpu->gd_intr_nesting_level == 0,
501 		("CLALLOC: CANNOT WAIT IN INTERRUPT"));
502 
503 	/*
504 	 * Sleep until something's available or until we expire.
505 	 */
506 	m_clalloc_wid++;
507 	if ((tsleep(&m_clalloc_wid, 0, "mclalc", mbuf_wait)) == EWOULDBLOCK)
508 		m_clalloc_wid--;
509 
510 	/*
511 	 * Try the allocation once more, and if we see more than two
512 	 * free entries, wake up others as well.
513 	 */
514 	m_clalloc(1, MB_WAIT);
515 	if (mclfree && mclfree->mcl_next) {
516 		MCLWAKEUP();
517 	}
518 }
519 
520 /*
521  * Return the number of references to this mbuf's data.  0 is returned
522  * if the mbuf is not M_EXT, a reference count is returned if it is
523  * M_EXT|M_EXT_CLUSTER, and 99 is returned if it is a special M_EXT.
524  */
525 int
526 m_sharecount(struct mbuf *m)
527 {
528     int count;
529 
530     switch(m->m_flags & (M_EXT|M_EXT_CLUSTER)) {
531     case 0:
532 	count = 0;
533 	break;
534     case M_EXT:
535 	count = 99;
536 	break;
537     case M_EXT|M_EXT_CLUSTER:
538 	count = ((mbcluster_t)m->m_ext.ext_arg)->mcl_refs;
539 	break;
540     default:
541 	panic("bad mbuf flags: %p", m);
542 	count = 0;
543     }
544     return(count);
545 }
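
/*
 * Illustrative sketch (an assumption about typical use, not code from
 * this file): before modifying cluster data in place a caller can make
 * sure it holds the only reference:
 *
 *	if ((m->m_flags & M_EXT) && m_sharecount(m) > 1) {
 *		(the data is shared or is a special M_EXT; copy it first)
 *	}
 *
 * The M_EXT_WRITABLE() test used in m_free() below performs a similar
 * writability check.
 */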
546 
547 /*
548  * change mbuf to new type
549  */
550 void
551 m_chtype(struct mbuf *m, int type)
552 {
553 	crit_enter();
554 	--mbtypes[m->m_type];
555 	++mbtypes[type];
556 	m->m_type = type;
557 	crit_exit();
558 }
559 
560 /*
561  * When MGET fails, ask protocols to free space when short of memory,
562  * then re-attempt to allocate an mbuf.
563  */
564 struct mbuf *
565 m_retry(int how, int t)
566 {
567 	struct mbuf *m;
568 
569 	/*
570 	 * Must only do the reclaim if not in an interrupt context.
571 	 */
572 	if (how == MB_WAIT) {
573 		KASSERT(mycpu->gd_intr_nesting_level == 0,
574 		    ("MBALLOC: CANNOT WAIT IN INTERRUPT"));
575 		m_reclaim();
576 	}
577 
578 	/*
579 	 * Try to pull a new mbuf out of the cache.  If the cache is empty,
580 	 * try to allocate a new one; if that doesn't work we give up.
581 	 */
582 	crit_enter();
583 	if ((m = mmbfree) == NULL) {
584 		m_mballoc(1, how);
585 		if ((m = mmbfree) == NULL) {
586 			static int last_report;
587 
588 			mbstat.m_drops++;
589 			crit_exit();
590 			if (ticks < last_report ||
591 			    (ticks - last_report) >= hz) {
592 				last_report = ticks;
593 				printf("All mbufs exhausted, please see tuning(7).\n");
594 			}
595 			return (NULL);
596 		}
597 	}
598 
599 	/*
600 	 * Cache case, adjust globals before leaving the critical section
601 	 */
602 	mmbfree = m->m_next;
603 	mbtypes[MT_FREE]--;
604 	mbtypes[t]++;
605 	mbstat.m_wait++;
606 	crit_exit();
607 
608 	m->m_type = t;
609 	m->m_next = NULL;
610 	m->m_nextpkt = NULL;
611 	m->m_data = m->m_dat;
612 	m->m_flags = 0;
613 	return (m);
614 }
615 
616 /*
617  * As above; retry an MGETHDR.
618  */
619 struct mbuf *
620 m_retryhdr(int how, int t)
621 {
622 	struct mbuf *m;
623 
624 	/*
625 	 * Must only do the reclaim if not in an interrupt context.
626 	 */
627 	if (how == MB_WAIT) {
628 		KASSERT(mycpu->gd_intr_nesting_level == 0,
629 		    ("MBALLOC: CANNOT WAIT IN INTERRUPT"));
630 		m_reclaim();
631 	}
632 
633 	/*
634 	 * Try to pull a new mbuf out of the cache.  If the cache is empty,
635 	 * try to allocate a new one; if that doesn't work we give up.
636 	 */
637 	crit_enter();
638 	if ((m = mmbfree) == NULL) {
639 		m_mballoc(1, how);
640 		if ((m = mmbfree) == NULL) {
641 			static int last_report;
642 
643 			mbstat.m_drops++;
644 			crit_exit();
645 			if (ticks < last_report ||
646 			    (ticks - last_report) >= hz) {
647 				last_report = ticks;
648 				printf("All mbufs exhausted, please see tuning(7).\n");
649 			}
650 			return (NULL);
651 		}
652 	}
653 
654 	/*
655 	 * Cache case, adjust globals before leaving the critical section
656 	 */
657 	mmbfree = m->m_next;
658 	mbtypes[MT_FREE]--;
659 	mbtypes[t]++;
660 	mbstat.m_wait++;
661 	crit_exit();
662 
663 	m->m_type = t;
664 	m->m_next = NULL;
665 	m->m_nextpkt = NULL;
666 	m->m_data = m->m_pktdat;
667 	m->m_flags = M_PKTHDR;
668 	m->m_pkthdr.rcvif = NULL;
669 	SLIST_INIT(&m->m_pkthdr.tags);
670 	m->m_pkthdr.csum_flags = 0;
671 	return (m);
672 }
673 
674 static void
675 m_reclaim(void)
676 {
677 	struct domain *dp;
678 	struct protosw *pr;
679 
680 	crit_enter();
681 	SLIST_FOREACH(dp, &domains, dom_next) {
682 		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
683 			if (pr->pr_drain)
684 				(*pr->pr_drain)();
685 		}
686 	}
687 	crit_exit();
688 	mbstat.m_drain++;
689 }
690 
691 /*
692  * Allocate an mbuf.  If no mbufs are immediately available try to
693  * bring a bunch more into our cache (mmbfree list).  A critical
694  * section is required to protect the mmbfree list and counters
695  * against interrupts.
696  */
697 struct mbuf *
698 m_get(int how, int type)
699 {
700 	struct mbuf *m;
701 
702 	/*
703 	 * Try to pull a new mbuf out of the cache.  If the cache is empty,
704 	 * try to allocate a new one and if that doesn't work try even harder
705 	 * by calling m_retry().
706 	 */
707 	crit_enter();
708 	if ((m = mmbfree) == NULL) {
709 		m_mballoc(1, how);
710 		if ((m = mmbfree) == NULL) {
711 			crit_exit();
712 			m = m_retry(how, type);
713 			if (m == NULL && how == MB_WAIT)
714 				m = m_mballoc_wait(MGET_C, type);
715 			return (m);
716 		}
717 	}
718 
719 	/*
720 	 * Cache case, adjust globals before leaving the critical section
721 	 */
722 	mmbfree = m->m_next;
723 	mbtypes[MT_FREE]--;
724 	mbtypes[type]++;
725 	crit_exit();
726 
727 	m->m_type = type;
728 	m->m_next = NULL;
729 	m->m_nextpkt = NULL;
730 	m->m_data = m->m_dat;
731 	m->m_flags = 0;
732 	return (m);
733 }
734 
735 struct mbuf *
736 m_gethdr(int how, int type)
737 {
738 	struct mbuf *m;
739 
740 	/*
741 	 * Try to pull a new mbuf out of the cache.  If the cache is empty,
742 	 * try to allocate a new one and if that doesn't work try even harder
743 	 * by calling m_retryhdr().
744 	 */
745 	crit_enter();
746 	if ((m = mmbfree) == NULL) {
747 		m_mballoc(1, how);
748 		if ((m = mmbfree) == NULL) {
749 			crit_exit();
750 			m = m_retryhdr(how, type);
751 			if (m == NULL && how == MB_WAIT)
752 				m = m_mballoc_wait(MGETHDR_C, type);
753 			return(m);
754 		}
755 	}
756 
757 	/*
758 	 * Cache case, adjust globals before leaving the critical section
759 	 */
760 	mmbfree = m->m_next;
761 	mbtypes[MT_FREE]--;
762 	mbtypes[type]++;
763 	crit_exit();
764 
765 	m->m_type = type;
766 	m->m_next = NULL;
767 	m->m_nextpkt = NULL;
768 	m->m_data = m->m_pktdat;
769 	m->m_flags = M_PKTHDR;
770 	m->m_pkthdr.rcvif = NULL;
771 	SLIST_INIT(&m->m_pkthdr.tags);
772 	m->m_pkthdr.csum_flags = 0;
773 	m->m_pkthdr.fw_flags = 0;
774 	return (m);
775 }
776 
777 struct mbuf *
778 m_getclr(int how, int type)
779 {
780 	struct mbuf *m;
781 
782 	if ((m = m_get(how, type)) != NULL) {
783 		bzero(mtod(m, caddr_t), MLEN);
784 	}
785 	return (m);
786 }
787 
788 /*
789  * m_getcl() returns an mbuf with an attached cluster.
790  * Because many network drivers use this kind of buffer a lot, it is
791  * convenient to keep a small pool of free buffers of this kind.
792  * Even a small size such as 10 gives about a 10% improvement in the
793  * forwarding rate in a bridge or router.
794  * The size of this free list is controlled by the sysctl variable
795  * mcl_pool_max.  The list is populated on m_free(), and used in
796  * m_getcl() if elements are available.
797  */
798 struct mbuf *
799 m_getcl(int how, short type, int flags)
800 {
801 	struct mbuf *mp;
802 
803 	crit_enter();
804 	if (flags & M_PKTHDR) {
805 		if (type == MT_DATA && mcl_pool) {
806 			mp = mcl_pool;
807 			mcl_pool = mp->m_nextpkt;
808 			--mcl_pool_count;
809 			crit_exit();
810 			mp->m_nextpkt = NULL;
811 			mp->m_data = mp->m_ext.ext_buf;
812 			mp->m_flags = M_PKTHDR|M_EXT|M_EXT_CLUSTER;
813 			mp->m_pkthdr.rcvif = NULL;
814 			mp->m_pkthdr.csum_flags = 0;
815 			return mp;
816 		}
817 		MGETHDR(mp, how, type);
818 	} else {
819 		MGET(mp, how, type);
820 	}
821 	if (mp) {
822 		m_mclget(mp, how);
823 		if ((mp->m_flags & M_EXT) == 0) {
824 			m_free(mp);
825 			mp = NULL;
826 		}
827 	}
828 	crit_exit();
829 	return (mp);
830 }
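
/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * a receive path typically allocates a packet-header mbuf with a cluster
 * in a single call, hitting the pool fast path above when possible:
 *
 *	m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
 *	if (m == NULL)
 *		return;			(drop the frame and retry later)
 *	m->m_len = m->m_pkthdr.len = frame_len;
 *
 * where 'frame_len' stands in for the length of the received frame.
 */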
831 
832 /*
833  * Allocate chain of requested length.
834  */
835 struct mbuf *
836 m_getc(int len, int how, int type)
837 {
838 	struct mbuf *n, *nfirst = NULL, **ntail = &nfirst;
839 	int nsize;
840 
841 	while (len > 0) {
842 		n = m_getl(len, how, type, 0, &nsize);
843 		if (n == NULL)
844 			goto failed;
845 		n->m_len = 0;
846 		*ntail = n;
847 		ntail = &n->m_next;
848 		len -= nsize;
849 	}
850 	return (nfirst);
851 
852 failed:
853 	m_freem(nfirst);
854 	return (NULL);
855 }
856 
857 /*
858  * Allocate len-worth of mbufs and/or mbuf clusters (whatever fits best)
859  * and return a pointer to the head of the allocated chain. If m0 is
860  * non-null, then we assume that it is a single mbuf or an mbuf chain to
861  * which we want len bytes worth of mbufs and/or clusters attached, and so
862  * if we succeed in allocating it, we will just return a pointer to m0.
863  *
864  * If we happen to fail at any point during the allocation, we will free
865  * up everything we have already allocated and return NULL.
866  *
867  * Deprecated.  Use m_getc() and m_cat() instead.
868  */
869 struct mbuf *
870 m_getm(struct mbuf *m0, int len, int how, int type)
871 {
872 	struct mbuf *nfirst;
873 
874 	nfirst = m_getc(len, how, type);
875 
876 	if (m0 != NULL) {
877 		m_last(m0)->m_next = nfirst;
878 		return (m0);
879 	}
880 
881 	return (nfirst);
882 }
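
/*
 * Illustrative sketch: per the comment above, new code should build the
 * extra chain with m_getc() and attach it explicitly (m_cat() can be used
 * when the data should be coalesced rather than merely linked):
 *
 *	n = m_getc(len, MB_WAIT, MT_DATA);
 *	if (n != NULL)
 *		m_last(m0)->m_next = n;
 *
 * which is essentially what this deprecated wrapper does internally.
 */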
883 
884 /*
885  * m_mclget() - Adds a cluster to a normal mbuf; M_EXT is set on success.
886  */
887 void
888 m_mclget(struct mbuf *m, int how)
889 {
890 	mbcluster_t mcl;
891 
892 	/*
893 	 * Allocate a cluster, return if we can't get one.
894 	 */
895 	crit_enter();
896 	if ((mcl = mclfree) == NULL) {
897 		m_clalloc(1, how);
898 		if ((mcl = mclfree) == NULL) {
899 			if (how == MB_WAIT) {
900 				m_clalloc_wait();
901 				mcl = mclfree;
902 			}
903 			if (mcl == NULL) {
904 				crit_exit();
905 				return;
906 			}
907 		}
908 	}
909 
910 	/*
911 	 * We have a cluster, unlink it from the free list and set the ref
912 	 * count.
913 	 */
914 	KKASSERT(mcl->mcl_refs == 0);
915 	mclfree = mcl->mcl_next;
916 	mcl->mcl_refs = 1;
917 	--mbstat.m_clfree;
918 	crit_exit();
919 
920 	/*
921 	 * Add the cluster to the mbuf.  The caller will detect that the
922 	 * mbuf now has an attached cluster.
923 	 */
924 	m->m_ext.ext_arg = mcl;
925 	m->m_ext.ext_buf = mcl->mcl_data;
926 	m->m_ext.ext_ref = m_mclref;
927 	m->m_ext.ext_free = m_mclfree;
928 	m->m_ext.ext_size = MCLBYTES;
929 
930 	m->m_data = m->m_ext.ext_buf;
931 	m->m_flags |= M_EXT | M_EXT_CLUSTER;
932 }
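
/*
 * Illustrative sketch: the usual pattern is to allocate the mbuf first,
 * then attach a cluster and check M_EXT to see whether the attach
 * succeeded.  m_getcl() above wraps exactly this sequence:
 *
 *	MGETHDR(m, MB_DONTWAIT, MT_DATA);
 *	if (m != NULL) {
 *		m_mclget(m, MB_DONTWAIT);
 *		if ((m->m_flags & M_EXT) == 0) {
 *			m_free(m);
 *			m = NULL;
 *		}
 *	}
 */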
933 
934 static void
935 m_mclfree(void *arg)
936 {
937 	mbcluster_t mcl = arg;
938 
939 	KKASSERT(mcl->mcl_magic == MCL_MAGIC);
940 	KKASSERT(mcl->mcl_refs > 0);
941 	crit_enter();
942 	if (--mcl->mcl_refs == 0) {
943 		if (mbstat.m_clfree < mcl_free_max) {
944 			mcl->mcl_next = mclfree;
945 			mclfree = mcl;
946 			++mbstat.m_clfree;
947 			MCLWAKEUP();
948 		} else {
949 			mcl->mcl_magic = -1;
950 			free(mcl->mcl_data, M_MBUFCL);
951 			free(mcl, M_MBUFCL);
952 			--mbstat.m_clusters;
953 		}
954 	}
955 	crit_exit();
956 }
957 
958 static void
959 m_mclref(void *arg)
960 {
961 	mbcluster_t mcl = arg;
962 
963 	KKASSERT(mcl->mcl_magic == MCL_MAGIC);
964 	crit_enter();
965 	++mcl->mcl_refs;
966 	crit_exit();
967 }
968 
969 /*
970  * Helper routines for M_EXT reference/free
971  */
972 static __inline void
973 m_extref(const struct mbuf *m)
974 {
975 	KKASSERT(m->m_ext.ext_free != NULL);
976 	crit_enter();
977 	m->m_ext.ext_ref(m->m_ext.ext_arg);
978 	crit_exit();
979 }
980 
981 /*
982  * m_free()
983  *
984  * Free a single mbuf and any associated external storage.  The successor,
985  * if any, is returned.
986  *
987  * We do need to check a non-first mbuf for m_aux, since some existing
988  * code does not call M_PREPEND properly.
989  * (example: call to bpf_mtap from drivers)
990  */
991 struct mbuf *
992 m_free(struct mbuf *m)
993 {
994 	struct mbuf *n;
995 
996 	crit_enter();
997 	KASSERT(m->m_type != MT_FREE, ("freeing free mbuf %p", m));
998 
999 	/*
1000 	 * Adjust our type count and delete any attached chains if the
1001 	 * mbuf is a packet header.
1002 	 */
1003 	if ((m->m_flags & M_PKTHDR) != 0)
1004 		m_tag_delete_chain(m, NULL);
1005 
1006 	/*
1007 	 * Place the mbuf on the appropriate free list.  Try to maintain a
1008 	 * small cache of mbuf+cluster pairs.
1009 	 */
1010 	n = m->m_next;
1011 	m->m_next = NULL;
1012 	if (m->m_flags & M_EXT) {
1013 		KKASSERT(m->m_ext.ext_free != NULL);
1014 		if (mcl_pool_count < mcl_pool_max && m && m->m_next == NULL &&
1015 		    (m->m_flags & (M_PKTHDR|M_EXT_CLUSTER)) == (M_PKTHDR|M_EXT_CLUSTER) &&
1016 		    m->m_type == MT_DATA && M_EXT_WRITABLE(m) ) {
1017 			KKASSERT(((mbcluster_t)m->m_ext.ext_arg)->mcl_magic == MCL_MAGIC);
1018 			m->m_nextpkt = mcl_pool;
1019 			mcl_pool = m;
1020 			++mcl_pool_count;
1021 			m = NULL;
1022 		} else {
1023 			m->m_ext.ext_free(m->m_ext.ext_arg);
1024 			m->m_flags = 0;
1025 			m->m_ext.ext_arg = NULL;
1026 			m->m_ext.ext_ref = NULL;
1027 			m->m_ext.ext_free = NULL;
1028 		}
1029 	}
1030 	if (m) {
1031 		--mbtypes[m->m_type];
1032 		if (mbtypes[MT_FREE] < mbuf_free_max) {
1033 			m->m_type = MT_FREE;
1034 			mbtypes[MT_FREE]++;
1035 			m->m_next = mmbfree;
1036 			mmbfree = m;
1037 			MMBWAKEUP();
1038 		} else {
1039 			free(m, M_MBUF);
1040 			--mbstat.m_mbufs;
1041 		}
1042 	}
1043 	crit_exit();
1044 	return (n);
1045 }
1046 
1047 void
1048 m_freem(struct mbuf *m)
1049 {
1050 	crit_enter();
1051 	while (m)
1052 		m = m_free(m);
1053 	crit_exit();
1054 }
1055 
1056 /*
1057  * mbuf utility routines
1058  */
1059 
1060 /*
1061  * Lesser-used path for M_PREPEND:
1062  * allocate new mbuf to prepend to chain,
1063  * copy junk along.
1064  */
1065 struct mbuf *
1066 m_prepend(struct mbuf *m, int len, int how)
1067 {
1068 	struct mbuf *mn;
1069 
1070 	MGET(mn, how, m->m_type);
1071 	if (mn == (struct mbuf *)NULL) {
1072 		m_freem(m);
1073 		return ((struct mbuf *)NULL);
1074 	}
1075 	if (m->m_flags & M_PKTHDR)
1076 		M_MOVE_PKTHDR(mn, m);
1077 	mn->m_next = m;
1078 	m = mn;
1079 	if (len < MHLEN)
1080 		MH_ALIGN(m, len);
1081 	m->m_len = len;
1082 	return (m);
1083 }
1084 
1085 /*
1086  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
1087  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
1088  * The wait parameter is a choice of MB_WAIT/MB_DONTWAIT from caller.
1089  * Note that the copy is read-only, because clusters are not copied,
1090  * only their reference counts are incremented.
1091  */
1092 #define MCFail (mbstat.m_mcfail)
1093 
1094 struct mbuf *
1095 m_copym(const struct mbuf *m, int off0, int len, int wait)
1096 {
1097 	struct mbuf *n, **np;
1098 	int off = off0;
1099 	struct mbuf *top;
1100 	int copyhdr = 0;
1101 
1102 	KASSERT(off >= 0, ("m_copym, negative off %d", off));
1103 	KASSERT(len >= 0, ("m_copym, negative len %d", len));
1104 	if (off == 0 && m->m_flags & M_PKTHDR)
1105 		copyhdr = 1;
1106 	while (off > 0) {
1107 		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
1108 		if (off < m->m_len)
1109 			break;
1110 		off -= m->m_len;
1111 		m = m->m_next;
1112 	}
1113 	np = &top;
1114 	top = 0;
1115 	while (len > 0) {
1116 		if (m == 0) {
1117 			KASSERT(len == M_COPYALL,
1118 			    ("m_copym, length > size of mbuf chain"));
1119 			break;
1120 		}
1121 		MGET(n, wait, m->m_type);
1122 		*np = n;
1123 		if (n == 0)
1124 			goto nospace;
1125 		if (copyhdr) {
1126 			if (!m_dup_pkthdr(n, m, wait))
1127 				goto nospace;
1128 			if (len == M_COPYALL)
1129 				n->m_pkthdr.len -= off0;
1130 			else
1131 				n->m_pkthdr.len = len;
1132 			copyhdr = 0;
1133 		}
1134 		n->m_len = min(len, m->m_len - off);
1135 		if (m->m_flags & M_EXT) {
1136 			n->m_data = m->m_data + off;
1137 			m_extref(m);
1138 			n->m_ext = m->m_ext;
1139 			n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1140 		} else {
1141 			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
1142 			    (unsigned)n->m_len);
1143 		}
1144 		if (len != M_COPYALL)
1145 			len -= n->m_len;
1146 		off = 0;
1147 		m = m->m_next;
1148 		np = &n->m_next;
1149 	}
1150 	if (top == 0)
1151 		MCFail++;
1152 	return (top);
1153 nospace:
1154 	m_freem(top);
1155 	MCFail++;
1156 	return (0);
1157 }
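
/*
 * Illustrative sketch: take a reference-counted copy of the first
 * 'hdrlen' bytes of a packet ('hdrlen' is a hypothetical length):
 *
 *	n = m_copym(m, 0, hdrlen, MB_DONTWAIT);
 *	if (n == NULL)
 *		return;			(allocation failed)
 *
 * Because clusters are shared rather than copied, 'n' must be treated as
 * read-only; use m_dup() when a writable copy is needed.
 */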
1158 
1159 /*
1160  * Copy an entire packet, including header (which must be present).
1161  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
1162  * Note that the copy is read-only, because clusters are not copied,
1163  * only their reference counts are incremented.
1164  * Preserve alignment of the first mbuf so if the creator has left
1165  * some room at the beginning (e.g. for inserting protocol headers)
1166  * the copies also have the room available.
1167  */
1168 struct mbuf *
1169 m_copypacket(struct mbuf *m, int how)
1170 {
1171 	struct mbuf *top, *n, *o;
1172 
1173 	MGET(n, how, m->m_type);
1174 	top = n;
1175 	if (!n)
1176 		goto nospace;
1177 
1178 	if (!m_dup_pkthdr(n, m, how))
1179 		goto nospace;
1180 	n->m_len = m->m_len;
1181 	if (m->m_flags & M_EXT) {
1182 		n->m_data = m->m_data;
1183 		m_extref(m);
1184 		n->m_ext = m->m_ext;
1185 		n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1186 	} else {
1187 		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat);
1188 		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1189 	}
1190 
1191 	m = m->m_next;
1192 	while (m) {
1193 		MGET(o, how, m->m_type);
1194 		if (!o)
1195 			goto nospace;
1196 
1197 		n->m_next = o;
1198 		n = n->m_next;
1199 
1200 		n->m_len = m->m_len;
1201 		if (m->m_flags & M_EXT) {
1202 			n->m_data = m->m_data;
1203 			m_extref(m);
1204 			n->m_ext = m->m_ext;
1205 			n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1206 		} else {
1207 			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1208 		}
1209 
1210 		m = m->m_next;
1211 	}
1212 	return top;
1213 nospace:
1214 	m_freem(top);
1215 	MCFail++;
1216 	return 0;
1217 }
1218 
1219 /*
1220  * Copy data from an mbuf chain starting "off" bytes from the beginning,
1221  * continuing for "len" bytes, into the indicated buffer.
1222  */
1223 void
1224 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
1225 {
1226 	unsigned count;
1227 
1228 	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
1229 	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
1230 	while (off > 0) {
1231 		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
1232 		if (off < m->m_len)
1233 			break;
1234 		off -= m->m_len;
1235 		m = m->m_next;
1236 	}
1237 	while (len > 0) {
1238 		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
1239 		count = min(m->m_len - off, len);
1240 		bcopy(mtod(m, caddr_t) + off, cp, count);
1241 		len -= count;
1242 		cp += count;
1243 		off = 0;
1244 		m = m->m_next;
1245 	}
1246 }
1247 
1248 /*
1249  * Copy a packet header mbuf chain into a completely new chain, including
1250  * copying any mbuf clusters.  Use this instead of m_copypacket() when
1251  * you need a writable copy of an mbuf chain.
1252  */
1253 struct mbuf *
1254 m_dup(struct mbuf *m, int how)
1255 {
1256 	struct mbuf **p, *top = NULL;
1257 	int remain, moff, nsize;
1258 
1259 	/* Sanity check */
1260 	if (m == NULL)
1261 		return (NULL);
1262 	KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __func__));
1263 
1264 	/* While there's more data, get a new mbuf, tack it on, and fill it */
1265 	remain = m->m_pkthdr.len;
1266 	moff = 0;
1267 	p = &top;
1268 	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
1269 		struct mbuf *n;
1270 
1271 		/* Get the next new mbuf */
1272 		n = m_getl(remain, how, m->m_type, top == NULL ? M_PKTHDR : 0,
1273 			   &nsize);
1274 		if (n == NULL)
1275 			goto nospace;
1276 		if (top == NULL)
1277 			if (!m_dup_pkthdr(n, m, how))
1278 				goto nospace0;
1279 
1280 		/* Link it into the new chain */
1281 		*p = n;
1282 		p = &n->m_next;
1283 
1284 		/* Copy data from original mbuf(s) into new mbuf */
1285 		n->m_len = 0;
1286 		while (n->m_len < nsize && m != NULL) {
1287 			int chunk = min(nsize - n->m_len, m->m_len - moff);
1288 
1289 			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
1290 			moff += chunk;
1291 			n->m_len += chunk;
1292 			remain -= chunk;
1293 			if (moff == m->m_len) {
1294 				m = m->m_next;
1295 				moff = 0;
1296 			}
1297 		}
1298 
1299 		/* Check correct total mbuf length */
1300 		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
1301 			("%s: bogus m_pkthdr.len", __func__));
1302 	}
1303 	return (top);
1304 
1305 nospace:
1306 	m_freem(top);
1307 nospace0:
1308 	mbstat.m_mcfail++;
1309 	return (NULL);
1310 }
1311 
1312 /*
1313  * Concatenate mbuf chain n to m.
1314  * Both chains must be of the same type (e.g. MT_DATA).
1315  * Any m_pkthdr is not updated.
1316  */
1317 void
1318 m_cat(struct mbuf *m, struct mbuf *n)
1319 {
1320 	m = m_last(m);
1321 	while (n) {
1322 		if (m->m_flags & M_EXT ||
1323 		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
1324 			/* just join the two chains */
1325 			m->m_next = n;
1326 			return;
1327 		}
1328 		/* splat the data from one into the other */
1329 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1330 		    (u_int)n->m_len);
1331 		m->m_len += n->m_len;
1332 		n = m_free(n);
1333 	}
1334 }
1335 
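/*
 * m_adj() - trim req_len bytes from the mbuf chain pointed to by mp.
 * A positive req_len removes data from the front of the chain, a negative
 * req_len removes data from the end.  If the chain carries a packet
 * header, m_pkthdr.len is adjusted to match.
 */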
1336 void
1337 m_adj(struct mbuf *mp, int req_len)
1338 {
1339 	int len = req_len;
1340 	struct mbuf *m;
1341 	int count;
1342 
1343 	if ((m = mp) == NULL)
1344 		return;
1345 	if (len >= 0) {
1346 		/*
1347 		 * Trim from head.
1348 		 */
1349 		while (m != NULL && len > 0) {
1350 			if (m->m_len <= len) {
1351 				len -= m->m_len;
1352 				m->m_len = 0;
1353 				m = m->m_next;
1354 			} else {
1355 				m->m_len -= len;
1356 				m->m_data += len;
1357 				len = 0;
1358 			}
1359 		}
1360 		m = mp;
1361 		if (mp->m_flags & M_PKTHDR)
1362 			m->m_pkthdr.len -= (req_len - len);
1363 	} else {
1364 		/*
1365 		 * Trim from tail.  Scan the mbuf chain,
1366 		 * calculating its length and finding the last mbuf.
1367 		 * If the adjustment only affects this mbuf, then just
1368 		 * adjust and return.  Otherwise, rescan and truncate
1369 		 * after the remaining size.
1370 		 */
1371 		len = -len;
1372 		count = 0;
1373 		for (;;) {
1374 			count += m->m_len;
1375 			if (m->m_next == (struct mbuf *)0)
1376 				break;
1377 			m = m->m_next;
1378 		}
1379 		if (m->m_len >= len) {
1380 			m->m_len -= len;
1381 			if (mp->m_flags & M_PKTHDR)
1382 				mp->m_pkthdr.len -= len;
1383 			return;
1384 		}
1385 		count -= len;
1386 		if (count < 0)
1387 			count = 0;
1388 		/*
1389 		 * Correct length for chain is "count".
1390 		 * Find the mbuf with last data, adjust its length,
1391 		 * and toss data from remaining mbufs on chain.
1392 		 */
1393 		m = mp;
1394 		if (m->m_flags & M_PKTHDR)
1395 			m->m_pkthdr.len = count;
1396 		for (; m; m = m->m_next) {
1397 			if (m->m_len >= count) {
1398 				m->m_len = count;
1399 				break;
1400 			}
1401 			count -= m->m_len;
1402 		}
1403 		while (m->m_next)
1404 			(m = m->m_next)->m_len = 0;
1405 	}
1406 }
1407 
1408 /*
1409  * Rearrange an mbuf chain so that len bytes are contiguous
1410  * and in the data area of an mbuf (so that mtod will work for a structure
1411  * of size len).  Returns the resulting mbuf chain on success, frees it and
1412  * returns null on failure.  If there is room, it will add up to
1413  * max_protohdr-len extra bytes to the contiguous region in an attempt to
1414  * avoid being called next time.
1415  */
1416 #define MPFail (mbstat.m_mpfail)
1417 
1418 struct mbuf *
1419 m_pullup(struct mbuf *n, int len)
1420 {
1421 	struct mbuf *m;
1422 	int count;
1423 	int space;
1424 
1425 	/*
1426 	 * If first mbuf has no cluster, and has room for len bytes
1427 	 * without shifting current data, pullup into it,
1428 	 * otherwise allocate a new mbuf to prepend to the chain.
1429 	 */
1430 	if ((n->m_flags & M_EXT) == 0 &&
1431 	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
1432 		if (n->m_len >= len)
1433 			return (n);
1434 		m = n;
1435 		n = n->m_next;
1436 		len -= m->m_len;
1437 	} else {
1438 		if (len > MHLEN)
1439 			goto bad;
1440 		MGET(m, MB_DONTWAIT, n->m_type);
1441 		if (m == 0)
1442 			goto bad;
1443 		m->m_len = 0;
1444 		if (n->m_flags & M_PKTHDR)
1445 			M_MOVE_PKTHDR(m, n);
1446 	}
1447 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
1448 	do {
1449 		count = min(min(max(len, max_protohdr), space), n->m_len);
1450 		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1451 		  (unsigned)count);
1452 		len -= count;
1453 		m->m_len += count;
1454 		n->m_len -= count;
1455 		space -= count;
1456 		if (n->m_len)
1457 			n->m_data += count;
1458 		else
1459 			n = m_free(n);
1460 	} while (len > 0 && n);
1461 	if (len > 0) {
1462 		(void) m_free(m);
1463 		goto bad;
1464 	}
1465 	m->m_next = n;
1466 	return (m);
1467 bad:
1468 	m_freem(n);
1469 	MPFail++;
1470 	return (0);
1471 }
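
/*
 * Illustrative sketch: the canonical use is to guarantee that a protocol
 * header is contiguous before dereferencing it through mtod()
 * ('hdrlen' and 'struct some_header' are hypothetical placeholders):
 *
 *	if (m->m_len < hdrlen && (m = m_pullup(m, hdrlen)) == NULL)
 *		return;			(the chain was freed by m_pullup)
 *	hp = mtod(m, struct some_header *);
 */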
1472 
1473 /*
1474  * Partition an mbuf chain into two pieces, returning the tail --
1475  * all but the first len0 bytes.  In case of failure, it returns NULL and
1476  * attempts to restore the chain to its original state.
1477  *
1478  * Note that the resulting mbufs might be read-only, because the new
1479  * mbuf can end up sharing an mbuf cluster with the original mbuf if
1480  * the "breaking point" happens to lie within a cluster mbuf. Use the
1481  * M_WRITABLE() macro to check for this case.
1482  */
1483 struct mbuf *
1484 m_split(struct mbuf *m0, int len0, int wait)
1485 {
1486 	struct mbuf *m, *n;
1487 	unsigned len = len0, remain;
1488 
1489 	for (m = m0; m && len > m->m_len; m = m->m_next)
1490 		len -= m->m_len;
1491 	if (m == 0)
1492 		return (0);
1493 	remain = m->m_len - len;
1494 	if (m0->m_flags & M_PKTHDR) {
1495 		MGETHDR(n, wait, m0->m_type);
1496 		if (n == 0)
1497 			return (0);
1498 		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1499 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1500 		m0->m_pkthdr.len = len0;
1501 		if (m->m_flags & M_EXT)
1502 			goto extpacket;
1503 		if (remain > MHLEN) {
1504 			/* m can't be the lead packet */
1505 			MH_ALIGN(n, 0);
1506 			n->m_next = m_split(m, len, wait);
1507 			if (n->m_next == 0) {
1508 				(void) m_free(n);
1509 				return (0);
1510 			} else {
1511 				n->m_len = 0;
1512 				return (n);
1513 			}
1514 		} else
1515 			MH_ALIGN(n, remain);
1516 	} else if (remain == 0) {
1517 		n = m->m_next;
1518 		m->m_next = 0;
1519 		return (n);
1520 	} else {
1521 		MGET(n, wait, m->m_type);
1522 		if (n == 0)
1523 			return (0);
1524 		M_ALIGN(n, remain);
1525 	}
1526 extpacket:
1527 	if (m->m_flags & M_EXT) {
1528 		n->m_data = m->m_data + len;
1529 		m_extref(m);
1530 		n->m_ext = m->m_ext;
1531 		n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1532 	} else {
1533 		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
1534 	}
1535 	n->m_len = remain;
1536 	m->m_len = len;
1537 	n->m_next = m->m_next;
1538 	m->m_next = 0;
1539 	return (n);
1540 }
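
/*
 * Illustrative sketch: split off everything past the first 'reclen'
 * bytes of a record ('reclen' is hypothetical):
 *
 *	tail = m_split(m, reclen, MB_WAIT);
 *	if (tail == NULL)
 *		return;			(m_split failed; 'm' was restored)
 *
 * As noted above, 'tail' may share clusters with 'm'; check M_WRITABLE()
 * before modifying either piece.
 */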
1541 
1542 /*
1543  * Routine to copy from device local memory into mbufs.
1544  * Note: "offset" is ill-defined and always called as 0, so ignore it.
1545  */
1546 struct mbuf *
1547 m_devget(char *buf, int len, int offset, struct ifnet *ifp,
1548     void (*copy)(volatile const void *from, volatile void *to, size_t length))
1549 {
1550 	struct mbuf *m, *mfirst = NULL, **mtail;
1551 	int nsize, flags;
1552 
1553 	if (copy == NULL)
1554 		copy = bcopy;
1555 	mtail = &mfirst;
1556 	flags = M_PKTHDR;
1557 
1558 	while (len > 0) {
1559 		m = m_getl(len, MB_DONTWAIT, MT_DATA, flags, &nsize);
1560 		if (m == NULL) {
1561 			m_freem(mfirst);
1562 			return (NULL);
1563 		}
1564 		m->m_len = min(len, nsize);
1565 
1566 		if (flags & M_PKTHDR) {
1567 			if (len + max_linkhdr <= nsize)
1568 				m->m_data += max_linkhdr;
1569 			m->m_pkthdr.rcvif = ifp;
1570 			m->m_pkthdr.len = len;
1571 			flags = 0;
1572 		}
1573 
1574 		copy(buf, m->m_data, (unsigned)m->m_len);
1575 		buf += m->m_len;
1576 		len -= m->m_len;
1577 		*mtail = m;
1578 		mtail = &m->m_next;
1579 	}
1580 
1581 	return (mfirst);
1582 }
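
/*
 * Illustrative sketch (hypothetical driver code): copy a received frame
 * out of device-local memory into a freshly allocated chain:
 *
 *	m = m_devget(sc_rxbuf, pktlen, 0, ifp, NULL);
 *
 * 'sc_rxbuf', 'pktlen' and 'ifp' stand in for driver state; passing a
 * NULL copy routine falls back to bcopy() as above.  The resulting chain
 * is then handed to the stack's input path.
 */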
1583 
1584 /*
1585  * Copy data from a buffer back into the indicated mbuf chain,
1586  * starting "off" bytes from the beginning, extending the mbuf
1587  * chain if necessary.
1588  */
1589 void
1590 m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
1591 {
1592 	int mlen;
1593 	struct mbuf *m = m0, *n;
1594 	int totlen = 0;
1595 
1596 	if (m0 == 0)
1597 		return;
1598 	while (off > (mlen = m->m_len)) {
1599 		off -= mlen;
1600 		totlen += mlen;
1601 		if (m->m_next == 0) {
1602 			n = m_getclr(MB_DONTWAIT, m->m_type);
1603 			if (n == 0)
1604 				goto out;
1605 			n->m_len = min(MLEN, len + off);
1606 			m->m_next = n;
1607 		}
1608 		m = m->m_next;
1609 	}
1610 	while (len > 0) {
1611 		mlen = min (m->m_len - off, len);
1612 		bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
1613 		cp += mlen;
1614 		len -= mlen;
1615 		mlen += off;
1616 		off = 0;
1617 		totlen += mlen;
1618 		if (len == 0)
1619 			break;
1620 		if (m->m_next == 0) {
1621 			n = m_get(MB_DONTWAIT, m->m_type);
1622 			if (n == 0)
1623 				break;
1624 			n->m_len = min(MLEN, len);
1625 			m->m_next = n;
1626 		}
1627 		m = m->m_next;
1628 	}
1629 out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
1630 		m->m_pkthdr.len = totlen;
1631 }
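
/*
 * Illustrative sketch: overwrite a field at a known offset in a chain,
 * letting m_copyback() extend the chain if it is too short ('off' and
 * 'new_value' are hypothetical):
 *
 *	u_int32_t val = htonl(new_value);
 *
 *	m_copyback(m, off, sizeof(val), (caddr_t)&val);
 */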
1632 
1633 void
1634 m_print(const struct mbuf *m)
1635 {
1636 	int len;
1637 	const struct mbuf *m2;
1638 
1639 	len = m->m_pkthdr.len;
1640 	m2 = m;
1641 	while (len) {
1642 		printf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-");
1643 		len -= m2->m_len;
1644 		m2 = m2->m_next;
1645 	}
1646 	return;
1647 }
1648 
1649 /*
1650  * "Move" mbuf pkthdr from "from" to "to".
1651  * "from" must have M_PKTHDR set, and "to" must be empty.
1652  */
1653 void
1654 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
1655 {
1656 	KASSERT((to->m_flags & M_EXT) == 0, ("m_move_pkthdr: to has cluster"));
1657 
1658 	to->m_flags = from->m_flags & M_COPYFLAGS;
1659 	to->m_data = to->m_pktdat;
1660 	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
1661 	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
1662 	from->m_flags &= ~M_PKTHDR;
1663 }
1664 
1665 /*
1666  * Duplicate "from"'s mbuf pkthdr in "to".
1667  * "from" must have M_PKTHDR set, and "to" must be empty.
1668  * In particular, this does a deep copy of the packet tags.
1669  */
1670 int
1671 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
1672 {
1673 	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
1674 	if ((to->m_flags & M_EXT) == 0)
1675 		to->m_data = to->m_pktdat;
1676 	to->m_pkthdr = from->m_pkthdr;
1677 	SLIST_INIT(&to->m_pkthdr.tags);
1678 	return (m_tag_copy_chain(to, from, how));
1679 }
1680 
1681 /*
1682  * Defragment an mbuf chain, returning the shortest possible
1683  * chain of mbufs and clusters.  If allocation fails and
1684  * this cannot be completed, NULL will be returned, but
1685  * the passed in chain will be unchanged.  Upon success,
1686  * the original chain will be freed, and the new chain
1687  * will be returned.
1688  *
1689  * If a non-packet header is passed in, the original
1690  * mbuf (chain?) will be returned unharmed.
1691  *
1692  * m_defrag_nofree doesn't free the passed in mbuf.
1693  */
1694 struct mbuf *
1695 m_defrag(struct mbuf *m0, int how)
1696 {
1697 	struct mbuf *m_new;
1698 
1699 	if ((m_new = m_defrag_nofree(m0, how)) == NULL)
1700 		return (NULL);
1701 	if (m_new != m0)
1702 		m_freem(m0);
1703 	return (m_new);
1704 }
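
/*
 * Illustrative sketch (hypothetical driver code): a transmit path whose
 * hardware cannot handle long scatter/gather lists may compact the chain
 * before DMA mapping:
 *
 *	m_new = m_defrag(m, MB_DONTWAIT);
 *	if (m_new == NULL) {
 *		m_freem(m);		(the original chain is untouched on
 *					 failure; here we choose to drop it)
 *	} else {
 *		m = m_new;
 *	}
 */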
1705 
1706 struct mbuf *
1707 m_defrag_nofree(struct mbuf *m0, int how)
1708 {
1709 	struct mbuf	*m_new = NULL, *m_final = NULL;
1710 	int		progress = 0, length, nsize;
1711 
1712 	if (!(m0->m_flags & M_PKTHDR))
1713 		return (m0);
1714 
1715 #ifdef MBUF_STRESS_TEST
1716 	if (m_defragrandomfailures) {
1717 		int temp = arc4random() & 0xff;
1718 		if (temp == 0xba)
1719 			goto nospace;
1720 	}
1721 #endif
1722 
1723 	m_final = m_getl(m0->m_pkthdr.len, how, MT_DATA, M_PKTHDR, &nsize);
1724 	if (m_final == NULL)
1725 		goto nospace;
1726 	m_final->m_len = 0;	/* in case m0->m_pkthdr.len is zero */
1727 
1728 	if (m_dup_pkthdr(m_final, m0, how) == NULL)
1729 		goto nospace;
1730 
1731 	m_new = m_final;
1732 
1733 	while (progress < m0->m_pkthdr.len) {
1734 		length = m0->m_pkthdr.len - progress;
1735 		if (length > MCLBYTES)
1736 			length = MCLBYTES;
1737 
1738 		if (m_new == NULL) {
1739 			m_new = m_getl(length, how, MT_DATA, 0, &nsize);
1740 			if (m_new == NULL)
1741 				goto nospace;
1742 		}
1743 
1744 		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
1745 		progress += length;
1746 		m_new->m_len = length;
1747 		if (m_new != m_final)
1748 			m_cat(m_final, m_new);
1749 		m_new = NULL;
1750 	}
1751 	if (m0->m_next == NULL)
1752 		m_defraguseless++;
1753 	m_defragpackets++;
1754 	m_defragbytes += m_final->m_pkthdr.len;
1755 	return (m_final);
1756 nospace:
1757 	m_defragfailure++;
1758 	if (m_new)
1759 		m_free(m_new);
1760 	m_freem(m_final);
1761 	return (NULL);
1762 }
1763 
1764 /*
1765  * Move data from uio into mbufs.
1766  */
1767 struct mbuf *
1768 m_uiomove(struct uio *uio)
1769 {
1770 	struct mbuf *m;			/* current working mbuf */
1771 	struct mbuf *head = NULL;	/* result mbuf chain */
1772 	struct mbuf **mp = &head;
1773 	int resid = uio->uio_resid, nsize, flags = M_PKTHDR, error;
1774 
1775 	do {
1776 		m = m_getl(resid, MB_WAIT, MT_DATA, flags, &nsize);
1777 		if (flags) {
1778 			m->m_pkthdr.len = 0;
1779 			/* Leave room for protocol headers. */
1780 			if (resid < MHLEN)
1781 				MH_ALIGN(m, resid);
1782 			flags = 0;
1783 		}
1784 		m->m_len = min(nsize, resid);
1785 		error = uiomove(mtod(m, caddr_t), m->m_len, uio);
1786 		if (error) {
1787 			m_free(m);
1788 			goto failed;
1789 		}
1790 		*mp = m;
1791 		mp = &m->m_next;
1792 		head->m_pkthdr.len += m->m_len;
1793 		resid -= m->m_len;
1794 	} while (resid > 0);
1795 
1796 	return (head);
1797 
1798 failed:
1799 	m_freem(head);
1800 	return (NULL);
1801 }
1802 
1803 struct mbuf *
1804 m_last(struct mbuf *m)
1805 {
1806 	while (m->m_next)
1807 		m = m->m_next;
1808 	return (m);
1809 }
1810 
1811 /*
1812  * Return the number of bytes in an mbuf chain.
1813  * If lastm is not NULL, also return the last mbuf.
1814  */
1815 u_int
1816 m_lengthm(struct mbuf *m, struct mbuf **lastm)
1817 {
1818 	u_int len = 0;
1819 	struct mbuf *prev = m;
1820 
1821 	while (m) {
1822 		len += m->m_len;
1823 		prev = m;
1824 		m = m->m_next;
1825 	}
1826 	if (lastm != NULL)
1827 		*lastm = prev;
1828 	return (len);
1829 }
1830 
1831 /*
1832  * Like m_lengthm(), except also keep track of mbuf usage.
1833  */
1834 u_int
1835 m_countm(struct mbuf *m, struct mbuf **lastm, u_int *pmbcnt)
1836 {
1837 	u_int len = 0, mbcnt = 0;
1838 	struct mbuf *prev = m;
1839 
1840 	while (m) {
1841 		len += m->m_len;
1842 		mbcnt += MSIZE;
1843 		if (m->m_flags & M_EXT)
1844 			mbcnt += m->m_ext.ext_size;
1845 		prev = m;
1846 		m = m->m_next;
1847 	}
1848 	if (lastm != NULL)
1849 		*lastm = prev;
1850 	*pmbcnt = mbcnt;
1851 	return (len);
1852 }
1853