/*
 * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Jeffrey M. Hsu.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
 * $FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.51.2.24 2003/04/15 06:59:29 silby Exp $
 * $DragonFly: src/sys/kern/uipc_mbuf.c,v 1.62 2007/05/13 22:56:59 dillon Exp $
 */

#include "opt_param.h"
#include "opt_ddb.h"
#include "opt_mbuf_stress_test.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/domain.h>
#include <sys/objcache.h>
#include <sys/protosw.h>
#include <sys/uio.h>
#include <sys/thread.h>
#include <sys/globaldata.h>
#include <sys/serialize.h>
#include <sys/thread2.h>

#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#ifdef INVARIANTS
#include <machine/cpu.h>
#endif

/*
 * mbuf cluster meta-data
 */
struct mbcluster {
	int32_t	mcl_refs;
	void	*mcl_data;
	struct lwkt_serialize mcl_serializer;
};

static void mbinit(void *);
SYSINIT(mbuf, SI_BOOT2_MACHDEP, SI_ORDER_FIRST, mbinit, NULL);

static u_long	mbtypes[MT_NTYPES];

struct mbstat mbstat;
int	max_linkhdr;
int	max_protohdr;
int	max_hdr;
int	max_datalen;
int	m_defragpackets;
int	m_defragbytes;
int	m_defraguseless;
int	m_defragfailure;
#ifdef MBUF_STRESS_TEST
int	m_defragrandomfailures;
#endif

struct objcache *mbuf_cache, *mbufphdr_cache;
struct objcache *mclmeta_cache;
struct objcache *mbufcluster_cache, *mbufphdrcluster_cache;

int	nmbclusters;
int	nmbufs;

SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
	   &max_linkhdr, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
	   &max_protohdr, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
	   &max_datalen, 0, "");
int	mbuf_wait = 32;	/* XXX was undeclared here; default assumed */
SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW,
	   &mbuf_wait, 0, "");
SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RW, &mbstat, mbstat, "");
SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mbtypes, CTLFLAG_RD, mbtypes,
	   sizeof(mbtypes), "LU", "");

/*
 * These are read-only because we do not currently have any code
 * to adjust the objcache limits after the fact.  The variables
 * may only be set as boot-time tunables.
 */
SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD,
	   &nmbclusters, 0, "Maximum number of mbuf clusters available");
SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0,
	   "Maximum number of mbufs available");

SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
	   &m_defragpackets, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
	   &m_defragbytes, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
	   &m_defraguseless, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
	   &m_defragfailure, 0, "");
#ifdef MBUF_STRESS_TEST
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
	   &m_defragrandomfailures, 0, "");
#endif

static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf");
static MALLOC_DEFINE(M_MBUFCL, "mbufcl", "mbufcl");
static MALLOC_DEFINE(M_MCLMETA, "mclmeta", "mclmeta");

static void m_reclaim(void);
static void m_mclref(void *arg);
static void m_mclfree(void *arg);

#ifndef NMBCLUSTERS
#define NMBCLUSTERS	(512 + maxusers * 16)
#endif
#ifndef NMBUFS
#define NMBUFS		(nmbclusters * 2)
#endif

/*
 * Perform sanity checks of tunables declared above.
 */
static void
tunable_mbinit(void *dummy)
{
	/*
	 * This has to be done before VM init.
	 */
	nmbclusters = NMBCLUSTERS;
	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
	nmbufs = NMBUFS;
	TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
	/* Sanity checks */
	if (nmbufs < nmbclusters * 2)
		nmbufs = nmbclusters * 2;
}
SYSINIT(tunable_mbinit, SI_BOOT1_TUNABLES, SI_ORDER_ANY,
	tunable_mbinit, NULL);

/* "number of clusters of pages" */
#define NCL_INIT	1

#define NMB_INIT	16

/*
 * The mbuf object cache only guarantees that m_next and m_nextpkt are
 * NULL and that m_data points to the beginning of the data area.  In
 * particular, m_len and m_pkthdr.len are uninitialized.  It is the
 * responsibility of the caller to initialize those fields before use.
 */
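
/*
 * Illustrative sketch (not part of the original code): a caller that takes
 * an mbuf straight from mbuf_cache must initialize the length and type
 * fields itself, since only m_next, m_nextpkt, m_data and m_flags are set
 * up by the constructor:
 *
 *	struct mbuf *m = objcache_get(mbuf_cache, M_NOWAIT);
 *	if (m != NULL) {
 *		m->m_type = MT_DATA;	(normally done by updatestats())
 *		m->m_len = 0;		(never set by mbuf_ctor())
 *	}
 */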

static boolean_t __inline
mbuf_ctor(void *obj, void *private, int ocflags)
{
	struct mbuf *m = obj;

	m->m_next = NULL;
	m->m_nextpkt = NULL;
	m->m_data = m->m_dat;
	m->m_flags = 0;

	return (TRUE);
}

/*
 * Initialize the mbuf and the packet header fields.
 */
static boolean_t
mbufphdr_ctor(void *obj, void *private, int ocflags)
{
	struct mbuf *m = obj;

	m->m_next = NULL;
	m->m_nextpkt = NULL;
	m->m_data = m->m_pktdat;
	m->m_flags = M_PKTHDR | M_PHCACHE;

	m->m_pkthdr.rcvif = NULL;	/* eliminate XXX JH */
	SLIST_INIT(&m->m_pkthdr.tags);
	m->m_pkthdr.csum_flags = 0;	/* eliminate XXX JH */
	m->m_pkthdr.fw_flags = 0;	/* eliminate XXX JH */

	return (TRUE);
}

/*
 * An mbcluster object consists of a 2K (MCLBYTES) cluster and a refcount.
 */
static boolean_t
mclmeta_ctor(void *obj, void *private, int ocflags)
{
	struct mbcluster *cl = obj;
	void *buf;

	if (ocflags & M_NOWAIT)
		buf = kmalloc(MCLBYTES, M_MBUFCL, M_NOWAIT | M_ZERO);
	else
		buf = kmalloc(MCLBYTES, M_MBUFCL, M_INTWAIT | M_ZERO);
	if (buf == NULL)
		return (FALSE);
	cl->mcl_refs = 0;
	cl->mcl_data = buf;
	lwkt_serialize_init(&cl->mcl_serializer);
	return (TRUE);
}

static void
mclmeta_dtor(void *obj, void *private)
{
	struct mbcluster *mcl = obj;

	KKASSERT(mcl->mcl_refs == 0);
	kfree(mcl->mcl_data, M_MBUFCL);
}

static void
linkcluster(struct mbuf *m, struct mbcluster *cl)
{
	/*
	 * Add the cluster to the mbuf.  The caller will detect that the
	 * mbuf now has an attached cluster.
	 */
	m->m_ext.ext_arg = cl;
	m->m_ext.ext_buf = cl->mcl_data;
	m->m_ext.ext_ref = m_mclref;
	m->m_ext.ext_free = m_mclfree;
	m->m_ext.ext_size = MCLBYTES;
	atomic_add_int(&cl->mcl_refs, 1);

	m->m_data = m->m_ext.ext_buf;
	m->m_flags |= M_EXT | M_EXT_CLUSTER;
}

static boolean_t
mbufphdrcluster_ctor(void *obj, void *private, int ocflags)
{
	struct mbuf *m = obj;
	struct mbcluster *cl;

	mbufphdr_ctor(obj, private, ocflags);
	cl = objcache_get(mclmeta_cache, ocflags);
	if (cl == NULL)
		return (FALSE);
	m->m_flags |= M_CLCACHE;
	linkcluster(m, cl);
	return (TRUE);
}

static boolean_t
mbufcluster_ctor(void *obj, void *private, int ocflags)
{
	struct mbuf *m = obj;
	struct mbcluster *cl;

	mbuf_ctor(obj, private, ocflags);
	cl = objcache_get(mclmeta_cache, ocflags);
	if (cl == NULL)
		return (FALSE);
	m->m_flags |= M_CLCACHE;
	linkcluster(m, cl);
	return (TRUE);
}

/*
 * Used for both the cluster and cluster PHDR caches.
 *
 * The mbuf may have lost its cluster due to sharing; deal
 * with the situation by checking M_EXT.
 */
static void
mbufcluster_dtor(void *obj, void *private)
{
	struct mbuf *m = obj;
	struct mbcluster *mcl;

	if (m->m_flags & M_EXT) {
		KKASSERT((m->m_flags & M_EXT_CLUSTER) != 0);
		mcl = m->m_ext.ext_arg;
		KKASSERT(mcl->mcl_refs == 1);
		mcl->mcl_refs = 0;
		objcache_put(mclmeta_cache, mcl);
	}
}

struct objcache_malloc_args mbuf_malloc_args = { MSIZE, M_MBUF };
struct objcache_malloc_args mclmeta_malloc_args =
	{ sizeof(struct mbcluster), M_MCLMETA };

/* ARGSUSED */
static void
mbinit(void *dummy)
{
	mbstat.m_msize = MSIZE;
	mbstat.m_mclbytes = MCLBYTES;
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;

	mbuf_cache = objcache_create("mbuf", nmbufs, 0,
	    mbuf_ctor, NULL, NULL,
	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
	mbufphdr_cache = objcache_create("mbuf pkt hdr", nmbufs, 64,
	    mbufphdr_ctor, NULL, NULL,
	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
	mclmeta_cache = objcache_create("cluster mbuf", nmbclusters, 0,
	    mclmeta_ctor, mclmeta_dtor, NULL,
	    objcache_malloc_alloc, objcache_malloc_free, &mclmeta_malloc_args);
	mbufcluster_cache = objcache_create("mbuf + cluster", nmbclusters, 0,
	    mbufcluster_ctor, mbufcluster_dtor, NULL,
	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
	mbufphdrcluster_cache = objcache_create("mbuf pkt hdr + cluster",
	    nmbclusters, 64, mbufphdrcluster_ctor, mbufcluster_dtor, NULL,
	    objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
}

/*
 * Return the number of references to this mbuf's data.  0 is returned
 * if the mbuf is not M_EXT, a reference count is returned if it is
 * M_EXT | M_EXT_CLUSTER, and 99 is returned if it is a special M_EXT.
 */
int
m_sharecount(struct mbuf *m)
{
	switch (m->m_flags & (M_EXT | M_EXT_CLUSTER)) {
	case 0:
		return (0);
	case M_EXT:
		return (99);
	case M_EXT | M_EXT_CLUSTER:
		return (((struct mbcluster *)m->m_ext.ext_arg)->mcl_refs);
	}
	/* NOTREACHED */
	return (0);		/* to shut up compiler */
}

/*
 * change mbuf to new type
 */
void
m_chtype(struct mbuf *m, int type)
{
	crit_enter();
	++mbtypes[type];
	--mbtypes[m->m_type];
	m->m_type = type;
	crit_exit();
}

static void
m_reclaim(void)
{
	struct domain *dp;
	struct protosw *pr;

	crit_enter();
	SLIST_FOREACH(dp, &domains, dom_next) {
		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
			if (pr->pr_drain)
				(*pr->pr_drain)();
		}
	}
	crit_exit();
	mbstat.m_drain++;
}

static void __inline
updatestats(struct mbuf *m, int type)
{
	m->m_type = type;

	crit_enter();
	++mbtypes[type];
	++mbstat.m_mbufs;
	crit_exit();
}

/*
 * Allocate an mbuf.
 */
struct mbuf *
m_get(int how, int type)
{
	struct mbuf *m;
	int ntries = 0;
	int ocf = MBTOM(how);

retryonce:

	m = objcache_get(mbuf_cache, ocf);

	if (m == NULL) {
		if ((how & MB_TRYWAIT) && ntries++ == 0) {
			struct objcache *reclaimlist[] = {
				mbufphdr_cache,
				mbufcluster_cache, mbufphdrcluster_cache
			};
			const int nreclaims = __arysize(reclaimlist);

			if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf))
				m_reclaim();
			goto retryonce;
		}
		return (NULL);
	}

	updatestats(m, type);
	return (m);
}
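
/*
 * Usage sketch (illustrative, not part of the original source): typical
 * callers of m_get()/m_gethdr() check for NULL and then set the length
 * fields that the object caches deliberately leave uninitialized:
 *
 *	struct mbuf *m = m_gethdr(MB_DONTWAIT, MT_DATA);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	m->m_len = 0;
 *	m->m_pkthdr.len = 0;
 */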

struct mbuf *
m_gethdr(int how, int type)
{
	struct mbuf *m;
	int ocf = MBTOM(how);
	int ntries = 0;

retryonce:

	m = objcache_get(mbufphdr_cache, ocf);

	if (m == NULL) {
		if ((how & MB_TRYWAIT) && ntries++ == 0) {
			struct objcache *reclaimlist[] = {
				mbuf_cache,
				mbufcluster_cache, mbufphdrcluster_cache
			};
			const int nreclaims = __arysize(reclaimlist);

			if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf))
				m_reclaim();
			goto retryonce;
		}
		return (NULL);
	}

	updatestats(m, type);
	return (m);
}

/*
 * Get an mbuf (not an mbuf cluster!) and zero it.
 * Deprecated.
 */
struct mbuf *
m_getclr(int how, int type)
{
	struct mbuf *m;

	m = m_get(how, type);
	if (m != NULL)
		bzero(m->m_data, MLEN);
	return (m);
}

/*
 * Returns an mbuf with an attached cluster.
 * Because many network drivers use this kind of buffer a lot, it is
 * convenient to keep a small pool of free buffers of this kind.
 * Even a small size such as 10 gives about 10% improvement in the
 * forwarding rate in a bridge or router.
 */
struct mbuf *
m_getcl(int how, short type, int flags)
{
	struct mbuf *m;
	int ocflags = MBTOM(how);
	int ntries = 0;

retryonce:

	if (flags & M_PKTHDR)
		m = objcache_get(mbufphdrcluster_cache, ocflags);
	else
		m = objcache_get(mbufcluster_cache, ocflags);

	if (m == NULL) {
		if ((how & MB_TRYWAIT) && ntries++ == 0) {
			struct objcache *reclaimlist[1];

			if (flags & M_PKTHDR)
				reclaimlist[0] = mbufcluster_cache;
			else
				reclaimlist[0] = mbufphdrcluster_cache;
			if (!objcache_reclaimlist(reclaimlist, 1, ocflags))
				m_reclaim();
			goto retryonce;
		}
		return (NULL);
	}

	m->m_type = type;

	crit_enter();
	++mbtypes[type];
	++mbstat.m_clusters;
	crit_exit();
	return (m);
}
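
/*
 * Usage sketch (illustrative, not part of the original source): a driver
 * receive path commonly pulls a packet-header mbuf with a cluster already
 * attached in a single call:
 *
 *	struct mbuf *m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	m->m_len = m->m_pkthdr.len = MCLBYTES;
 */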

/*
 * Allocate chain of requested length.
 */
struct mbuf *
m_getc(int len, int how, int type)
{
	struct mbuf *n, *nfirst = NULL, **ntail = &nfirst;
	int nsize;

	while (len > 0) {
		n = m_getl(len, how, type, 0, &nsize);
		if (n == NULL)
			goto failed;
		n->m_len = 0;
		*ntail = n;
		ntail = &n->m_next;
		len -= nsize;
	}
	return (nfirst);

failed:
	m_freem(nfirst);
	return (NULL);
}

/*
 * Allocate len-worth of mbufs and/or mbuf clusters (whatever fits best)
 * and return a pointer to the head of the allocated chain. If m0 is
 * non-null, then we assume that it is a single mbuf or an mbuf chain to
 * which we want len bytes worth of mbufs and/or clusters attached, and so
 * if we succeed in allocating it, we will just return a pointer to m0.
 *
 * If we happen to fail at any point during the allocation, we will free
 * up everything we have already allocated and return NULL.
 *
 * Deprecated.  Use m_getc() and m_cat() instead.
 */
struct mbuf *
m_getm(struct mbuf *m0, int len, int type, int how)
{
	struct mbuf *nfirst;

	nfirst = m_getc(len, how, type);

	if (m0 != NULL) {
		m_last(m0)->m_next = nfirst;
		return (m0);
	}

	return (nfirst);
}

/*
 * Adds a cluster to a normal mbuf; M_EXT is set on success.
 * Deprecated.  Use m_getcl() instead.
 */
void
m_mclget(struct mbuf *m, int how)
{
	struct mbcluster *mcl;

	KKASSERT((m->m_flags & M_EXT) == 0);
	mcl = objcache_get(mclmeta_cache, MBTOM(how));
	if (mcl != NULL) {
		linkcluster(m, mcl);
		crit_enter();
		++mbstat.m_clusters;
		/* leave the m_mbufs count intact for original mbuf */
		crit_exit();
	}
}

/*
 * Updates to mbcluster must be MPSAFE.  Only an entity which already has
 * a reference to the cluster can ref it, so we are in no danger of
 * racing an add with a subtract.  But the operation must still be atomic
 * since multiple entities may have a reference on the cluster.
 *
 * m_mclfree() is almost the same but it must contend with two entities
 * freeing the cluster at the same time.  If there is only one reference
 * count we are the only entity referencing the cluster and no further
 * locking is required.  Otherwise we must protect against a race to 0
 * with the serializer.
 */
static void
m_mclref(void *arg)
{
	struct mbcluster *mcl = arg;

	atomic_add_int(&mcl->mcl_refs, 1);
}

static void
m_mclfree(void *arg)
{
	struct mbcluster *mcl = arg;

	if (mcl->mcl_refs == 1) {
		mcl->mcl_refs = 0;
		objcache_put(mclmeta_cache, mcl);
	} else {
		lwkt_serialize_enter(&mcl->mcl_serializer);
		if (mcl->mcl_refs > 1) {
			atomic_subtract_int(&mcl->mcl_refs, 1);
			lwkt_serialize_exit(&mcl->mcl_serializer);
		} else {
			lwkt_serialize_exit(&mcl->mcl_serializer);
			KKASSERT(mcl->mcl_refs == 1);
			mcl->mcl_refs = 0;
			objcache_put(mclmeta_cache, mcl);
		}
	}
}

extern void db_print_backtrace(void);

/*
 * Free a single mbuf and any associated external storage.  The successor,
 * if any, is returned.
 *
 * We do need to check non-first mbufs for m_aux, since some existing
 * code does not call M_PREPEND properly.
 * (example: call to bpf_mtap from drivers)
 */
struct mbuf *
m_free(struct mbuf *m)
{
	struct mbuf *n;

	KASSERT(m->m_type != MT_FREE, ("freeing free mbuf %p", m));
	--mbtypes[m->m_type];

	n = m->m_next;

	/*
	 * Make sure the mbuf is in constructed state before returning it
	 * to the objcache.
	 */
	m->m_next = NULL;
#ifdef notyet
	KKASSERT(m->m_nextpkt == NULL);
#else
	if (m->m_nextpkt != NULL) {
#ifdef DDB
		static int afewtimes = 10;

		if (afewtimes-- > 0) {
			kprintf("mfree: m->m_nextpkt != NULL\n");
			db_print_backtrace();
		}
#endif
		m->m_nextpkt = NULL;
	}
#endif
	if (m->m_flags & M_PKTHDR) {
		m_tag_delete_chain(m);		/* eliminate XXX JH */
	}

	m->m_flags &= (M_EXT | M_EXT_CLUSTER | M_CLCACHE | M_PHCACHE);

	/*
	 * Clean the M_PKTHDR state so we can return the mbuf to its original
	 * cache.  This is based on the PHCACHE flag which tells us whether
	 * the mbuf was originally allocated out of a packet-header cache
	 * or a non-packet-header cache.
	 */
	if (m->m_flags & M_PHCACHE) {
		m->m_flags |= M_PKTHDR;
		m->m_pkthdr.rcvif = NULL;	/* eliminate XXX JH */
		m->m_pkthdr.csum_flags = 0;	/* eliminate XXX JH */
		m->m_pkthdr.fw_flags = 0;	/* eliminate XXX JH */
		SLIST_INIT(&m->m_pkthdr.tags);
	}

	/*
	 * Handle remaining flags combinations.  M_CLCACHE tells us whether
	 * the mbuf was originally allocated from a cluster cache or not,
	 * and is totally separate from whether the mbuf is currently
	 * associated with a cluster.
	 */
	crit_enter();
	switch(m->m_flags & (M_CLCACHE | M_EXT | M_EXT_CLUSTER)) {
	case M_CLCACHE | M_EXT | M_EXT_CLUSTER:
		/*
		 * mbuf+cluster cache case.  The mbuf was allocated from the
		 * combined mbuf_cluster cache and can be returned to the
		 * cache if the cluster hasn't been shared.
		 */
		if (m_sharecount(m) == 1) {
			/*
			 * The cluster has not been shared, we can just
			 * reset the data pointer and return the mbuf
			 * to the cluster cache.  Note that the reference
			 * count is left intact (it is still associated with
			 * an mbuf).
			 */
			m->m_data = m->m_ext.ext_buf;
			if (m->m_flags & M_PHCACHE)
				objcache_put(mbufphdrcluster_cache, m);
			else
				objcache_put(mbufcluster_cache, m);
			--mbstat.m_clusters;
		} else {
			/*
			 * Hell.  Someone else has a ref on this cluster, so
			 * we have to disconnect it, which means we can't
			 * put it back into the mbufcluster_cache; we
			 * have to destroy the mbuf.
			 *
			 * Other mbuf references to the cluster will typically
			 * be M_EXT | M_EXT_CLUSTER but without M_CLCACHE.
			 *
			 * XXX we could try to connect another cluster to
			 * it.
			 */
			m->m_ext.ext_free(m->m_ext.ext_arg);
			m->m_flags &= ~(M_EXT | M_EXT_CLUSTER);
			if (m->m_flags & M_PHCACHE)
				objcache_dtor(mbufphdrcluster_cache, m);
			else
				objcache_dtor(mbufcluster_cache, m);
		}
		break;
	case M_EXT | M_EXT_CLUSTER:
		/*
		 * Normal cluster associated with an mbuf that was allocated
		 * from the normal mbuf pool rather than the cluster pool.
		 * The cluster has to be independently disassociated from the
		 * mbuf.
		 */
		if (m_sharecount(m) == 1)
			--mbstat.m_clusters;
		/* fall through */
	case M_EXT:
		/*
		 * Normal cluster association case, disconnect the cluster from
		 * the mbuf.  The cluster may or may not be custom.
		 */
		m->m_ext.ext_free(m->m_ext.ext_arg);
		m->m_flags &= ~(M_EXT | M_EXT_CLUSTER);
		/* fall through */
	case 0:
		/*
		 * return the mbuf to the mbuf cache.
		 */
		if (m->m_flags & M_PHCACHE) {
			m->m_data = m->m_pktdat;
			objcache_put(mbufphdr_cache, m);
		} else {
			m->m_data = m->m_dat;
			objcache_put(mbuf_cache, m);
		}
		--mbstat.m_mbufs;
		break;
	default:
		if (!panicstr)
			panic("bad mbuf flags %p %08x\n", m, m->m_flags);
		break;
	}
	crit_exit();
	return (n);
}

void
m_freem(struct mbuf *m)
{
	crit_enter();
	while (m)
		m = m_free(m);
	crit_exit();
}

/*
 * mbuf utility routines
 */

/*
 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain and
 * copy junk along.
 */
struct mbuf *
m_prepend(struct mbuf *m, int len, int how)
{
	struct mbuf *mn;

	if (m->m_flags & M_PKTHDR)
		mn = m_gethdr(how, m->m_type);
	else
		mn = m_get(how, m->m_type);
	if (mn == NULL) {
		m_freem(m);
		return (NULL);
	}
	if (m->m_flags & M_PKTHDR)
		M_MOVE_PKTHDR(mn, m);
	mn->m_next = m;
	m = mn;
	if (len < MHLEN)
		MH_ALIGN(m, len);
	m->m_len = len;
	return (m);
}

/*
 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
 * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
 * The wait parameter is a choice of MB_WAIT/MB_DONTWAIT from caller.
 * Note that the copy is read-only, because clusters are not copied,
 * only their reference counts are incremented.
 */
struct mbuf *
m_copym(const struct mbuf *m, int off0, int len, int wait)
{
	struct mbuf *n, **np;
	int off = off0;
	struct mbuf *top;
	int copyhdr = 0;

	KASSERT(off >= 0, ("m_copym, negative off %d", off));
	KASSERT(len >= 0, ("m_copym, negative len %d", len));
	if (off == 0 && (m->m_flags & M_PKTHDR))
		copyhdr = 1;
	while (off > 0) {
		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
		if (off < m->m_len)
			break;
		off -= m->m_len;
		m = m->m_next;
	}
	np = &top;
	top = NULL;
	while (len > 0) {
		if (m == NULL) {
			KASSERT(len == M_COPYALL,
			    ("m_copym, length > size of mbuf chain"));
			break;
		}
		/*
		 * Because we are sharing any cluster attachment below,
		 * be sure to get an mbuf that does not have a cluster
		 * associated with it.
		 */
		if (copyhdr)
			n = m_gethdr(wait, m->m_type);
		else
			n = m_get(wait, m->m_type);
		*np = n;
		if (n == NULL)
			goto nospace;
		if (copyhdr) {
			if (!m_dup_pkthdr(n, m, wait))
				goto nospace;
			if (len == M_COPYALL)
				n->m_pkthdr.len -= off0;
			else
				n->m_pkthdr.len = len;
			copyhdr = 0;
		}
		n->m_len = min(len, m->m_len - off);
		if (m->m_flags & M_EXT) {
			KKASSERT((n->m_flags & M_EXT) == 0);
			n->m_data = m->m_data + off;
			m->m_ext.ext_ref(m->m_ext.ext_arg);
			n->m_ext = m->m_ext;
			n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
		} else {
			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
			    (unsigned)n->m_len);
		}
		if (len != M_COPYALL)
			len -= n->m_len;
		off = 0;
		m = m->m_next;
		np = &n->m_next;
	}
	if (top == NULL)
		mbstat.m_mcfail++;
	return (top);
nospace:
	m_freem(top);
	mbstat.m_mcfail++;
	return (NULL);
}
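
/*
 * Illustrative note (not part of the original source): because m_copym()
 * shares clusters instead of copying them, a caller that intends to write
 * to the copied data should check writability first, or take a deep copy
 * with m_dup():
 *
 *	struct mbuf *n = m_copym(m, 0, M_COPYALL, MB_DONTWAIT);
 *	if (n != NULL && !M_WRITABLE(n)) {
 *		m_freem(n);
 *		n = m_dup(m, MB_DONTWAIT);	(deep, writable copy)
 *	}
 */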

/*
 * Copy an entire packet, including header (which must be present).
 * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
 * Note that the copy is read-only, because clusters are not copied,
 * only their reference counts are incremented.
 * Preserve alignment of the first mbuf so if the creator has left
 * some room at the beginning (e.g. for inserting protocol headers)
 * the copies also have the room available.
 */
struct mbuf *
m_copypacket(struct mbuf *m, int how)
{
	struct mbuf *top, *n, *o;

	n = m_gethdr(how, m->m_type);
	top = n;
	if (!n)
		goto nospace;

	if (!m_dup_pkthdr(n, m, how))
		goto nospace;
	n->m_len = m->m_len;
	if (m->m_flags & M_EXT) {
		KKASSERT((n->m_flags & M_EXT) == 0);
		n->m_data = m->m_data;
		m->m_ext.ext_ref(m->m_ext.ext_arg);
		n->m_ext = m->m_ext;
		n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
	} else {
		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat);
		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
	}

	m = m->m_next;
	while (m) {
		o = m_get(how, m->m_type);
		if (!o)
			goto nospace;

		n->m_next = o;
		n = n->m_next;

		n->m_len = m->m_len;
		if (m->m_flags & M_EXT) {
			KKASSERT((n->m_flags & M_EXT) == 0);
			n->m_data = m->m_data;
			m->m_ext.ext_ref(m->m_ext.ext_arg);
			n->m_ext = m->m_ext;
			n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
		} else {
			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
		}

		m = m->m_next;
	}
	return top;
nospace:
	m_freem(top);
	mbstat.m_mcfail++;
	return (NULL);
}

/*
 * Copy data from an mbuf chain starting "off" bytes from the beginning,
 * continuing for "len" bytes, into the indicated buffer.
 */
void
m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
{
	unsigned count;

	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
	while (off > 0) {
		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
		if (off < m->m_len)
			break;
		off -= m->m_len;
		m = m->m_next;
	}
	while (len > 0) {
		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
		count = min(m->m_len - off, len);
		bcopy(mtod(m, caddr_t) + off, cp, count);
		len -= count;
		cp += count;
		off = 0;
		m = m->m_next;
	}
}

/*
 * Copy a packet header mbuf chain into a completely new chain, including
 * copying any mbuf clusters.  Use this instead of m_copypacket() when
 * you need a writable copy of an mbuf chain.
 */
struct mbuf *
m_dup(struct mbuf *m, int how)
{
	struct mbuf **p, *top = NULL;
	int remain, moff, nsize;

	/* Sanity check */
	if (m == NULL)
		return (NULL);
	KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __func__));

	/* While there's more data, get a new mbuf, tack it on, and fill it */
	remain = m->m_pkthdr.len;
	moff = 0;
	p = &top;
	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
		struct mbuf *n;

		/* Get the next new mbuf */
		n = m_getl(remain, how, m->m_type, top == NULL ? M_PKTHDR : 0,
			   &nsize);
		if (n == NULL)
			goto nospace;
		if (top == NULL)
			if (!m_dup_pkthdr(n, m, how))
				goto nospace0;

		/* Link it into the new chain */
		*p = n;
		p = &n->m_next;

		/* Copy data from original mbuf(s) into new mbuf */
		n->m_len = 0;
		while (n->m_len < nsize && m != NULL) {
			int chunk = min(nsize - n->m_len, m->m_len - moff);

			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
			moff += chunk;
			n->m_len += chunk;
			remain -= chunk;
			if (moff == m->m_len) {
				m = m->m_next;
				moff = 0;
			}
		}

		/* Check correct total mbuf length */
		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
			("%s: bogus m_pkthdr.len", __func__));
	}
	return (top);

nospace:
	m_freem(top);
nospace0:
	mbstat.m_mcfail++;
	return (NULL);
}

/*
 * Concatenate mbuf chain n to m.
 * Both chains must be of the same type (e.g. MT_DATA).
 * Any m_pkthdr is not updated.
 */
void
m_cat(struct mbuf *m, struct mbuf *n)
{
	m = m_last(m);
	while (n) {
		if (m->m_flags & M_EXT ||
		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
			/* just join the two chains */
			m->m_next = n;
			return;
		}
		/* splat the data from one into the other */
		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
		    (u_int)n->m_len);
		m->m_len += n->m_len;
		n = m_free(n);
	}
}

void
m_adj(struct mbuf *mp, int req_len)
{
	int len = req_len;
	struct mbuf *m;
	int count;

	if ((m = mp) == NULL)
		return;
	if (len >= 0) {
		/*
		 * Trim from head.
		 */
		while (m != NULL && len > 0) {
			if (m->m_len <= len) {
				len -= m->m_len;
				m->m_len = 0;
				m = m->m_next;
			} else {
				m->m_len -= len;
				m->m_data += len;
				len = 0;
			}
		}
		m = mp;
		if (mp->m_flags & M_PKTHDR)
			m->m_pkthdr.len -= (req_len - len);
	} else {
		/*
		 * Trim from tail.  Scan the mbuf chain,
		 * calculating its length and finding the last mbuf.
		 * If the adjustment only affects this mbuf, then just
		 * adjust and return.  Otherwise, rescan and truncate
		 * after the remaining size.
		 */
		len = -len;
		count = 0;
		for (;;) {
			count += m->m_len;
			if (m->m_next == NULL)
				break;
			m = m->m_next;
		}
		if (m->m_len >= len) {
			m->m_len -= len;
			if (mp->m_flags & M_PKTHDR)
				mp->m_pkthdr.len -= len;
			return;
		}
		count -= len;
		if (count < 0)
			count = 0;
		/*
		 * Correct length for chain is "count".
		 * Find the mbuf with last data, adjust its length,
		 * and toss data from remaining mbufs on chain.
		 */
		m = mp;
		if (m->m_flags & M_PKTHDR)
			m->m_pkthdr.len = count;
		for (; m; m = m->m_next) {
			if (m->m_len >= count) {
				m->m_len = count;
				break;
			}
			count -= m->m_len;
		}
		while (m->m_next)
			(m = m->m_next)->m_len = 0;
	}
}

/*
 * Rearrange an mbuf chain so that len bytes are contiguous
 * and in the data area of an mbuf (so that mtod will work for a structure
 * of size len).  Returns the resulting mbuf chain on success, frees it and
 * returns null on failure.  If there is room, it will add up to
 * max_protohdr-len extra bytes to the contiguous region in an attempt to
 * avoid being called next time.
 */
struct mbuf *
m_pullup(struct mbuf *n, int len)
{
	struct mbuf *m;
	int count;
	int space;

	/*
	 * If first mbuf has no cluster, and has room for len bytes
	 * without shifting current data, pullup into it,
	 * otherwise allocate a new mbuf to prepend to the chain.
	 */
	if (!(n->m_flags & M_EXT) &&
	    n->m_data + len < &n->m_dat[MLEN] &&
	    n->m_next) {
		if (n->m_len >= len)
			return (n);
		m = n;
		n = n->m_next;
		len -= m->m_len;
	} else {
		if (len > MHLEN)
			goto bad;
		if (n->m_flags & M_PKTHDR)
			m = m_gethdr(MB_DONTWAIT, n->m_type);
		else
			m = m_get(MB_DONTWAIT, n->m_type);
		if (m == NULL)
			goto bad;
		m->m_len = 0;
		if (n->m_flags & M_PKTHDR)
			M_MOVE_PKTHDR(m, n);
	}
	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
	do {
		count = min(min(max(len, max_protohdr), space), n->m_len);
		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
		  (unsigned)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len)
			n->m_data += count;
		else
			n = m_free(n);
	} while (len > 0 && n);
	if (len > 0) {
		m_free(m);
		goto bad;
	}
	m->m_next = n;
	return (m);
bad:
	m_freem(n);
	mbstat.m_mpfail++;
	return (NULL);
}
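
/*
 * Usage sketch (illustrative, not part of the original source): the classic
 * m_pullup() idiom in protocol input paths makes a header contiguous before
 * dereferencing it with mtod():
 *
 *	if (m->m_len < sizeof(struct ip)) {
 *		m = m_pullup(m, sizeof(struct ip));
 *		if (m == NULL)
 *			return;		(chain was freed by m_pullup)
 *	}
 *	ip = mtod(m, struct ip *);
 */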

/*
 * Partition an mbuf chain in two pieces, returning the tail --
 * all but the first len0 bytes.  In case of failure, it returns NULL and
 * attempts to restore the chain to its original state.
 *
 * Note that the resulting mbufs might be read-only, because the new
 * mbuf can end up sharing an mbuf cluster with the original mbuf if
 * the "breaking point" happens to lie within a cluster mbuf. Use the
 * M_WRITABLE() macro to check for this case.
 */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
	struct mbuf *m, *n;
	unsigned len = len0, remain;

	for (m = m0; m && len > m->m_len; m = m->m_next)
		len -= m->m_len;
	if (m == NULL)
		return (NULL);
	remain = m->m_len - len;
	if (m0->m_flags & M_PKTHDR) {
		n = m_gethdr(wait, m0->m_type);
		if (n == NULL)
			return (NULL);
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;
		if (m->m_flags & M_EXT)
			goto extpacket;
		if (remain > MHLEN) {
			/* m can't be the lead packet */
			MH_ALIGN(n, 0);
			n->m_next = m_split(m, len, wait);
			if (n->m_next == NULL) {
				m_free(n);
				return (NULL);
			} else {
				n->m_len = 0;
				return (n);
			}
		} else
			MH_ALIGN(n, remain);
	} else if (remain == 0) {
		n = m->m_next;
		m->m_next = NULL;
		return (n);
	} else {
		n = m_get(wait, m->m_type);
		if (n == NULL)
			return (NULL);
		M_ALIGN(n, remain);
	}
extpacket:
	if (m->m_flags & M_EXT) {
		KKASSERT((n->m_flags & M_EXT) == 0);
		n->m_data = m->m_data + len;
		m->m_ext.ext_ref(m->m_ext.ext_arg);
		n->m_ext = m->m_ext;
		n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
	} else {
		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
	}
	n->m_len = remain;
	m->m_len = len;
	n->m_next = m->m_next;
	m->m_next = NULL;
	return (n);
}

/*
 * Routine to copy from device local memory into mbufs.
 * Note: "offset" is ill-defined and always called as 0, so ignore it.
 */
struct mbuf *
m_devget(char *buf, int len, int offset, struct ifnet *ifp,
    void (*copy)(volatile const void *from, volatile void *to, size_t length))
{
	struct mbuf *m, *mfirst = NULL, **mtail;
	int nsize, flags;

	if (copy == NULL)
		copy = bcopy;
	mtail = &mfirst;
	flags = M_PKTHDR;

	while (len > 0) {
		m = m_getl(len, MB_DONTWAIT, MT_DATA, flags, &nsize);
		if (m == NULL) {
			m_freem(mfirst);
			return (NULL);
		}
		m->m_len = min(len, nsize);

		if (flags & M_PKTHDR) {
			if (len + max_linkhdr <= nsize)
				m->m_data += max_linkhdr;
			m->m_pkthdr.rcvif = ifp;
			m->m_pkthdr.len = len;
			flags = 0;
		}

		copy(buf, m->m_data, (unsigned)m->m_len);
		buf += m->m_len;
		len -= m->m_len;
		*mtail = m;
		mtail = &m->m_next;
	}

	return (mfirst);
}

/*
 * Copy data from a buffer back into the indicated mbuf chain,
 * starting "off" bytes from the beginning, extending the mbuf
 * chain if necessary.
 */
void
m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
{
	int mlen;
	struct mbuf *m = m0, *n;
	int totlen = 0;

	if (m0 == NULL)
		return;
	while (off > (mlen = m->m_len)) {
		off -= mlen;
		totlen += mlen;
		if (m->m_next == NULL) {
			n = m_getclr(MB_DONTWAIT, m->m_type);
			if (n == NULL)
				goto out;
			n->m_len = min(MLEN, len + off);
			m->m_next = n;
		}
		m = m->m_next;
	}
	while (len > 0) {
		mlen = min(m->m_len - off, len);
		bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
		cp += mlen;
		len -= mlen;
		mlen += off;
		off = 0;
		totlen += mlen;
		if (len == 0)
			break;
		if (m->m_next == NULL) {
			n = m_get(MB_DONTWAIT, m->m_type);
			if (n == NULL)
				break;
			n->m_len = min(MLEN, len);
			m->m_next = n;
		}
		m = m->m_next;
	}
out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
		m->m_pkthdr.len = totlen;
}

void
m_print(const struct mbuf *m)
{
	int len;
	const struct mbuf *m2;

	len = m->m_pkthdr.len;
	m2 = m;
	while (len) {
		kprintf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-");
		len -= m2->m_len;
		m2 = m2->m_next;
	}
}

/*
 * "Move" mbuf pkthdr from "from" to "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 */
void
m_move_pkthdr(struct mbuf *to, struct mbuf *from)
{
	KASSERT((to->m_flags & M_PKTHDR), ("m_move_pkthdr: not packet header"));

	to->m_flags |= from->m_flags & M_COPYFLAGS;
	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
	SLIST_INIT(&from->m_pkthdr.tags);	/* purge tags from src */
}

/*
 * Duplicate "from"'s mbuf pkthdr in "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 * In particular, this does a deep copy of the packet tags.
 */
int
m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
{
	KASSERT((to->m_flags & M_PKTHDR), ("m_dup_pkthdr: not packet header"));

	to->m_flags = (from->m_flags & M_COPYFLAGS) |
		      (to->m_flags & ~M_COPYFLAGS);
	to->m_pkthdr = from->m_pkthdr;
	SLIST_INIT(&to->m_pkthdr.tags);
	return (m_tag_copy_chain(to, from, how));
}

/*
 * Defragment a mbuf chain, returning the shortest possible
 * chain of mbufs and clusters.  If allocation fails and
 * this cannot be completed, NULL will be returned, but
 * the passed in chain will be unchanged.  Upon success,
 * the original chain will be freed, and the new chain
 * will be returned.
 *
 * If a non-packet header is passed in, the original
 * mbuf (chain?) will be returned unharmed.
 *
 * m_defrag_nofree doesn't free the passed in mbuf.
 */
struct mbuf *
m_defrag(struct mbuf *m0, int how)
{
	struct mbuf *m_new;

	if ((m_new = m_defrag_nofree(m0, how)) == NULL)
		return (NULL);
	if (m_new != m0)
		m_freem(m0);
	return (m_new);
}
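
/*
 * Usage sketch (illustrative, not part of the original source): a transmit
 * path whose DMA engine only handles a few segments can compact an overly
 * fragmented chain before giving up on the packet:
 *
 *	m_new = m_defrag(m, MB_DONTWAIT);
 *	if (m_new == NULL) {
 *		m_freem(m);		(defrag failed; original left intact)
 *		return (ENOBUFS);
 *	}
 *	m = m_new;			(original chain already freed)
 */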

struct mbuf *
m_defrag_nofree(struct mbuf *m0, int how)
{
	struct mbuf	*m_new = NULL, *m_final = NULL;
	int		progress = 0, length, nsize;

	if (!(m0->m_flags & M_PKTHDR))
		return (m0);

#ifdef MBUF_STRESS_TEST
	if (m_defragrandomfailures) {
		int temp = karc4random() & 0xff;
		if (temp == 0xba)
			goto nospace;
	}
#endif

	m_final = m_getl(m0->m_pkthdr.len, how, MT_DATA, M_PKTHDR, &nsize);
	if (m_final == NULL)
		goto nospace;
	m_final->m_len = 0;	/* in case m0->m_pkthdr.len is zero */

	if (!m_dup_pkthdr(m_final, m0, how))
		goto nospace;

	m_new = m_final;

	while (progress < m0->m_pkthdr.len) {
		length = m0->m_pkthdr.len - progress;
		if (length > MCLBYTES)
			length = MCLBYTES;

		if (m_new == NULL) {
			m_new = m_getl(length, how, MT_DATA, 0, &nsize);
			if (m_new == NULL)
				goto nospace;
		}

		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
		progress += length;
		m_new->m_len = length;
		if (m_new != m_final)
			m_cat(m_final, m_new);
		m_new = NULL;
	}
	if (m0->m_next == NULL)
		m_defraguseless++;
	m_defragpackets++;
	m_defragbytes += m_final->m_pkthdr.len;
	return (m_final);
nospace:
	m_defragfailure++;
	if (m_new)
		m_free(m_new);
	m_freem(m_final);
	return (NULL);
}

/*
 * Move data from uio into mbufs.
 */
struct mbuf *
m_uiomove(struct uio *uio)
{
	struct mbuf *m;			/* current working mbuf */
	struct mbuf *head = NULL;	/* result mbuf chain */
	struct mbuf **mp = &head;
	int resid = uio->uio_resid, nsize, flags = M_PKTHDR, error;

	do {
		m = m_getl(resid, MB_WAIT, MT_DATA, flags, &nsize);
		if (flags) {
			m->m_pkthdr.len = 0;
			/* Leave room for protocol headers. */
			if (resid < MHLEN)
				MH_ALIGN(m, resid);
			flags = 0;
		}
		m->m_len = min(nsize, resid);
		error = uiomove(mtod(m, caddr_t), m->m_len, uio);
		if (error) {
			m_free(m);
			goto failed;
		}
		*mp = m;
		mp = &m->m_next;
		head->m_pkthdr.len += m->m_len;
		resid -= m->m_len;
	} while (resid > 0);

	return (head);

failed:
	m_freem(head);
	return (NULL);
}

struct mbuf *
m_last(struct mbuf *m)
{
	while (m->m_next)
		m = m->m_next;
	return (m);
}

/*
 * Return the number of bytes in an mbuf chain.
 * If lastm is not NULL, also return the last mbuf.
 */
u_int
m_lengthm(struct mbuf *m, struct mbuf **lastm)
{
	u_int len = 0;
	struct mbuf *prev = m;

	while (m) {
		len += m->m_len;
		prev = m;
		m = m->m_next;
	}
	if (lastm != NULL)
		*lastm = prev;
	return (len);
}

/*
 * Like m_lengthm(), except also keep track of mbuf usage.
 */
u_int
m_countm(struct mbuf *m, struct mbuf **lastm, u_int *pmbcnt)
{
	u_int len = 0, mbcnt = 0;
	struct mbuf *prev = m;

	while (m) {
		len += m->m_len;
		mbcnt += MSIZE;
		if (m->m_flags & M_EXT)
			mbcnt += m->m_ext.ext_size;
		prev = m;
		m = m->m_next;
	}
	if (lastm != NULL)
		*lastm = prev;
	*pmbcnt = mbcnt;
	return (len);
}