xref: /onnv-gate/usr/src/uts/common/io/stream.c (revision 10163:18b45b82bc9b)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
52958Sdr146992  * Common Development and Distribution License (the "License").
62958Sdr146992  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
220Sstevel@tonic-gate /*	  All Rights Reserved  	*/
230Sstevel@tonic-gate 
240Sstevel@tonic-gate /*
258752SPeter.Memishian@Sun.COM  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
260Sstevel@tonic-gate  * Use is subject to license terms.
270Sstevel@tonic-gate  */
280Sstevel@tonic-gate 
290Sstevel@tonic-gate #include <sys/types.h>
300Sstevel@tonic-gate #include <sys/param.h>
310Sstevel@tonic-gate #include <sys/thread.h>
320Sstevel@tonic-gate #include <sys/sysmacros.h>
330Sstevel@tonic-gate #include <sys/stropts.h>
340Sstevel@tonic-gate #include <sys/stream.h>
350Sstevel@tonic-gate #include <sys/strsubr.h>
360Sstevel@tonic-gate #include <sys/strsun.h>
370Sstevel@tonic-gate #include <sys/conf.h>
380Sstevel@tonic-gate #include <sys/debug.h>
390Sstevel@tonic-gate #include <sys/cmn_err.h>
400Sstevel@tonic-gate #include <sys/kmem.h>
410Sstevel@tonic-gate #include <sys/atomic.h>
420Sstevel@tonic-gate #include <sys/errno.h>
430Sstevel@tonic-gate #include <sys/vtrace.h>
440Sstevel@tonic-gate #include <sys/ftrace.h>
450Sstevel@tonic-gate #include <sys/ontrap.h>
460Sstevel@tonic-gate #include <sys/multidata.h>
470Sstevel@tonic-gate #include <sys/multidata_impl.h>
480Sstevel@tonic-gate #include <sys/sdt.h>
491110Smeem #include <sys/strft.h>
500Sstevel@tonic-gate 
510Sstevel@tonic-gate #ifdef DEBUG
520Sstevel@tonic-gate #include <sys/kmem_impl.h>
530Sstevel@tonic-gate #endif
540Sstevel@tonic-gate 
550Sstevel@tonic-gate /*
560Sstevel@tonic-gate  * This file contains all the STREAMS utility routines that may
570Sstevel@tonic-gate  * be used by modules and drivers.
580Sstevel@tonic-gate  */
590Sstevel@tonic-gate 
600Sstevel@tonic-gate /*
610Sstevel@tonic-gate  * STREAMS message allocator: principles of operation
620Sstevel@tonic-gate  *
630Sstevel@tonic-gate  * The streams message allocator consists of all the routines that
640Sstevel@tonic-gate  * allocate, dup and free streams messages: allocb(), [d]esballoc[a],
650Sstevel@tonic-gate  * dupb(), freeb() and freemsg().  What follows is a high-level view
660Sstevel@tonic-gate  * of how the allocator works.
670Sstevel@tonic-gate  *
680Sstevel@tonic-gate  * Every streams message consists of one or more mblks, a dblk, and data.
690Sstevel@tonic-gate  * All mblks for all types of messages come from a common mblk_cache.
700Sstevel@tonic-gate  * The dblk and data come in several flavors, depending on how the
710Sstevel@tonic-gate  * message is allocated:
720Sstevel@tonic-gate  *
730Sstevel@tonic-gate  * (1) mblks up to DBLK_MAX_CACHE size are allocated from a collection of
740Sstevel@tonic-gate  *     fixed-size dblk/data caches. For message sizes that are multiples of
750Sstevel@tonic-gate  *     PAGESIZE, dblks are allocated separately from the buffer.
760Sstevel@tonic-gate  *     The associated buffer is allocated by the constructor using kmem_alloc().
770Sstevel@tonic-gate  *     For all other message sizes, dblk and its associated data is allocated
780Sstevel@tonic-gate  *     as a single contiguous chunk of memory.
790Sstevel@tonic-gate  *     Objects in these caches consist of a dblk plus its associated data.
800Sstevel@tonic-gate  *     allocb() determines the nearest-size cache by table lookup:
810Sstevel@tonic-gate  *     the dblk_cache[] array provides the mapping from size to dblk cache.
820Sstevel@tonic-gate  *
830Sstevel@tonic-gate  * (2) Large messages (size > DBLK_MAX_CACHE) are constructed by
840Sstevel@tonic-gate  *     kmem_alloc()'ing a buffer for the data and supplying that
850Sstevel@tonic-gate  *     buffer to gesballoc(), described below.
860Sstevel@tonic-gate  *
870Sstevel@tonic-gate  * (3) The four flavors of [d]esballoc[a] are all implemented by a
880Sstevel@tonic-gate  *     common routine, gesballoc() ("generic esballoc").  gesballoc()
890Sstevel@tonic-gate  *     allocates a dblk from the global dblk_esb_cache and sets db_base,
900Sstevel@tonic-gate  *     db_lim and db_frtnp to describe the caller-supplied buffer.
910Sstevel@tonic-gate  *
920Sstevel@tonic-gate  * While there are several routines to allocate messages, there is only
930Sstevel@tonic-gate  * one routine to free messages: freeb().  freeb() simply invokes the
940Sstevel@tonic-gate  * dblk's free method, dbp->db_free(), which is set at allocation time.
950Sstevel@tonic-gate  *
960Sstevel@tonic-gate  * dupb() creates a new reference to a message by allocating a new mblk,
970Sstevel@tonic-gate  * incrementing the dblk reference count and setting the dblk's free
980Sstevel@tonic-gate  * method to dblk_decref().  The dblk's original free method is retained
990Sstevel@tonic-gate  * in db_lastfree.  dblk_decref() decrements the reference count on each
1000Sstevel@tonic-gate  * freeb().  If this is not the last reference it just frees the mblk;
1010Sstevel@tonic-gate  * if this *is* the last reference, it restores db_free to db_lastfree,
1020Sstevel@tonic-gate  * sets db_mblk to the current mblk (see below), and invokes db_lastfree.
1030Sstevel@tonic-gate  *
1040Sstevel@tonic-gate  * The implementation makes aggressive use of kmem object caching for
1050Sstevel@tonic-gate  * maximum performance.  This makes the code simple and compact, but
1060Sstevel@tonic-gate  * also a bit abstruse in some places.  The invariants that constitute a
1070Sstevel@tonic-gate  * message's constructed state, described below, are more subtle than usual.
1080Sstevel@tonic-gate  *
1090Sstevel@tonic-gate  * Every dblk has an "attached mblk" as part of its constructed state.
1100Sstevel@tonic-gate  * The mblk is allocated by the dblk's constructor and remains attached
1110Sstevel@tonic-gate  * until the message is either dup'ed or pulled up.  In the dupb() case
1120Sstevel@tonic-gate  * the mblk association doesn't matter until the last free, at which time
1130Sstevel@tonic-gate  * dblk_decref() attaches the last mblk to the dblk.  pullupmsg() affects
1140Sstevel@tonic-gate  * the mblk association because it swaps the leading mblks of two messages,
1150Sstevel@tonic-gate  * so it is responsible for swapping their db_mblk pointers accordingly.
1160Sstevel@tonic-gate  * From a constructed-state viewpoint it doesn't matter that a dblk's
1170Sstevel@tonic-gate  * attached mblk can change while the message is allocated; all that
1180Sstevel@tonic-gate  * matters is that the dblk has *some* attached mblk when it's freed.
1190Sstevel@tonic-gate  *
1200Sstevel@tonic-gate  * The sizes of the allocb() small-message caches are not magical.
1210Sstevel@tonic-gate  * They represent a good trade-off between internal and external
1220Sstevel@tonic-gate  * fragmentation for current workloads.  They should be reevaluated
1230Sstevel@tonic-gate  * periodically, especially if allocations larger than DBLK_MAX_CACHE
1240Sstevel@tonic-gate  * become common.  We use 64-byte alignment so that dblks don't
1250Sstevel@tonic-gate  * straddle cache lines unnecessarily.
1260Sstevel@tonic-gate  */
1270Sstevel@tonic-gate #define	DBLK_MAX_CACHE		73728
1280Sstevel@tonic-gate #define	DBLK_CACHE_ALIGN	64
1290Sstevel@tonic-gate #define	DBLK_MIN_SIZE		8
1300Sstevel@tonic-gate #define	DBLK_SIZE_SHIFT		3
1310Sstevel@tonic-gate 
1320Sstevel@tonic-gate #ifdef _BIG_ENDIAN
1330Sstevel@tonic-gate #define	DBLK_RTFU_SHIFT(field)	\
1340Sstevel@tonic-gate 	(8 * (&((dblk_t *)0)->db_struioflag - &((dblk_t *)0)->field))
1350Sstevel@tonic-gate #else
1360Sstevel@tonic-gate #define	DBLK_RTFU_SHIFT(field)	\
1370Sstevel@tonic-gate 	(8 * (&((dblk_t *)0)->field - &((dblk_t *)0)->db_ref))
1380Sstevel@tonic-gate #endif
1390Sstevel@tonic-gate 
1400Sstevel@tonic-gate #define	DBLK_RTFU(ref, type, flags, uioflag)	\
1410Sstevel@tonic-gate 	(((ref) << DBLK_RTFU_SHIFT(db_ref)) | \
1420Sstevel@tonic-gate 	((type) << DBLK_RTFU_SHIFT(db_type)) | \
1430Sstevel@tonic-gate 	(((flags) | (ref - 1)) << DBLK_RTFU_SHIFT(db_flags)) | \
1440Sstevel@tonic-gate 	((uioflag) << DBLK_RTFU_SHIFT(db_struioflag)))
1450Sstevel@tonic-gate #define	DBLK_RTFU_REF_MASK	(DBLK_REFMAX << DBLK_RTFU_SHIFT(db_ref))
1460Sstevel@tonic-gate #define	DBLK_RTFU_WORD(dbp)	(*((uint32_t *)&(dbp)->db_ref))
1470Sstevel@tonic-gate #define	MBLK_BAND_FLAG_WORD(mp)	(*((uint32_t *)&(mp)->b_band))
1480Sstevel@tonic-gate 
1490Sstevel@tonic-gate static size_t dblk_sizes[] = {
1500Sstevel@tonic-gate #ifdef _LP64
1516712Stomee 	16, 80, 144, 208, 272, 336, 528, 1040, 1488, 1936, 2576, 3856,
1526712Stomee 	8192, 12048, 16384, 20240, 24576, 28432, 32768, 36624,
1536712Stomee 	40960, 44816, 49152, 53008, 57344, 61200, 65536, 69392,
1540Sstevel@tonic-gate #else
1556712Stomee 	64, 128, 320, 576, 1088, 1536, 1984, 2624, 3904,
1566712Stomee 	8192, 12096, 16384, 20288, 24576, 28480, 32768, 36672,
1576712Stomee 	40960, 44864, 49152, 53056, 57344, 61248, 65536, 69440,
1580Sstevel@tonic-gate #endif
1590Sstevel@tonic-gate 	DBLK_MAX_CACHE, 0
1600Sstevel@tonic-gate };
1610Sstevel@tonic-gate 
1620Sstevel@tonic-gate static struct kmem_cache *dblk_cache[DBLK_MAX_CACHE / DBLK_MIN_SIZE];
1630Sstevel@tonic-gate static struct kmem_cache *mblk_cache;
1640Sstevel@tonic-gate static struct kmem_cache *dblk_esb_cache;
1650Sstevel@tonic-gate static struct kmem_cache *fthdr_cache;
1660Sstevel@tonic-gate static struct kmem_cache *ftblk_cache;
1670Sstevel@tonic-gate 
1680Sstevel@tonic-gate static void dblk_lastfree(mblk_t *mp, dblk_t *dbp);
1690Sstevel@tonic-gate static mblk_t *allocb_oversize(size_t size, int flags);
1700Sstevel@tonic-gate static int allocb_tryhard_fails;
1710Sstevel@tonic-gate static void frnop_func(void *arg);
1720Sstevel@tonic-gate frtn_t frnop = { frnop_func };
1730Sstevel@tonic-gate static void bcache_dblk_lastfree(mblk_t *mp, dblk_t *dbp);
1740Sstevel@tonic-gate 
1750Sstevel@tonic-gate static boolean_t rwnext_enter(queue_t *qp);
1760Sstevel@tonic-gate static void rwnext_exit(queue_t *qp);
1770Sstevel@tonic-gate 
1780Sstevel@tonic-gate /*
1790Sstevel@tonic-gate  * Patchable mblk/dblk kmem_cache flags.
1800Sstevel@tonic-gate  */
1810Sstevel@tonic-gate int dblk_kmem_flags = 0;
1820Sstevel@tonic-gate int mblk_kmem_flags = 0;
1830Sstevel@tonic-gate 
/*
 * kmem constructor for the fixed-size dblk caches.  'cdrarg' encodes the
 * data buffer size served by this cache.  Allocates the dblk's permanently
 * attached mblk and, for page-multiple sizes, a separate data buffer;
 * otherwise the data region lives in the same allocation, right after the
 * dblk itself.  Returns 0 on success, -1 if any allocation fails.
 */
static int
dblk_constructor(void *buf, void *cdrarg, int kmflags)
{
	dblk_t *dbp = buf;
	ssize_t msg_size = (ssize_t)cdrarg;
	size_t index;

	ASSERT(msg_size != 0);

	index = (msg_size - 1) >> DBLK_SIZE_SHIFT;

	ASSERT(index < (DBLK_MAX_CACHE >> DBLK_SIZE_SHIFT));

	/* Every dblk has an attached mblk as part of its constructed state. */
	if ((dbp->db_mblk = kmem_cache_alloc(mblk_cache, kmflags)) == NULL)
		return (-1);
	if ((msg_size & PAGEOFFSET) == 0) {
		/* Page-multiple size: data buffer is allocated separately. */
		dbp->db_base = kmem_alloc(msg_size, kmflags);
		if (dbp->db_base == NULL) {
			/* Roll back the mblk allocation on failure. */
			kmem_cache_free(mblk_cache, dbp->db_mblk);
			return (-1);
		}
	} else {
		/* Data region is contiguous, immediately after the dblk. */
		dbp->db_base = (unsigned char *)&dbp[1];
	}

	dbp->db_mblk->b_datap = dbp;
	dbp->db_cache = dblk_cache[index];
	dbp->db_lim = dbp->db_base + msg_size;
	dbp->db_free = dbp->db_lastfree = dblk_lastfree;
	dbp->db_frtnp = NULL;
	dbp->db_fthdr = NULL;
	dbp->db_credp = NULL;
	dbp->db_cpid = -1;
	dbp->db_struioflag = 0;
	dbp->db_struioun.cksum.flags = 0;
	return (0);
}
2210Sstevel@tonic-gate 
2220Sstevel@tonic-gate /*ARGSUSED*/
2230Sstevel@tonic-gate static int
2240Sstevel@tonic-gate dblk_esb_constructor(void *buf, void *cdrarg, int kmflags)
2250Sstevel@tonic-gate {
2260Sstevel@tonic-gate 	dblk_t *dbp = buf;
2270Sstevel@tonic-gate 
2280Sstevel@tonic-gate 	if ((dbp->db_mblk = kmem_cache_alloc(mblk_cache, kmflags)) == NULL)
2290Sstevel@tonic-gate 		return (-1);
2300Sstevel@tonic-gate 	dbp->db_mblk->b_datap = dbp;
2310Sstevel@tonic-gate 	dbp->db_cache = dblk_esb_cache;
2320Sstevel@tonic-gate 	dbp->db_fthdr = NULL;
2330Sstevel@tonic-gate 	dbp->db_credp = NULL;
2340Sstevel@tonic-gate 	dbp->db_cpid = -1;
2350Sstevel@tonic-gate 	dbp->db_struioflag = 0;
2360Sstevel@tonic-gate 	dbp->db_struioun.cksum.flags = 0;
2370Sstevel@tonic-gate 	return (0);
2380Sstevel@tonic-gate }
2390Sstevel@tonic-gate 
2400Sstevel@tonic-gate static int
2410Sstevel@tonic-gate bcache_dblk_constructor(void *buf, void *cdrarg, int kmflags)
2420Sstevel@tonic-gate {
2430Sstevel@tonic-gate 	dblk_t *dbp = buf;
2448752SPeter.Memishian@Sun.COM 	bcache_t *bcp = cdrarg;
2450Sstevel@tonic-gate 
2460Sstevel@tonic-gate 	if ((dbp->db_mblk = kmem_cache_alloc(mblk_cache, kmflags)) == NULL)
2470Sstevel@tonic-gate 		return (-1);
2480Sstevel@tonic-gate 
2498752SPeter.Memishian@Sun.COM 	dbp->db_base = kmem_cache_alloc(bcp->buffer_cache, kmflags);
2508752SPeter.Memishian@Sun.COM 	if (dbp->db_base == NULL) {
2510Sstevel@tonic-gate 		kmem_cache_free(mblk_cache, dbp->db_mblk);
2520Sstevel@tonic-gate 		return (-1);
2530Sstevel@tonic-gate 	}
2540Sstevel@tonic-gate 
2550Sstevel@tonic-gate 	dbp->db_mblk->b_datap = dbp;
2560Sstevel@tonic-gate 	dbp->db_cache = (void *)bcp;
2570Sstevel@tonic-gate 	dbp->db_lim = dbp->db_base + bcp->size;
2580Sstevel@tonic-gate 	dbp->db_free = dbp->db_lastfree = bcache_dblk_lastfree;
2590Sstevel@tonic-gate 	dbp->db_frtnp = NULL;
2600Sstevel@tonic-gate 	dbp->db_fthdr = NULL;
2610Sstevel@tonic-gate 	dbp->db_credp = NULL;
2620Sstevel@tonic-gate 	dbp->db_cpid = -1;
2630Sstevel@tonic-gate 	dbp->db_struioflag = 0;
2640Sstevel@tonic-gate 	dbp->db_struioun.cksum.flags = 0;
2650Sstevel@tonic-gate 	return (0);
2660Sstevel@tonic-gate }
2670Sstevel@tonic-gate 
2680Sstevel@tonic-gate /*ARGSUSED*/
2690Sstevel@tonic-gate static void
2700Sstevel@tonic-gate dblk_destructor(void *buf, void *cdrarg)
2710Sstevel@tonic-gate {
2720Sstevel@tonic-gate 	dblk_t *dbp = buf;
2730Sstevel@tonic-gate 	ssize_t msg_size = (ssize_t)cdrarg;
2740Sstevel@tonic-gate 
2750Sstevel@tonic-gate 	ASSERT(dbp->db_mblk->b_datap == dbp);
2760Sstevel@tonic-gate 	ASSERT(msg_size != 0);
2770Sstevel@tonic-gate 	ASSERT(dbp->db_struioflag == 0);
2780Sstevel@tonic-gate 	ASSERT(dbp->db_struioun.cksum.flags == 0);
2790Sstevel@tonic-gate 
2800Sstevel@tonic-gate 	if ((msg_size & PAGEOFFSET) == 0) {
2810Sstevel@tonic-gate 		kmem_free(dbp->db_base, msg_size);
2820Sstevel@tonic-gate 	}
2830Sstevel@tonic-gate 
2840Sstevel@tonic-gate 	kmem_cache_free(mblk_cache, dbp->db_mblk);
2850Sstevel@tonic-gate }
2860Sstevel@tonic-gate 
2870Sstevel@tonic-gate static void
2880Sstevel@tonic-gate bcache_dblk_destructor(void *buf, void *cdrarg)
2890Sstevel@tonic-gate {
2900Sstevel@tonic-gate 	dblk_t *dbp = buf;
2918752SPeter.Memishian@Sun.COM 	bcache_t *bcp = cdrarg;
2920Sstevel@tonic-gate 
2930Sstevel@tonic-gate 	kmem_cache_free(bcp->buffer_cache, dbp->db_base);
2940Sstevel@tonic-gate 
2950Sstevel@tonic-gate 	ASSERT(dbp->db_mblk->b_datap == dbp);
2960Sstevel@tonic-gate 	ASSERT(dbp->db_struioflag == 0);
2970Sstevel@tonic-gate 	ASSERT(dbp->db_struioun.cksum.flags == 0);
2980Sstevel@tonic-gate 
2990Sstevel@tonic-gate 	kmem_cache_free(mblk_cache, dbp->db_mblk);
3000Sstevel@tonic-gate }
3010Sstevel@tonic-gate 
3028752SPeter.Memishian@Sun.COM /* ARGSUSED */
3038752SPeter.Memishian@Sun.COM static int
3048752SPeter.Memishian@Sun.COM ftblk_constructor(void *buf, void *cdrarg, int kmflags)
3058752SPeter.Memishian@Sun.COM {
3068752SPeter.Memishian@Sun.COM 	ftblk_t *fbp = buf;
3078752SPeter.Memishian@Sun.COM 	int i;
3088752SPeter.Memishian@Sun.COM 
3098752SPeter.Memishian@Sun.COM 	bzero(fbp, sizeof (ftblk_t));
3108752SPeter.Memishian@Sun.COM 	if (str_ftstack != 0) {
3118752SPeter.Memishian@Sun.COM 		for (i = 0; i < FTBLK_EVNTS; i++)
3128752SPeter.Memishian@Sun.COM 			fbp->ev[i].stk = kmem_alloc(sizeof (ftstk_t), kmflags);
3138752SPeter.Memishian@Sun.COM 	}
3148752SPeter.Memishian@Sun.COM 
3158752SPeter.Memishian@Sun.COM 	return (0);
3168752SPeter.Memishian@Sun.COM }
3178752SPeter.Memishian@Sun.COM 
3188752SPeter.Memishian@Sun.COM /* ARGSUSED */
3198752SPeter.Memishian@Sun.COM static void
3208752SPeter.Memishian@Sun.COM ftblk_destructor(void *buf, void *cdrarg)
3218752SPeter.Memishian@Sun.COM {
3228752SPeter.Memishian@Sun.COM 	ftblk_t *fbp = buf;
3238752SPeter.Memishian@Sun.COM 	int i;
3248752SPeter.Memishian@Sun.COM 
3258752SPeter.Memishian@Sun.COM 	if (str_ftstack != 0) {
3268752SPeter.Memishian@Sun.COM 		for (i = 0; i < FTBLK_EVNTS; i++) {
3278752SPeter.Memishian@Sun.COM 			if (fbp->ev[i].stk != NULL) {
3288752SPeter.Memishian@Sun.COM 				kmem_free(fbp->ev[i].stk, sizeof (ftstk_t));
3298752SPeter.Memishian@Sun.COM 				fbp->ev[i].stk = NULL;
3308752SPeter.Memishian@Sun.COM 			}
3318752SPeter.Memishian@Sun.COM 		}
3328752SPeter.Memishian@Sun.COM 	}
3338752SPeter.Memishian@Sun.COM }
3348752SPeter.Memishian@Sun.COM 
3358752SPeter.Memishian@Sun.COM static int
3368752SPeter.Memishian@Sun.COM fthdr_constructor(void *buf, void *cdrarg, int kmflags)
3378752SPeter.Memishian@Sun.COM {
3388752SPeter.Memishian@Sun.COM 	fthdr_t *fhp = buf;
3398752SPeter.Memishian@Sun.COM 
3408752SPeter.Memishian@Sun.COM 	return (ftblk_constructor(&fhp->first, cdrarg, kmflags));
3418752SPeter.Memishian@Sun.COM }
3428752SPeter.Memishian@Sun.COM 
3438752SPeter.Memishian@Sun.COM static void
3448752SPeter.Memishian@Sun.COM fthdr_destructor(void *buf, void *cdrarg)
3458752SPeter.Memishian@Sun.COM {
3468752SPeter.Memishian@Sun.COM 	fthdr_t *fhp = buf;
3478752SPeter.Memishian@Sun.COM 
3488752SPeter.Memishian@Sun.COM 	ftblk_destructor(&fhp->first, cdrarg);
3498752SPeter.Memishian@Sun.COM }
3508752SPeter.Memishian@Sun.COM 
/*
 * One-time initialization of the STREAMS message allocator: creates the
 * common mblk cache, one dblk/data cache for each size in dblk_sizes[],
 * the esballoc dblk cache, and the flow-trace header/block caches, then
 * initializes the Multidata caches and the esballoc throttling queue.
 */
void
streams_msg_init(void)
{
	char name[40];
	size_t size;
	size_t lastsize = DBLK_MIN_SIZE;
	size_t *sizep;
	struct kmem_cache *cp;
	size_t tot_size;
	int offset;

	mblk_cache = kmem_cache_create("streams_mblk", sizeof (mblk_t), 32,
	    NULL, NULL, NULL, NULL, NULL, mblk_kmem_flags);

	for (sizep = dblk_sizes; (size = *sizep) != 0; sizep++) {

		if ((offset = (size & PAGEOFFSET)) != 0) {
			/*
			 * We are in the middle of a page, dblk should
			 * be allocated on the same page
			 */
			tot_size = size + sizeof (dblk_t);
			ASSERT((offset + sizeof (dblk_t) + sizeof (kmem_slab_t))
			    < PAGESIZE);
			ASSERT((tot_size & (DBLK_CACHE_ALIGN - 1)) == 0);

		} else {

			/*
			 * buf size is multiple of page size, dblk and
			 * buffer are allocated separately.
			 */

			ASSERT((size & (DBLK_CACHE_ALIGN - 1)) == 0);
			tot_size = sizeof (dblk_t);
		}

		/* name[40] is ample: size is at most DBLK_MAX_CACHE (73728) */
		(void) sprintf(name, "streams_dblk_%ld", size);
		cp = kmem_cache_create(name, tot_size, DBLK_CACHE_ALIGN,
		    dblk_constructor, dblk_destructor, NULL, (void *)(size),
		    NULL, dblk_kmem_flags);

		/*
		 * Point every dblk_cache[] slot between the previous cache
		 * size and this one at the new cache, so allocb() can map
		 * any request size to a cache with a single table lookup.
		 */
		while (lastsize <= size) {
			dblk_cache[(lastsize - 1) >> DBLK_SIZE_SHIFT] = cp;
			lastsize += DBLK_MIN_SIZE;
		}
	}

	dblk_esb_cache = kmem_cache_create("streams_dblk_esb", sizeof (dblk_t),
	    DBLK_CACHE_ALIGN, dblk_esb_constructor, dblk_destructor, NULL,
	    (void *)sizeof (dblk_t), NULL, dblk_kmem_flags);
	fthdr_cache = kmem_cache_create("streams_fthdr", sizeof (fthdr_t), 32,
	    fthdr_constructor, fthdr_destructor, NULL, NULL, NULL, 0);
	ftblk_cache = kmem_cache_create("streams_ftblk", sizeof (ftblk_t), 32,
	    ftblk_constructor, ftblk_destructor, NULL, NULL, NULL, 0);

	/* Initialize Multidata caches */
	mmd_init();

	/* initialize throttling queue for esballoc */
	esballoc_queue_init();
}
4130Sstevel@tonic-gate 
/*
 * Allocate a message block able to hold at least 'size' bytes of data.
 * The priority argument is unused (retained for interface compatibility).
 * Sizes up to DBLK_MAX_CACHE are served from the fixed-size dblk caches
 * via the dblk_cache[] lookup table; larger sizes go through
 * allocb_oversize().  Never blocks; returns NULL on allocation failure.
 */
/*ARGSUSED*/
mblk_t *
allocb(size_t size, uint_t pri)
{
	dblk_t *dbp;
	mblk_t *mp;
	size_t index;

	index =  (size - 1)  >> DBLK_SIZE_SHIFT;

	if (index >= (DBLK_MAX_CACHE >> DBLK_SIZE_SHIFT)) {
		if (size != 0) {
			mp = allocb_oversize(size, KM_NOSLEEP);
			goto out;
		}
		/* size == 0: (size - 1) wrapped around; use smallest cache */
		index = 0;
	}

	if ((dbp = kmem_cache_alloc(dblk_cache[index], KM_NOSLEEP)) == NULL) {
		mp = NULL;
		goto out;
	}

	mp = dbp->db_mblk;
	/* Set db_ref = 1, db_type = M_DATA, flags/struioflag = 0 in one store */
	DBLK_RTFU_WORD(dbp) = DBLK_RTFU(1, M_DATA, 0, 0);
	mp->b_next = mp->b_prev = mp->b_cont = NULL;
	mp->b_rptr = mp->b_wptr = dbp->db_base;
	mp->b_queue = NULL;
	/* Clears b_band and b_flag with a single word store */
	MBLK_BAND_FLAG_WORD(mp) = 0;
	STR_FTALLOC(&dbp->db_fthdr, FTEV_ALLOCB, size);
out:
	FTRACE_1("allocb(): mp=0x%p", (uintptr_t)mp);

	return (mp);
}
4490Sstevel@tonic-gate 
4508778SErik.Nordmark@Sun.COM /*
4518778SErik.Nordmark@Sun.COM  * Allocate an mblk taking db_credp and db_cpid from the template.
4528778SErik.Nordmark@Sun.COM  * Allow the cred to be NULL.
4538778SErik.Nordmark@Sun.COM  */
4540Sstevel@tonic-gate mblk_t *
4550Sstevel@tonic-gate allocb_tmpl(size_t size, const mblk_t *tmpl)
4560Sstevel@tonic-gate {
4570Sstevel@tonic-gate 	mblk_t *mp = allocb(size, 0);
4580Sstevel@tonic-gate 
4590Sstevel@tonic-gate 	if (mp != NULL) {
4608778SErik.Nordmark@Sun.COM 		dblk_t *src = tmpl->b_datap;
4618778SErik.Nordmark@Sun.COM 		dblk_t *dst = mp->b_datap;
462*10163SKen.Powell@Sun.COM 		cred_t *cr;
463*10163SKen.Powell@Sun.COM 		pid_t cpid;
464*10163SKen.Powell@Sun.COM 
465*10163SKen.Powell@Sun.COM 		cr = msg_getcred(tmpl, &cpid);
4660Sstevel@tonic-gate 		if (cr != NULL)
4678778SErik.Nordmark@Sun.COM 			crhold(dst->db_credp = cr);
468*10163SKen.Powell@Sun.COM 		dst->db_cpid = cpid;
4698778SErik.Nordmark@Sun.COM 		dst->db_type = src->db_type;
4708778SErik.Nordmark@Sun.COM 	}
4718778SErik.Nordmark@Sun.COM 	return (mp);
4728778SErik.Nordmark@Sun.COM }
4738778SErik.Nordmark@Sun.COM 
4748778SErik.Nordmark@Sun.COM mblk_t *
4758778SErik.Nordmark@Sun.COM allocb_cred(size_t size, cred_t *cr, pid_t cpid)
4768778SErik.Nordmark@Sun.COM {
4778778SErik.Nordmark@Sun.COM 	mblk_t *mp = allocb(size, 0);
4788778SErik.Nordmark@Sun.COM 
4798778SErik.Nordmark@Sun.COM 	ASSERT(cr != NULL);
4808778SErik.Nordmark@Sun.COM 	if (mp != NULL) {
4818778SErik.Nordmark@Sun.COM 		dblk_t *dbp = mp->b_datap;
4828778SErik.Nordmark@Sun.COM 
4838778SErik.Nordmark@Sun.COM 		crhold(dbp->db_credp = cr);
4848778SErik.Nordmark@Sun.COM 		dbp->db_cpid = cpid;
4850Sstevel@tonic-gate 	}
4860Sstevel@tonic-gate 	return (mp);
4870Sstevel@tonic-gate }
4880Sstevel@tonic-gate 
4890Sstevel@tonic-gate mblk_t *
4908778SErik.Nordmark@Sun.COM allocb_cred_wait(size_t size, uint_t flags, int *error, cred_t *cr, pid_t cpid)
4910Sstevel@tonic-gate {
4928778SErik.Nordmark@Sun.COM 	mblk_t *mp = allocb_wait(size, 0, flags, error);
4938778SErik.Nordmark@Sun.COM 
4948778SErik.Nordmark@Sun.COM 	ASSERT(cr != NULL);
4958778SErik.Nordmark@Sun.COM 	if (mp != NULL) {
4968778SErik.Nordmark@Sun.COM 		dblk_t *dbp = mp->b_datap;
4978778SErik.Nordmark@Sun.COM 
4988778SErik.Nordmark@Sun.COM 		crhold(dbp->db_credp = cr);
4998778SErik.Nordmark@Sun.COM 		dbp->db_cpid = cpid;
5008778SErik.Nordmark@Sun.COM 	}
5010Sstevel@tonic-gate 
5020Sstevel@tonic-gate 	return (mp);
5030Sstevel@tonic-gate }
5040Sstevel@tonic-gate 
/*
 * Extract the db_cred (and optionally db_cpid) from a message.
 * We find the first mblk which has a non-NULL db_cred and use that.
 * If none found we return NULL and set *cpidp (if requested) to NOPID.
 * Does NOT get a hold on the cred.
 */
cred_t *
msg_getcred(const mblk_t *mp, pid_t *cpidp)
{
	cred_t *cr = NULL;
	cred_t *cr2;
	mblk_t *mp2;

	while (mp != NULL) {
		dblk_t *dbp = mp->b_datap;

		cr = dbp->db_credp;
		if (cr == NULL) {
			mp = mp->b_cont;
			continue;
		}
		if (cpidp != NULL)
			*cpidp = dbp->db_cpid;

#ifdef DEBUG
		/*
		 * Normally there should be at most one db_credp in a message.
		 * But if there are multiple (as in the case of some M_IOC*
		 * and some internal messages in TCP/IP bind logic) then
		 * they must be identical in the normal case.
		 * However, a socket can be shared between different uids
		 * in which case data queued in TCP would be from different
		 * creds. Thus we can only assert for the zoneid being the
		 * same. Due to Multi-level Level Ports for TX, some
		 * cred_t can have a NULL cr_zone, and we skip the comparison
		 * in that case.
		 */
		mp2 = mp->b_cont;
		while (mp2 != NULL) {
			cr2 = DB_CRED(mp2);
			if (cr2 != NULL) {
				DTRACE_PROBE2(msg__getcred,
				    cred_t *, cr, cred_t *, cr2);
				ASSERT(crgetzoneid(cr) == crgetzoneid(cr2) ||
				    crgetzone(cr) == NULL ||
				    crgetzone(cr2) == NULL);
			}
			mp2 = mp2->b_cont;
		}
#endif
		return (cr);
	}
	/* No mblk in the chain carried a cred. */
	if (cpidp != NULL)
		*cpidp = NOPID;
	return (NULL);
}
5618778SErik.Nordmark@Sun.COM 
/*
 * Variant of msg_getcred which, when a cred is found
 * 1. Returns with a hold on the cred
 * 2. Clears the first cred in the mblk.
 * This is more efficient to use than a msg_getcred() + crhold() when
 * the message is freed after the cred has been extracted.
 *
 * The caller is responsible for ensuring that there is no other reference
 * on the message since db_credp can not be cleared when there are other
 * references.
 */
cred_t *
msg_extractcred(mblk_t *mp, pid_t *cpidp)
{
	cred_t *cr = NULL;
	cred_t *cr2;
	mblk_t *mp2;

	while (mp != NULL) {
		dblk_t *dbp = mp->b_datap;

		cr = dbp->db_credp;
		if (cr == NULL) {
			mp = mp->b_cont;
			continue;
		}
		/* Single-reference requirement: safe to clear db_credp. */
		ASSERT(dbp->db_ref == 1);
		dbp->db_credp = NULL;
		if (cpidp != NULL)
			*cpidp = dbp->db_cpid;
#ifdef DEBUG
		/*
		 * Normally there should be at most one db_credp in a message.
		 * But if there are multiple (as in the case of some M_IOC*
		 * and some internal messages in TCP/IP bind logic) then
		 * they must be identical in the normal case.
		 * However, a socket can be shared between different uids
		 * in which case data queued in TCP would be from different
		 * creds. Thus we can only assert for the zoneid being the
		 * same. Due to Multi-level Level Ports for TX, some
		 * cred_t can have a NULL cr_zone, and we skip the comparison
		 * in that case.
		 */
		mp2 = mp->b_cont;
		while (mp2 != NULL) {
			cr2 = DB_CRED(mp2);
			if (cr2 != NULL) {
				DTRACE_PROBE2(msg__extractcred,
				    cred_t *, cr, cred_t *, cr2);
				ASSERT(crgetzoneid(cr) == crgetzoneid(cr2) ||
				    crgetzone(cr) == NULL ||
				    crgetzone(cr2) == NULL);
			}
			mp2 = mp2->b_cont;
		}
#endif
		/*
		 * The hold the caller receives is the one transferred from
		 * the (now cleared) db_credp; no additional crhold needed.
		 */
		return (cr);
	}
	return (NULL);
}
6228778SErik.Nordmark@Sun.COM /*
6238778SErik.Nordmark@Sun.COM  * Get the label for a message. Uses the first mblk in the message
6248778SErik.Nordmark@Sun.COM  * which has a non-NULL db_credp.
6258778SErik.Nordmark@Sun.COM  * Returns NULL if there is no credp.
6268778SErik.Nordmark@Sun.COM  */
6278778SErik.Nordmark@Sun.COM extern struct ts_label_s *
6288778SErik.Nordmark@Sun.COM msg_getlabel(const mblk_t *mp)
6298778SErik.Nordmark@Sun.COM {
6308778SErik.Nordmark@Sun.COM 	cred_t *cr = msg_getcred(mp, NULL);
6318778SErik.Nordmark@Sun.COM 
6328778SErik.Nordmark@Sun.COM 	if (cr == NULL)
6338778SErik.Nordmark@Sun.COM 		return (NULL);
6348778SErik.Nordmark@Sun.COM 
6358778SErik.Nordmark@Sun.COM 	return (crgetlabel(cr));
6360Sstevel@tonic-gate }
6370Sstevel@tonic-gate 
6380Sstevel@tonic-gate void
6390Sstevel@tonic-gate freeb(mblk_t *mp)
6400Sstevel@tonic-gate {
6410Sstevel@tonic-gate 	dblk_t *dbp = mp->b_datap;
6420Sstevel@tonic-gate 
6430Sstevel@tonic-gate 	ASSERT(dbp->db_ref > 0);
6440Sstevel@tonic-gate 	ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
6450Sstevel@tonic-gate 	FTRACE_1("freeb(): mp=0x%lx", (uintptr_t)mp);
6460Sstevel@tonic-gate 
6470Sstevel@tonic-gate 	STR_FTEVENT_MBLK(mp, caller(), FTEV_FREEB, dbp->db_ref);
6480Sstevel@tonic-gate 
6490Sstevel@tonic-gate 	dbp->db_free(mp, dbp);
6500Sstevel@tonic-gate }
6510Sstevel@tonic-gate 
6520Sstevel@tonic-gate void
6530Sstevel@tonic-gate freemsg(mblk_t *mp)
6540Sstevel@tonic-gate {
6550Sstevel@tonic-gate 	FTRACE_1("freemsg(): mp=0x%lx", (uintptr_t)mp);
6560Sstevel@tonic-gate 	while (mp) {
6570Sstevel@tonic-gate 		dblk_t *dbp = mp->b_datap;
6580Sstevel@tonic-gate 		mblk_t *mp_cont = mp->b_cont;
6590Sstevel@tonic-gate 
6600Sstevel@tonic-gate 		ASSERT(dbp->db_ref > 0);
6610Sstevel@tonic-gate 		ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
6620Sstevel@tonic-gate 
6630Sstevel@tonic-gate 		STR_FTEVENT_MBLK(mp, caller(), FTEV_FREEB, dbp->db_ref);
6640Sstevel@tonic-gate 
6650Sstevel@tonic-gate 		dbp->db_free(mp, dbp);
6660Sstevel@tonic-gate 		mp = mp_cont;
6670Sstevel@tonic-gate 	}
6680Sstevel@tonic-gate }
6690Sstevel@tonic-gate 
6700Sstevel@tonic-gate /*
6710Sstevel@tonic-gate  * Reallocate a block for another use.  Try hard to use the old block.
6720Sstevel@tonic-gate  * If the old data is wanted (copy), leave b_wptr at the end of the data,
6730Sstevel@tonic-gate  * otherwise return b_wptr = b_rptr.
6740Sstevel@tonic-gate  *
6750Sstevel@tonic-gate  * This routine is private and unstable.
6760Sstevel@tonic-gate  */
mblk_t	*
reallocb(mblk_t *mp, size_t size, uint_t copy)
{
	mblk_t		*mp1;
	unsigned char	*old_rptr;
	ptrdiff_t	cur_size;

	/* A NULL mblk degenerates to a plain allocation. */
	if (mp == NULL)
		return (allocb(size, BPRI_HI));

	cur_size = mp->b_wptr - mp->b_rptr;	/* bytes of valid data */
	old_rptr = mp->b_rptr;

	ASSERT(mp->b_datap->db_ref != 0);

	if (mp->b_datap->db_ref == 1 && MBLKSIZE(mp) >= size) {
		/*
		 * If the data is wanted and it will fit where it is, no
		 * work is required.
		 * (db_lim - b_rptr is non-negative, so the implicit
		 * signed/unsigned conversion against size is benign.)
		 */
		if (copy && mp->b_datap->db_lim - mp->b_rptr >= size)
			return (mp);

		/*
		 * Reuse the block: rewind both pointers to db_base.  The
		 * old data (old_rptr .. old_rptr + cur_size) is still in
		 * place for the copy below; note source and destination
		 * may overlap then -- bcopy handles overlapping copies.
		 */
		mp->b_wptr = mp->b_rptr = mp->b_datap->db_base;
		mp1 = mp;
	} else if ((mp1 = allocb_tmpl(size, mp)) != NULL) {
		/* XXX other mp state could be copied too, db_flags ... ? */
		mp1->b_cont = mp->b_cont;
	} else {
		return (NULL);
	}

	if (copy) {
		bcopy(old_rptr, mp1->b_rptr, cur_size);
		mp1->b_wptr = mp1->b_rptr + cur_size;
	}

	/* A new block was allocated: release the old one. */
	if (mp != mp1)
		freeb(mp);

	return (mp1);
}
7190Sstevel@tonic-gate 
/*
 * Default last-reference free routine for ordinary dblks: scrub all
 * per-message state and return the dblk (whose mblk is paired with it,
 * per the dbp->db_mblk == mp assertion) to its kmem cache.
 */
static void
dblk_lastfree(mblk_t *mp, dblk_t *dbp)
{
	ASSERT(dbp->db_mblk == mp);
	if (dbp->db_fthdr != NULL)
		str_ftfree(dbp);

	/* set credp and projid to be 'unspecified' before returning to cache */
	if (dbp->db_credp != NULL) {
		crfree(dbp->db_credp);
		dbp->db_credp = NULL;
	}
	dbp->db_cpid = -1;

	/* Reset the struioflag and the checksum flag fields */
	dbp->db_struioflag = 0;
	dbp->db_struioun.cksum.flags = 0;

	/* and the COOKED and/or UIOA flag(s) */
	dbp->db_flags &= ~(DBLK_COOKED | DBLK_UIOA);

	kmem_cache_free(dbp->db_cache, dbp);
}
7430Sstevel@tonic-gate 
/*
 * Release one reference to a shared dblk (installed as db_free by
 * dupb()).  A non-last reference frees only the mblk wrapper; the last
 * reference restores db_free to db_lastfree and disposes of the dblk.
 */
static void
dblk_decref(mblk_t *mp, dblk_t *dbp)
{
	if (dbp->db_ref != 1) {
		uint32_t rtfu = atomic_add_32_nv(&DBLK_RTFU_WORD(dbp),
		    -(1 << DBLK_RTFU_SHIFT(db_ref)));
		/*
		 * atomic_add_32_nv() just decremented db_ref, so we no longer
		 * have a reference to the dblk, which means another thread
		 * could free it.  Therefore we cannot examine the dblk to
		 * determine whether ours was the last reference.  Instead,
		 * we extract the new and minimum reference counts from rtfu.
		 * Note that all we're really saying is "if (ref != refmin)".
		 */
		if (((rtfu >> DBLK_RTFU_SHIFT(db_ref)) & DBLK_REFMAX) !=
		    ((rtfu >> DBLK_RTFU_SHIFT(db_flags)) & DBLK_REFMIN)) {
			/* Not the last reference: recycle only the mblk. */
			kmem_cache_free(mblk_cache, mp);
			return;
		}
	}
	/* Last reference: hand off to the underlying lastfree routine. */
	dbp->db_mblk = mp;
	dbp->db_free = dbp->db_lastfree;
	dbp->db_lastfree(mp, dbp);
}
7680Sstevel@tonic-gate 
/*
 * Duplicate a message block: allocate a new mblk that shares the dblk
 * (and therefore the data) of 'mp'.  Returns NULL if no mblk can be
 * allocated or db_ref is already at its maximum.
 */
mblk_t *
dupb(mblk_t *mp)
{
	dblk_t *dbp = mp->b_datap;
	mblk_t *new_mp;
	uint32_t oldrtfu, newrtfu;

	if ((new_mp = kmem_cache_alloc(mblk_cache, KM_NOSLEEP)) == NULL)
		goto out;

	/* Copy read/write pointers and band/flag state; share the dblk. */
	new_mp->b_next = new_mp->b_prev = new_mp->b_cont = NULL;
	new_mp->b_rptr = mp->b_rptr;
	new_mp->b_wptr = mp->b_wptr;
	new_mp->b_datap = dbp;
	new_mp->b_queue = NULL;
	MBLK_BAND_FLAG_WORD(new_mp) = MBLK_BAND_FLAG_WORD(mp);

	STR_FTEVENT_MBLK(mp, caller(), FTEV_DUPB, dbp->db_ref);

	/*
	 * Once a second reference may exist, frees must route through
	 * dblk_decref().  This is installed before db_ref is bumped;
	 * presumably safe because the caller holds its own reference
	 * throughout -- NOTE(review): confirm against dblk_decref().
	 */
	dbp->db_free = dblk_decref;
	do {
		ASSERT(dbp->db_ref > 0);
		oldrtfu = DBLK_RTFU_WORD(dbp);
		newrtfu = oldrtfu + (1 << DBLK_RTFU_SHIFT(db_ref));
		/*
		 * If db_ref is maxed out we can't dup this message anymore.
		 */
		if ((oldrtfu & DBLK_RTFU_REF_MASK) == DBLK_RTFU_REF_MASK) {
			kmem_cache_free(mblk_cache, new_mp);
			new_mp = NULL;
			goto out;
		}
		/* Retry if another thread changed the rtfu word meanwhile. */
	} while (cas32(&DBLK_RTFU_WORD(dbp), oldrtfu, newrtfu) != oldrtfu);

out:
	FTRACE_1("dupb(): new_mp=0x%lx", (uintptr_t)new_mp);
	return (new_mp);
}
8070Sstevel@tonic-gate 
/*
 * Last-reference free routine for desballoc'd blocks: run the
 * caller-supplied free routine for the external data buffer
 * synchronously, then scrub state and recycle the dblk.
 */
static void
dblk_lastfree_desb(mblk_t *mp, dblk_t *dbp)
{
	frtn_t *frp = dbp->db_frtnp;

	ASSERT(dbp->db_mblk == mp);
	/* Release the external buffer first, via the caller's callback. */
	frp->free_func(frp->free_arg);
	if (dbp->db_fthdr != NULL)
		str_ftfree(dbp);

	/* set credp and projid to be 'unspecified' before returning to cache */
	if (dbp->db_credp != NULL) {
		crfree(dbp->db_credp);
		dbp->db_credp = NULL;
	}
	dbp->db_cpid = -1;
	dbp->db_struioflag = 0;
	dbp->db_struioun.cksum.flags = 0;

	kmem_cache_free(dbp->db_cache, dbp);
}
8290Sstevel@tonic-gate 
/*
 * No-op buffer free routine, used (presumably via the frnop frtn_t)
 * when no callback action is needed at free time -- e.g. the oversize
 * path frees its buffer itself in dblk_lastfree_oversize().
 */
/*ARGSUSED*/
static void
frnop_func(void *arg)
{
}
8350Sstevel@tonic-gate 
8360Sstevel@tonic-gate /*
8370Sstevel@tonic-gate  * Generic esballoc used to implement the four flavors: [d]esballoc[a].
8380Sstevel@tonic-gate  */
static mblk_t *
gesballoc(unsigned char *base, size_t size, uint32_t db_rtfu, frtn_t *frp,
	void (*lastfree)(mblk_t *, dblk_t *), int kmflags)
{
	dblk_t *dbp;
	mblk_t *mp;

	ASSERT(base != NULL && frp != NULL);

	if ((dbp = kmem_cache_alloc(dblk_esb_cache, kmflags)) == NULL) {
		mp = NULL;
		goto out;
	}

	/* Point the dblk at the caller-supplied external buffer. */
	mp = dbp->db_mblk;
	dbp->db_base = base;
	dbp->db_lim = base + size;
	dbp->db_free = dbp->db_lastfree = lastfree;
	dbp->db_frtnp = frp;
	/* db_rtfu carries the initial ref count, type, flags and band. */
	DBLK_RTFU_WORD(dbp) = db_rtfu;
	/* The mblk starts out empty: rptr == wptr == base. */
	mp->b_next = mp->b_prev = mp->b_cont = NULL;
	mp->b_rptr = mp->b_wptr = base;
	mp->b_queue = NULL;
	MBLK_BAND_FLAG_WORD(mp) = 0;

out:
	FTRACE_1("gesballoc(): mp=0x%lx", (uintptr_t)mp);
	return (mp);
}
8680Sstevel@tonic-gate 
/*
 * esballoc: allocate a message block whose data area is the caller's
 * <base, size> buffer.  frp names the free routine for that buffer;
 * the last free is dispatched through freebs_enqueue.  'pri' is
 * accepted for compatibility only.  Returns NULL on failure.
 */
/*ARGSUSED*/
mblk_t *
esballoc(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
{
	mblk_t *mp;

	/*
	 * Note that this is structured to allow the common case (i.e.
	 * STREAMS flowtracing disabled) to call gesballoc() with tail
	 * call optimization.
	 */
	if (!str_ftnever) {
		mp = gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
		    frp, freebs_enqueue, KM_NOSLEEP);

		if (mp != NULL)
			STR_FTALLOC(&DB_FTHDR(mp), FTEV_ESBALLOC, size);
		return (mp);
	}

	return (gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
	    frp, freebs_enqueue, KM_NOSLEEP));
}
8920Sstevel@tonic-gate 
8930Sstevel@tonic-gate /*
8940Sstevel@tonic-gate  * Same as esballoc() but sleeps waiting for memory.
8950Sstevel@tonic-gate  */
/*ARGSUSED*/
mblk_t *
esballoc_wait(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
{
	mblk_t *mp;

	/*
	 * Note that this is structured to allow the common case (i.e.
	 * STREAMS flowtracing disabled) to call gesballoc() with tail
	 * call optimization.
	 */
	if (!str_ftnever) {
		mp = gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
		    frp, freebs_enqueue, KM_SLEEP);

		/* KM_SLEEP allocation can't fail, hence no NULL check. */
		STR_FTALLOC(&DB_FTHDR(mp), FTEV_ESBALLOC, size);
		return (mp);
	}

	return (gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
	    frp, freebs_enqueue, KM_SLEEP));
}
9180Sstevel@tonic-gate 
/*
 * desballoc: like esballoc(), but the last free runs through
 * dblk_lastfree_desb(), which invokes frp->free_func synchronously
 * rather than dispatching through freebs_enqueue.
 */
/*ARGSUSED*/
mblk_t *
desballoc(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
{
	mblk_t *mp;

	/*
	 * Note that this is structured to allow the common case (i.e.
	 * STREAMS flowtracing disabled) to call gesballoc() with tail
	 * call optimization.
	 */
	if (!str_ftnever) {
		mp = gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
		    frp, dblk_lastfree_desb, KM_NOSLEEP);

		if (mp != NULL)
			STR_FTALLOC(&DB_FTHDR(mp), FTEV_DESBALLOC, size);
		return (mp);
	}

	return (gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
	    frp, dblk_lastfree_desb, KM_NOSLEEP));
}
9420Sstevel@tonic-gate 
/*
 * esballoca: like esballoc() except the dblk starts with a reference
 * count of 2 (DBLK_RTFU(2, ...)) instead of 1.
 */
/*ARGSUSED*/
mblk_t *
esballoca(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
{
	mblk_t *mp;

	/*
	 * Note that this is structured to allow the common case (i.e.
	 * STREAMS flowtracing disabled) to call gesballoc() with tail
	 * call optimization.
	 */
	if (!str_ftnever) {
		mp = gesballoc(base, size, DBLK_RTFU(2, M_DATA, 0, 0),
		    frp, freebs_enqueue, KM_NOSLEEP);

		if (mp != NULL)
			STR_FTALLOC(&DB_FTHDR(mp), FTEV_ESBALLOCA, size);
		return (mp);
	}

	return (gesballoc(base, size, DBLK_RTFU(2, M_DATA, 0, 0),
	    frp, freebs_enqueue, KM_NOSLEEP));
}
9660Sstevel@tonic-gate 
/*
 * desballoca: like desballoc() except the dblk starts with a reference
 * count of 2 (DBLK_RTFU(2, ...)) instead of 1.
 */
/*ARGSUSED*/
mblk_t *
desballoca(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
{
	mblk_t *mp;

	/*
	 * Note that this is structured to allow the common case (i.e.
	 * STREAMS flowtracing disabled) to call gesballoc() with tail
	 * call optimization.
	 */
	if (!str_ftnever) {
		mp = gesballoc(base, size, DBLK_RTFU(2, M_DATA, 0, 0),
		    frp, dblk_lastfree_desb, KM_NOSLEEP);

		if (mp != NULL)
			STR_FTALLOC(&DB_FTHDR(mp), FTEV_DESBALLOCA, size);
		return (mp);
	}

	return (gesballoc(base, size, DBLK_RTFU(2, M_DATA, 0, 0),
	    frp, dblk_lastfree_desb, KM_NOSLEEP));
}
9900Sstevel@tonic-gate 
/*
 * Last-reference free routine for bcache-backed dblks: scrub state,
 * return the dblk to the bcache, and complete a deferred
 * bcache_destroy() when the last outstanding block comes home.
 */
static void
bcache_dblk_lastfree(mblk_t *mp, dblk_t *dbp)
{
	bcache_t *bcp = dbp->db_cache;

	ASSERT(dbp->db_mblk == mp);
	if (dbp->db_fthdr != NULL)
		str_ftfree(dbp);

	/* set credp and projid to be 'unspecified' before returning to cache */
	if (dbp->db_credp != NULL) {
		crfree(dbp->db_credp);
		dbp->db_credp = NULL;
	}
	dbp->db_cpid = -1;
	dbp->db_struioflag = 0;
	dbp->db_struioun.cksum.flags = 0;

	mutex_enter(&bcp->mutex);
	kmem_cache_free(bcp->dblk_cache, dbp);
	bcp->alloc--;

	/*
	 * bcache_destroy() was called while blocks were outstanding
	 * (destroy != 0): the last block back tears everything down.
	 * Note the mutex is exited before being destroyed.
	 */
	if (bcp->alloc == 0 && bcp->destroy != 0) {
		kmem_cache_destroy(bcp->dblk_cache);
		kmem_cache_destroy(bcp->buffer_cache);
		mutex_exit(&bcp->mutex);
		mutex_destroy(&bcp->mutex);
		kmem_free(bcp, sizeof (bcache_t));
	} else {
		mutex_exit(&bcp->mutex);
	}
}
10230Sstevel@tonic-gate 
10240Sstevel@tonic-gate bcache_t *
10250Sstevel@tonic-gate bcache_create(char *name, size_t size, uint_t align)
10260Sstevel@tonic-gate {
10270Sstevel@tonic-gate 	bcache_t *bcp;
10280Sstevel@tonic-gate 	char buffer[255];
10290Sstevel@tonic-gate 
10300Sstevel@tonic-gate 	ASSERT((align & (align - 1)) == 0);
10310Sstevel@tonic-gate 
10328752SPeter.Memishian@Sun.COM 	if ((bcp = kmem_alloc(sizeof (bcache_t), KM_NOSLEEP)) == NULL)
10330Sstevel@tonic-gate 		return (NULL);
10340Sstevel@tonic-gate 
10350Sstevel@tonic-gate 	bcp->size = size;
10360Sstevel@tonic-gate 	bcp->align = align;
10370Sstevel@tonic-gate 	bcp->alloc = 0;
10380Sstevel@tonic-gate 	bcp->destroy = 0;
10390Sstevel@tonic-gate 
10400Sstevel@tonic-gate 	mutex_init(&bcp->mutex, NULL, MUTEX_DRIVER, NULL);
10410Sstevel@tonic-gate 
10420Sstevel@tonic-gate 	(void) sprintf(buffer, "%s_buffer_cache", name);
10430Sstevel@tonic-gate 	bcp->buffer_cache = kmem_cache_create(buffer, size, align, NULL, NULL,
10440Sstevel@tonic-gate 	    NULL, NULL, NULL, 0);
10450Sstevel@tonic-gate 	(void) sprintf(buffer, "%s_dblk_cache", name);
10460Sstevel@tonic-gate 	bcp->dblk_cache = kmem_cache_create(buffer, sizeof (dblk_t),
10470Sstevel@tonic-gate 	    DBLK_CACHE_ALIGN, bcache_dblk_constructor, bcache_dblk_destructor,
10486707Sbrutus 	    NULL, (void *)bcp, NULL, 0);
10490Sstevel@tonic-gate 
10500Sstevel@tonic-gate 	return (bcp);
10510Sstevel@tonic-gate }
10520Sstevel@tonic-gate 
/*
 * Tear down a bcache created by bcache_create().  If blocks are still
 * outstanding (alloc != 0), only mark it for destruction; the last
 * block returned via bcache_dblk_lastfree() finishes the job.
 */
void
bcache_destroy(bcache_t *bcp)
{
	ASSERT(bcp != NULL);

	mutex_enter(&bcp->mutex);
	if (bcp->alloc == 0) {
		/* Nothing outstanding: destroy everything immediately. */
		kmem_cache_destroy(bcp->dblk_cache);
		kmem_cache_destroy(bcp->buffer_cache);
		mutex_exit(&bcp->mutex);
		mutex_destroy(&bcp->mutex);
		kmem_free(bcp, sizeof (bcache_t));
	} else {
		bcp->destroy++;
		mutex_exit(&bcp->mutex);
	}
}
10700Sstevel@tonic-gate 
/*
 * Allocate an mblk backed by the given bcache.  Returns NULL if the
 * bcache is being destroyed or the allocation fails.  'pri' is unused.
 */
/*ARGSUSED*/
mblk_t *
bcache_allocb(bcache_t *bcp, uint_t pri)
{
	dblk_t *dbp;
	mblk_t *mp = NULL;

	ASSERT(bcp != NULL);

	mutex_enter(&bcp->mutex);
	/* Refuse new allocations once bcache_destroy() has been called. */
	if (bcp->destroy != 0) {
		mutex_exit(&bcp->mutex);
		goto out;
	}

	if ((dbp = kmem_cache_alloc(bcp->dblk_cache, KM_NOSLEEP)) == NULL) {
		mutex_exit(&bcp->mutex);
		goto out;
	}
	/* Count outstanding blocks; balanced in bcache_dblk_lastfree(). */
	bcp->alloc++;
	mutex_exit(&bcp->mutex);

	ASSERT(((uintptr_t)(dbp->db_base) & (bcp->align - 1)) == 0);

	/* Initialize the paired mblk as an empty M_DATA block. */
	mp = dbp->db_mblk;
	DBLK_RTFU_WORD(dbp) = DBLK_RTFU(1, M_DATA, 0, 0);
	mp->b_next = mp->b_prev = mp->b_cont = NULL;
	mp->b_rptr = mp->b_wptr = dbp->db_base;
	mp->b_queue = NULL;
	MBLK_BAND_FLAG_WORD(mp) = 0;
	STR_FTALLOC(&dbp->db_fthdr, FTEV_BCALLOCB, bcp->size);
out:
	FTRACE_1("bcache_allocb(): mp=0x%p", (uintptr_t)mp);

	return (mp);
}
11070Sstevel@tonic-gate 
/*
 * Last-reference free routine for oversize blocks: scrub state, free
 * the kmem_alloc'd data buffer explicitly, and recycle the dblk.
 */
static void
dblk_lastfree_oversize(mblk_t *mp, dblk_t *dbp)
{
	ASSERT(dbp->db_mblk == mp);
	if (dbp->db_fthdr != NULL)
		str_ftfree(dbp);

	/* set credp and projid to be 'unspecified' before returning to cache */
	if (dbp->db_credp != NULL) {
		crfree(dbp->db_credp);
		dbp->db_credp = NULL;
	}
	dbp->db_cpid = -1;
	dbp->db_struioflag = 0;
	dbp->db_struioun.cksum.flags = 0;

	/* The data buffer came straight from kmem_alloc(); release it. */
	kmem_free(dbp->db_base, dbp->db_lim - dbp->db_base);
	kmem_cache_free(dbp->db_cache, dbp);
}
11270Sstevel@tonic-gate 
11280Sstevel@tonic-gate static mblk_t *
11290Sstevel@tonic-gate allocb_oversize(size_t size, int kmflags)
11300Sstevel@tonic-gate {
11310Sstevel@tonic-gate 	mblk_t *mp;
11320Sstevel@tonic-gate 	void *buf;
11330Sstevel@tonic-gate 
11340Sstevel@tonic-gate 	size = P2ROUNDUP(size, DBLK_CACHE_ALIGN);
11350Sstevel@tonic-gate 	if ((buf = kmem_alloc(size, kmflags)) == NULL)
11360Sstevel@tonic-gate 		return (NULL);
11370Sstevel@tonic-gate 	if ((mp = gesballoc(buf, size, DBLK_RTFU(1, M_DATA, 0, 0),
11380Sstevel@tonic-gate 	    &frnop, dblk_lastfree_oversize, kmflags)) == NULL)
11390Sstevel@tonic-gate 		kmem_free(buf, size);
11400Sstevel@tonic-gate 
11410Sstevel@tonic-gate 	if (mp != NULL)
11420Sstevel@tonic-gate 		STR_FTALLOC(&DB_FTHDR(mp), FTEV_ALLOCBIG, size);
11430Sstevel@tonic-gate 
11440Sstevel@tonic-gate 	return (mp);
11450Sstevel@tonic-gate }
11460Sstevel@tonic-gate 
11470Sstevel@tonic-gate mblk_t *
11480Sstevel@tonic-gate allocb_tryhard(size_t target_size)
11490Sstevel@tonic-gate {
11500Sstevel@tonic-gate 	size_t size;
11510Sstevel@tonic-gate 	mblk_t *bp;
11520Sstevel@tonic-gate 
11530Sstevel@tonic-gate 	for (size = target_size; size < target_size + 512;
11540Sstevel@tonic-gate 	    size += DBLK_CACHE_ALIGN)
11550Sstevel@tonic-gate 		if ((bp = allocb(size, BPRI_HI)) != NULL)
11560Sstevel@tonic-gate 			return (bp);
11570Sstevel@tonic-gate 	allocb_tryhard_fails++;
11580Sstevel@tonic-gate 	return (NULL);
11590Sstevel@tonic-gate }
11600Sstevel@tonic-gate 
11610Sstevel@tonic-gate /*
11620Sstevel@tonic-gate  * This routine is consolidation private for STREAMS internal use
11630Sstevel@tonic-gate  * This routine may only be called from sync routines (i.e., not
11640Sstevel@tonic-gate  * from put or service procedures).  It is located here (rather
11650Sstevel@tonic-gate  * than strsubr.c) so that we don't have to expose all of the
11660Sstevel@tonic-gate  * allocb() implementation details in header files.
11670Sstevel@tonic-gate  */
mblk_t *
allocb_wait(size_t size, uint_t pri, uint_t flags, int *error)
{
	dblk_t *dbp;
	mblk_t *mp;
	size_t index;

	/* Map the requested size onto a dblk cache index. */
	index = (size -1) >> DBLK_SIZE_SHIFT;

	if (flags & STR_NOSIG) {
		/* Uninterruptible: sleep in kmem until memory is available. */
		if (index >= (DBLK_MAX_CACHE >> DBLK_SIZE_SHIFT)) {
			if (size != 0) {
				/* Too big for any cache: oversize path. */
				mp = allocb_oversize(size, KM_SLEEP);
				FTRACE_1("allocb_wait (NOSIG): mp=0x%lx",
				    (uintptr_t)mp);
				return (mp);
			}
			/* size == 0 falls back to the smallest cache. */
			index = 0;
		}

		dbp = kmem_cache_alloc(dblk_cache[index], KM_SLEEP);
		/* Initialize the paired mblk as an empty M_DATA block. */
		mp = dbp->db_mblk;
		DBLK_RTFU_WORD(dbp) = DBLK_RTFU(1, M_DATA, 0, 0);
		mp->b_next = mp->b_prev = mp->b_cont = NULL;
		mp->b_rptr = mp->b_wptr = dbp->db_base;
		mp->b_queue = NULL;
		MBLK_BAND_FLAG_WORD(mp) = 0;
		STR_FTALLOC(&DB_FTHDR(mp), FTEV_ALLOCBW, size);

		FTRACE_1("allocb_wait (NOSIG): mp=0x%lx", (uintptr_t)mp);

	} else {
		/* Interruptible: retry until success or strwaitbuf() fails. */
		while ((mp = allocb(size, pri)) == NULL) {
			if ((*error = strwaitbuf(size, BPRI_HI)) != 0)
				return (NULL);
		}
	}

	return (mp);
}
12080Sstevel@tonic-gate 
12090Sstevel@tonic-gate /*
12100Sstevel@tonic-gate  * Call function 'func' with 'arg' when a class zero block can
12110Sstevel@tonic-gate  * be allocated with priority 'pri'.
12120Sstevel@tonic-gate  */
bufcall_id_t
esbbcall(uint_t pri, void (*func)(void *), void *arg)
{
	/* A "class zero" request is expressed as a one-byte bufcall. */
	return (bufcall(1, pri, func, arg));
}
12180Sstevel@tonic-gate 
12190Sstevel@tonic-gate /*
12200Sstevel@tonic-gate  * Allocates an iocblk (M_IOCTL) block. Properly sets the credentials
12210Sstevel@tonic-gate  * ioc_id, rval and error of the struct ioctl to set up an ioctl call.
12220Sstevel@tonic-gate  * This provides consistency for all internal allocators of ioctl.
12230Sstevel@tonic-gate  */
mblk_t *
mkiocb(uint_t cmd)
{
	struct iocblk	*ioc;
	mblk_t		*mp;

	/*
	 * Allocate enough space for any of the ioctl related messages.
	 */
	if ((mp = allocb(sizeof (union ioctypes), BPRI_MED)) == NULL)
		return (NULL);

	/* Start from a fully zeroed block so unused fields read as 0. */
	bzero(mp->b_rptr, sizeof (union ioctypes));

	/*
	 * Set the mblk_t information and ptrs correctly.
	 */
	mp->b_wptr += sizeof (struct iocblk);
	mp->b_datap->db_type = M_IOCTL;

	/*
	 * Fill in the fields.
	 */
	ioc		= (struct iocblk *)mp->b_rptr;
	ioc->ioc_cmd	= cmd;
	ioc->ioc_cr	= kcred;	/* internal ioctls carry kcred */
	ioc->ioc_id	= getiocseqno();	/* fresh sequence number */
	ioc->ioc_flag	= IOC_NATIVE;
	return (mp);
}
12540Sstevel@tonic-gate 
12550Sstevel@tonic-gate /*
12560Sstevel@tonic-gate  * test if block of given size can be allocated with a request of
12570Sstevel@tonic-gate  * the given priority.
12580Sstevel@tonic-gate  * 'pri' is no longer used, but is retained for compatibility.
12590Sstevel@tonic-gate  */
/* ARGSUSED */
int
testb(size_t size, uint_t pri)
{
	/* Heuristic only: checks currently available kernel memory. */
	return ((size + sizeof (dblk_t)) <= kmem_avail());
}
12660Sstevel@tonic-gate 
12670Sstevel@tonic-gate /*
12680Sstevel@tonic-gate  * Call function 'func' with argument 'arg' when there is a reasonably
12690Sstevel@tonic-gate  * good chance that a block of size 'size' can be allocated.
12700Sstevel@tonic-gate  * 'pri' is no longer used, but is retained for compatibility.
12710Sstevel@tonic-gate  */
/* ARGSUSED */
bufcall_id_t
bufcall(size_t size, uint_t pri, void (*func)(void *), void *arg)
{
	static long bid = 1;	/* always odd to save checking for zero */
	bufcall_id_t bc_id;
	struct strbufcall *bcp;

	/* Returns 0 on allocation failure; real ids are nonzero (odd). */
	if ((bcp = kmem_alloc(sizeof (strbufcall_t), KM_NOSLEEP)) == NULL)
		return (0);

	bcp->bc_func = func;
	bcp->bc_arg = arg;
	bcp->bc_size = size;
	bcp->bc_next = NULL;
	bcp->bc_executor = NULL;

	mutex_enter(&strbcall_lock);
	/*
	 * After bcp is linked into strbcalls and strbcall_lock is dropped there
	 * should be no references to bcp since it may be freed by
	 * runbufcalls(). Since bcp_id field is returned, we save its value in
	 * the local var.
	 */
	bc_id = bcp->bc_id = (bufcall_id_t)(bid += 2);	/* keep it odd */

	/*
	 * add newly allocated stream event to existing
	 * linked list of events.
	 */
	if (strbcalls.bc_head == NULL) {
		strbcalls.bc_head = strbcalls.bc_tail = bcp;
	} else {
		strbcalls.bc_tail->bc_next = bcp;
		strbcalls.bc_tail = bcp;
	}

	/* Wake whoever services the bufcall list. */
	cv_signal(&strbcall_cv);
	mutex_exit(&strbcall_lock);
	return (bc_id);
}
13130Sstevel@tonic-gate 
/*
 * Cancel a bufcall request.
 *
 * If the entry's callback is currently running on another thread
 * (bc_executor set and not curthread), wait for it to finish and
 * re-scan the list, since the entry may have been freed while we
 * slept.  If bc_executor == curthread, we are being called from
 * inside the callback itself and do nothing (the executor owns the
 * entry's cleanup).  Otherwise the entry is unlinked and freed here.
 */
void
unbufcall(bufcall_id_t id)
{
	strbufcall_t *bcp, *pbcp;

	mutex_enter(&strbcall_lock);
again:
	pbcp = NULL;
	/* Find the entry with a matching id, tracking its predecessor. */
	for (bcp = strbcalls.bc_head; bcp; bcp = bcp->bc_next) {
		if (id == bcp->bc_id)
			break;
		pbcp = bcp;
	}
	if (bcp) {
		if (bcp->bc_executor != NULL) {
			if (bcp->bc_executor != curthread) {
				/*
				 * Callback in progress elsewhere: block
				 * until it completes, then start the
				 * scan over from the list head.
				 */
				cv_wait(&bcall_cv, &strbcall_lock);
				goto again;
			}
		} else {
			/* Unlink from the singly-linked list and free. */
			if (pbcp)
				pbcp->bc_next = bcp->bc_next;
			else
				strbcalls.bc_head = bcp->bc_next;
			if (bcp == strbcalls.bc_tail)
				strbcalls.bc_tail = pbcp;
			kmem_free(bcp, sizeof (strbufcall_t));
		}
	}
	mutex_exit(&strbcall_lock);
}
13480Sstevel@tonic-gate 
13490Sstevel@tonic-gate /*
13500Sstevel@tonic-gate  * Duplicate a message block by block (uses dupb), returning
13510Sstevel@tonic-gate  * a pointer to the duplicate message.
13520Sstevel@tonic-gate  * Returns a non-NULL value only if the entire message
13530Sstevel@tonic-gate  * was dup'd.
13540Sstevel@tonic-gate  */
13550Sstevel@tonic-gate mblk_t *
13560Sstevel@tonic-gate dupmsg(mblk_t *bp)
13570Sstevel@tonic-gate {
13580Sstevel@tonic-gate 	mblk_t *head, *nbp;
13590Sstevel@tonic-gate 
13600Sstevel@tonic-gate 	if (!bp || !(nbp = head = dupb(bp)))
13610Sstevel@tonic-gate 		return (NULL);
13620Sstevel@tonic-gate 
13630Sstevel@tonic-gate 	while (bp->b_cont) {
13640Sstevel@tonic-gate 		if (!(nbp->b_cont = dupb(bp->b_cont))) {
13650Sstevel@tonic-gate 			freemsg(head);
13660Sstevel@tonic-gate 			return (NULL);
13670Sstevel@tonic-gate 		}
13680Sstevel@tonic-gate 		nbp = nbp->b_cont;
13690Sstevel@tonic-gate 		bp = bp->b_cont;
13700Sstevel@tonic-gate 	}
13710Sstevel@tonic-gate 	return (head);
13720Sstevel@tonic-gate }
13730Sstevel@tonic-gate 
13740Sstevel@tonic-gate #define	DUPB_NOLOAN(bp) \
13750Sstevel@tonic-gate 	((((bp)->b_datap->db_struioflag & STRUIO_ZC) != 0) ? \
13760Sstevel@tonic-gate 	copyb((bp)) : dupb((bp)))
13770Sstevel@tonic-gate 
13780Sstevel@tonic-gate mblk_t *
13790Sstevel@tonic-gate dupmsg_noloan(mblk_t *bp)
13800Sstevel@tonic-gate {
13810Sstevel@tonic-gate 	mblk_t *head, *nbp;
13820Sstevel@tonic-gate 
13830Sstevel@tonic-gate 	if (bp == NULL || DB_TYPE(bp) != M_DATA ||
13840Sstevel@tonic-gate 	    ((nbp = head = DUPB_NOLOAN(bp)) == NULL))
13850Sstevel@tonic-gate 		return (NULL);
13860Sstevel@tonic-gate 
13870Sstevel@tonic-gate 	while (bp->b_cont) {
13880Sstevel@tonic-gate 		if ((nbp->b_cont = DUPB_NOLOAN(bp->b_cont)) == NULL) {
13890Sstevel@tonic-gate 			freemsg(head);
13900Sstevel@tonic-gate 			return (NULL);
13910Sstevel@tonic-gate 		}
13920Sstevel@tonic-gate 		nbp = nbp->b_cont;
13930Sstevel@tonic-gate 		bp = bp->b_cont;
13940Sstevel@tonic-gate 	}
13950Sstevel@tonic-gate 	return (head);
13960Sstevel@tonic-gate }
13970Sstevel@tonic-gate 
13980Sstevel@tonic-gate /*
13990Sstevel@tonic-gate  * Copy data from message and data block to newly allocated message and
14000Sstevel@tonic-gate  * data block. Returns new message block pointer, or NULL if error.
14010Sstevel@tonic-gate  * The alignment of rptr (w.r.t. word alignment) will be the same in the copy
14020Sstevel@tonic-gate  * as in the original even when db_base is not word aligned. (bug 1052877)
14030Sstevel@tonic-gate  */
14040Sstevel@tonic-gate mblk_t *
14050Sstevel@tonic-gate copyb(mblk_t *bp)
14060Sstevel@tonic-gate {
14070Sstevel@tonic-gate 	mblk_t	*nbp;
14080Sstevel@tonic-gate 	dblk_t	*dp, *ndp;
14090Sstevel@tonic-gate 	uchar_t *base;
14100Sstevel@tonic-gate 	size_t	size;
14110Sstevel@tonic-gate 	size_t	unaligned;
14120Sstevel@tonic-gate 
14130Sstevel@tonic-gate 	ASSERT(bp->b_wptr >= bp->b_rptr);
14140Sstevel@tonic-gate 
14150Sstevel@tonic-gate 	dp = bp->b_datap;
14160Sstevel@tonic-gate 	if (dp->db_fthdr != NULL)
14170Sstevel@tonic-gate 		STR_FTEVENT_MBLK(bp, caller(), FTEV_COPYB, 0);
14180Sstevel@tonic-gate 
14190Sstevel@tonic-gate 	/*
14200Sstevel@tonic-gate 	 * Special handling for Multidata message; this should be
14210Sstevel@tonic-gate 	 * removed once a copy-callback routine is made available.
14220Sstevel@tonic-gate 	 */
14230Sstevel@tonic-gate 	if (dp->db_type == M_MULTIDATA) {
14240Sstevel@tonic-gate 		cred_t *cr;
14250Sstevel@tonic-gate 
14260Sstevel@tonic-gate 		if ((nbp = mmd_copy(bp, KM_NOSLEEP)) == NULL)
14270Sstevel@tonic-gate 			return (NULL);
14280Sstevel@tonic-gate 
14290Sstevel@tonic-gate 		nbp->b_flag = bp->b_flag;
14300Sstevel@tonic-gate 		nbp->b_band = bp->b_band;
14310Sstevel@tonic-gate 		ndp = nbp->b_datap;
14320Sstevel@tonic-gate 
14330Sstevel@tonic-gate 		/* See comments below on potential issues. */
14340Sstevel@tonic-gate 		STR_FTEVENT_MBLK(nbp, caller(), FTEV_COPYB, 1);
14350Sstevel@tonic-gate 
14360Sstevel@tonic-gate 		ASSERT(ndp->db_type == dp->db_type);
14370Sstevel@tonic-gate 		cr = dp->db_credp;
14380Sstevel@tonic-gate 		if (cr != NULL)
14390Sstevel@tonic-gate 			crhold(ndp->db_credp = cr);
14400Sstevel@tonic-gate 		ndp->db_cpid = dp->db_cpid;
14410Sstevel@tonic-gate 		return (nbp);
14420Sstevel@tonic-gate 	}
14430Sstevel@tonic-gate 
14440Sstevel@tonic-gate 	size = dp->db_lim - dp->db_base;
14450Sstevel@tonic-gate 	unaligned = P2PHASE((uintptr_t)dp->db_base, sizeof (uint_t));
14460Sstevel@tonic-gate 	if ((nbp = allocb_tmpl(size + unaligned, bp)) == NULL)
14470Sstevel@tonic-gate 		return (NULL);
14480Sstevel@tonic-gate 	nbp->b_flag = bp->b_flag;
14490Sstevel@tonic-gate 	nbp->b_band = bp->b_band;
14500Sstevel@tonic-gate 	ndp = nbp->b_datap;
14510Sstevel@tonic-gate 
14520Sstevel@tonic-gate 	/*
14530Sstevel@tonic-gate 	 * Well, here is a potential issue.  If we are trying to
14540Sstevel@tonic-gate 	 * trace a flow, and we copy the message, we might lose
14550Sstevel@tonic-gate 	 * information about where this message might have been.
14560Sstevel@tonic-gate 	 * So we should inherit the FT data.  On the other hand,
14570Sstevel@tonic-gate 	 * a user might be interested only in alloc to free data.
14580Sstevel@tonic-gate 	 * So I guess the real answer is to provide a tunable.
14590Sstevel@tonic-gate 	 */
14600Sstevel@tonic-gate 	STR_FTEVENT_MBLK(nbp, caller(), FTEV_COPYB, 1);
14610Sstevel@tonic-gate 
14620Sstevel@tonic-gate 	base = ndp->db_base + unaligned;
14630Sstevel@tonic-gate 	bcopy(dp->db_base, ndp->db_base + unaligned, size);
14640Sstevel@tonic-gate 
14650Sstevel@tonic-gate 	nbp->b_rptr = base + (bp->b_rptr - dp->db_base);
14660Sstevel@tonic-gate 	nbp->b_wptr = nbp->b_rptr + MBLKL(bp);
14670Sstevel@tonic-gate 
14680Sstevel@tonic-gate 	return (nbp);
14690Sstevel@tonic-gate }
14700Sstevel@tonic-gate 
14710Sstevel@tonic-gate /*
14720Sstevel@tonic-gate  * Copy data from message to newly allocated message using new
14730Sstevel@tonic-gate  * data blocks.  Returns a pointer to the new message, or NULL if error.
14740Sstevel@tonic-gate  */
14750Sstevel@tonic-gate mblk_t *
14760Sstevel@tonic-gate copymsg(mblk_t *bp)
14770Sstevel@tonic-gate {
14780Sstevel@tonic-gate 	mblk_t *head, *nbp;
14790Sstevel@tonic-gate 
14800Sstevel@tonic-gate 	if (!bp || !(nbp = head = copyb(bp)))
14810Sstevel@tonic-gate 		return (NULL);
14820Sstevel@tonic-gate 
14830Sstevel@tonic-gate 	while (bp->b_cont) {
14840Sstevel@tonic-gate 		if (!(nbp->b_cont = copyb(bp->b_cont))) {
14850Sstevel@tonic-gate 			freemsg(head);
14860Sstevel@tonic-gate 			return (NULL);
14870Sstevel@tonic-gate 		}
14880Sstevel@tonic-gate 		nbp = nbp->b_cont;
14890Sstevel@tonic-gate 		bp = bp->b_cont;
14900Sstevel@tonic-gate 	}
14910Sstevel@tonic-gate 	return (head);
14920Sstevel@tonic-gate }
14930Sstevel@tonic-gate 
14940Sstevel@tonic-gate /*
14950Sstevel@tonic-gate  * link a message block to tail of message
14960Sstevel@tonic-gate  */
14970Sstevel@tonic-gate void
14980Sstevel@tonic-gate linkb(mblk_t *mp, mblk_t *bp)
14990Sstevel@tonic-gate {
15000Sstevel@tonic-gate 	ASSERT(mp && bp);
15010Sstevel@tonic-gate 
15020Sstevel@tonic-gate 	for (; mp->b_cont; mp = mp->b_cont)
15030Sstevel@tonic-gate 		;
15040Sstevel@tonic-gate 	mp->b_cont = bp;
15050Sstevel@tonic-gate }
15060Sstevel@tonic-gate 
15070Sstevel@tonic-gate /*
15080Sstevel@tonic-gate  * unlink a message block from head of message
15090Sstevel@tonic-gate  * return pointer to new message.
15100Sstevel@tonic-gate  * NULL if message becomes empty.
15110Sstevel@tonic-gate  */
15120Sstevel@tonic-gate mblk_t *
15130Sstevel@tonic-gate unlinkb(mblk_t *bp)
15140Sstevel@tonic-gate {
15150Sstevel@tonic-gate 	mblk_t *bp1;
15160Sstevel@tonic-gate 
15170Sstevel@tonic-gate 	bp1 = bp->b_cont;
15180Sstevel@tonic-gate 	bp->b_cont = NULL;
15190Sstevel@tonic-gate 	return (bp1);
15200Sstevel@tonic-gate }
15210Sstevel@tonic-gate 
15220Sstevel@tonic-gate /*
15230Sstevel@tonic-gate  * remove a message block "bp" from message "mp"
15240Sstevel@tonic-gate  *
15250Sstevel@tonic-gate  * Return pointer to new message or NULL if no message remains.
15260Sstevel@tonic-gate  * Return -1 if bp is not found in message.
15270Sstevel@tonic-gate  */
15280Sstevel@tonic-gate mblk_t *
15290Sstevel@tonic-gate rmvb(mblk_t *mp, mblk_t *bp)
15300Sstevel@tonic-gate {
15310Sstevel@tonic-gate 	mblk_t *tmp;
15320Sstevel@tonic-gate 	mblk_t *lastp = NULL;
15330Sstevel@tonic-gate 
15340Sstevel@tonic-gate 	ASSERT(mp && bp);
15350Sstevel@tonic-gate 	for (tmp = mp; tmp; tmp = tmp->b_cont) {
15360Sstevel@tonic-gate 		if (tmp == bp) {
15370Sstevel@tonic-gate 			if (lastp)
15380Sstevel@tonic-gate 				lastp->b_cont = tmp->b_cont;
15390Sstevel@tonic-gate 			else
15400Sstevel@tonic-gate 				mp = tmp->b_cont;
15410Sstevel@tonic-gate 			tmp->b_cont = NULL;
15420Sstevel@tonic-gate 			return (mp);
15430Sstevel@tonic-gate 		}
15440Sstevel@tonic-gate 		lastp = tmp;
15450Sstevel@tonic-gate 	}
15460Sstevel@tonic-gate 	return ((mblk_t *)-1);
15470Sstevel@tonic-gate }
15480Sstevel@tonic-gate 
/*
 * Concatenate and align first len bytes of common
 * message type.  Len == -1, means concat everything.
 * Returns 1 on success, 0 on failure
 * After the pullup, mp points to the pulled up data.
 *
 * Works by allocating one new data block large enough for the pulled
 * up bytes, swapping it with mp's current dblk so the caller's mblk
 * pointer stays valid, then copying the old chain into it.
 */
int
pullupmsg(mblk_t *mp, ssize_t len)
{
	mblk_t *bp, *b_cont;
	dblk_t *dbp;
	ssize_t n;

	ASSERT(mp->b_datap->db_ref > 0);
	ASSERT(mp->b_next == NULL && mp->b_prev == NULL);

	/*
	 * We won't handle Multidata message, since it contains
	 * metadata which this function has no knowledge of; we
	 * assert on DEBUG, and return failure otherwise.
	 */
	ASSERT(mp->b_datap->db_type != M_MULTIDATA);
	if (mp->b_datap->db_type == M_MULTIDATA)
		return (0);

	if (len == -1) {
		/* Already a single, aligned block: nothing to do. */
		if (mp->b_cont == NULL && str_aligned(mp->b_rptr))
			return (1);
		len = xmsgsize(mp);
	} else {
		ssize_t first_mblk_len = mp->b_wptr - mp->b_rptr;
		ASSERT(first_mblk_len >= 0);
		/*
		 * If the length is less than that of the first mblk,
		 * we want to pull up the message into an aligned mblk.
		 * Though not part of the spec, some callers assume it.
		 */
		if (len <= first_mblk_len) {
			if (str_aligned(mp->b_rptr))
				return (1);
			len = first_mblk_len;
		} else if (xmsgsize(mp) < len)
			return (0);
	}

	/* Destination block that will hold all the pulled-up bytes. */
	if ((bp = allocb_tmpl(len, mp)) == NULL)
		return (0);

	/*
	 * Exchange dblk's so that 'mp' (the caller's pointer) heads the
	 * new, empty data block while 'bp' takes over the old contents.
	 */
	dbp = bp->b_datap;
	*bp = *mp;		/* swap mblks so bp heads the old msg... */
	mp->b_datap = dbp;	/* ... and mp heads the new message */
	mp->b_datap->db_mblk = mp;
	bp->b_datap->db_mblk = bp;
	mp->b_rptr = mp->b_wptr = dbp->db_base;

	/*
	 * Copy up to 'len' bytes from the old chain into 'mp', freeing
	 * each source block as it is fully consumed.
	 */
	do {
		ASSERT(bp->b_datap->db_ref > 0);
		ASSERT(bp->b_wptr >= bp->b_rptr);
		n = MIN(bp->b_wptr - bp->b_rptr, len);
		bcopy(bp->b_rptr, mp->b_wptr, (size_t)n);
		mp->b_wptr += n;
		bp->b_rptr += n;
		len -= n;
		/* A partially-consumed block stays on the chain below. */
		if (bp->b_rptr != bp->b_wptr)
			break;
		b_cont = bp->b_cont;
		freeb(bp);
		bp = b_cont;
	} while (len && bp);

	mp->b_cont = bp;	/* tack on whatever wasn't pulled up */

	return (1);
}
16230Sstevel@tonic-gate 
16240Sstevel@tonic-gate /*
16250Sstevel@tonic-gate  * Concatenate and align at least the first len bytes of common message
16260Sstevel@tonic-gate  * type.  Len == -1 means concatenate everything.  The original message is
16270Sstevel@tonic-gate  * unaltered.  Returns a pointer to a new message on success, otherwise
16280Sstevel@tonic-gate  * returns NULL.
16290Sstevel@tonic-gate  */
16300Sstevel@tonic-gate mblk_t *
16310Sstevel@tonic-gate msgpullup(mblk_t *mp, ssize_t len)
16320Sstevel@tonic-gate {
16330Sstevel@tonic-gate 	mblk_t	*newmp;
16340Sstevel@tonic-gate 	ssize_t	totlen;
16350Sstevel@tonic-gate 	ssize_t	n;
16360Sstevel@tonic-gate 
16370Sstevel@tonic-gate 	/*
16380Sstevel@tonic-gate 	 * We won't handle Multidata message, since it contains
16390Sstevel@tonic-gate 	 * metadata which this function has no knowledge of; we
16400Sstevel@tonic-gate 	 * assert on DEBUG, and return failure otherwise.
16410Sstevel@tonic-gate 	 */
16420Sstevel@tonic-gate 	ASSERT(mp->b_datap->db_type != M_MULTIDATA);
16430Sstevel@tonic-gate 	if (mp->b_datap->db_type == M_MULTIDATA)
16440Sstevel@tonic-gate 		return (NULL);
16450Sstevel@tonic-gate 
16460Sstevel@tonic-gate 	totlen = xmsgsize(mp);
16470Sstevel@tonic-gate 
16480Sstevel@tonic-gate 	if ((len > 0) && (len > totlen))
16490Sstevel@tonic-gate 		return (NULL);
16500Sstevel@tonic-gate 
16510Sstevel@tonic-gate 	/*
16520Sstevel@tonic-gate 	 * Copy all of the first msg type into one new mblk, then dupmsg
16530Sstevel@tonic-gate 	 * and link the rest onto this.
16540Sstevel@tonic-gate 	 */
16550Sstevel@tonic-gate 
16560Sstevel@tonic-gate 	len = totlen;
16570Sstevel@tonic-gate 
16580Sstevel@tonic-gate 	if ((newmp = allocb_tmpl(len, mp)) == NULL)
16590Sstevel@tonic-gate 		return (NULL);
16600Sstevel@tonic-gate 
16610Sstevel@tonic-gate 	newmp->b_flag = mp->b_flag;
16620Sstevel@tonic-gate 	newmp->b_band = mp->b_band;
16630Sstevel@tonic-gate 
16640Sstevel@tonic-gate 	while (len > 0) {
16650Sstevel@tonic-gate 		n = mp->b_wptr - mp->b_rptr;
16660Sstevel@tonic-gate 		ASSERT(n >= 0);		/* allow zero-length mblk_t's */
16670Sstevel@tonic-gate 		if (n > 0)
16680Sstevel@tonic-gate 			bcopy(mp->b_rptr, newmp->b_wptr, n);
16690Sstevel@tonic-gate 		newmp->b_wptr += n;
16700Sstevel@tonic-gate 		len -= n;
16710Sstevel@tonic-gate 		mp = mp->b_cont;
16720Sstevel@tonic-gate 	}
16730Sstevel@tonic-gate 
16740Sstevel@tonic-gate 	if (mp != NULL) {
16750Sstevel@tonic-gate 		newmp->b_cont = dupmsg(mp);
16760Sstevel@tonic-gate 		if (newmp->b_cont == NULL) {
16770Sstevel@tonic-gate 			freemsg(newmp);
16780Sstevel@tonic-gate 			return (NULL);
16790Sstevel@tonic-gate 		}
16800Sstevel@tonic-gate 	}
16810Sstevel@tonic-gate 
16820Sstevel@tonic-gate 	return (newmp);
16830Sstevel@tonic-gate }
16840Sstevel@tonic-gate 
/*
 * Trim bytes from message
 *  len > 0, trim from head
 *  len < 0, trim from tail
 * Returns 1 on success, 0 on failure.
 *
 * Tail trimming only removes bytes from the trailing run of blocks
 * that share the first block's db_type; head trimming consumes bytes
 * from the front, freeing any block (other than the first) that
 * becomes empty.
 */
int
adjmsg(mblk_t *mp, ssize_t len)
{
	mblk_t *bp;
	mblk_t *save_bp = NULL;
	mblk_t *prev_bp;
	mblk_t *bcont;
	unsigned char type;
	ssize_t n;
	int fromhead;
	int first;

	ASSERT(mp != NULL);
	/*
	 * We won't handle Multidata message, since it contains
	 * metadata which this function has no knowledge of; we
	 * assert on DEBUG, and return failure otherwise.
	 */
	ASSERT(mp->b_datap->db_type != M_MULTIDATA);
	if (mp->b_datap->db_type == M_MULTIDATA)
		return (0);

	if (len < 0) {
		fromhead = 0;
		len = -len;
	} else {
		fromhead = 1;
	}

	/* Can't trim more bytes than the whole message contains. */
	if (xmsgsize(mp) < len)
		return (0);

	if (fromhead) {
		first = 1;
		while (len) {
			ASSERT(mp->b_wptr >= mp->b_rptr);
			n = MIN(mp->b_wptr - mp->b_rptr, len);
			mp->b_rptr += n;
			len -= n;

			/*
			 * If this is not the first zero length
			 * message remove it
			 */
			if (!first && (mp->b_wptr == mp->b_rptr)) {
				bcont = mp->b_cont;
				freeb(mp);
				mp = save_bp->b_cont = bcont;
			} else {
				save_bp = mp;
				mp = mp->b_cont;
			}
			first = 0;
		}
	} else {
		type = mp->b_datap->db_type;
		while (len) {
			bp = mp;
			save_bp = NULL;

			/*
			 * Find the last message of same type
			 */
			while (bp && bp->b_datap->db_type == type) {
				ASSERT(bp->b_wptr >= bp->b_rptr);
				/* prev_bp trails save_bp by one block */
				prev_bp = save_bp;
				save_bp = bp;
				bp = bp->b_cont;
			}
			if (save_bp == NULL)
				break;
			n = MIN(save_bp->b_wptr - save_bp->b_rptr, len);
			save_bp->b_wptr -= n;
			len -= n;

			/*
			 * If this is not the first message
			 * and we have taken away everything
			 * from this message, remove it
			 */

			/*
			 * save_bp != mp implies the inner loop ran at
			 * least twice, so prev_bp is initialized here.
			 */
			if ((save_bp != mp) &&
			    (save_bp->b_wptr == save_bp->b_rptr)) {
				bcont = save_bp->b_cont;
				freeb(save_bp);
				prev_bp->b_cont = bcont;
			}
		}
	}
	return (1);
}
17820Sstevel@tonic-gate 
17830Sstevel@tonic-gate /*
17840Sstevel@tonic-gate  * get number of data bytes in message
17850Sstevel@tonic-gate  */
17860Sstevel@tonic-gate size_t
17870Sstevel@tonic-gate msgdsize(mblk_t *bp)
17880Sstevel@tonic-gate {
17890Sstevel@tonic-gate 	size_t count = 0;
17900Sstevel@tonic-gate 
17910Sstevel@tonic-gate 	for (; bp; bp = bp->b_cont)
17920Sstevel@tonic-gate 		if (bp->b_datap->db_type == M_DATA) {
17930Sstevel@tonic-gate 			ASSERT(bp->b_wptr >= bp->b_rptr);
17940Sstevel@tonic-gate 			count += bp->b_wptr - bp->b_rptr;
17950Sstevel@tonic-gate 		}
17960Sstevel@tonic-gate 	return (count);
17970Sstevel@tonic-gate }
17980Sstevel@tonic-gate 
17990Sstevel@tonic-gate /*
18000Sstevel@tonic-gate  * Get a message off head of queue
18010Sstevel@tonic-gate  *
18020Sstevel@tonic-gate  * If queue has no buffers then mark queue
18030Sstevel@tonic-gate  * with QWANTR. (queue wants to be read by
18040Sstevel@tonic-gate  * someone when data becomes available)
18050Sstevel@tonic-gate  *
18060Sstevel@tonic-gate  * If there is something to take off then do so.
18070Sstevel@tonic-gate  * If queue falls below hi water mark turn off QFULL
18080Sstevel@tonic-gate  * flag.  Decrement weighted count of queue.
18090Sstevel@tonic-gate  * Also turn off QWANTR because queue is being read.
18100Sstevel@tonic-gate  *
18110Sstevel@tonic-gate  * The queue count is maintained on a per-band basis.
18120Sstevel@tonic-gate  * Priority band 0 (normal messages) uses q_count,
18130Sstevel@tonic-gate  * q_lowat, etc.  Non-zero priority bands use the
18140Sstevel@tonic-gate  * fields in their respective qband structures
18150Sstevel@tonic-gate  * (qb_count, qb_lowat, etc.)  All messages appear
18160Sstevel@tonic-gate  * on the same list, linked via their b_next pointers.
18170Sstevel@tonic-gate  * q_first is the head of the list.  q_count does
18180Sstevel@tonic-gate  * not reflect the size of all the messages on the
18190Sstevel@tonic-gate  * queue.  It only reflects those messages in the
18200Sstevel@tonic-gate  * normal band of flow.  The one exception to this
18210Sstevel@tonic-gate  * deals with high priority messages.  They are in
18220Sstevel@tonic-gate  * their own conceptual "band", but are accounted
18230Sstevel@tonic-gate  * against q_count.
18240Sstevel@tonic-gate  *
18250Sstevel@tonic-gate  * If queue count is below the lo water mark and QWANTW
18260Sstevel@tonic-gate  * is set, enable the closest backq which has a service
18270Sstevel@tonic-gate  * procedure and turn off the QWANTW flag.
18280Sstevel@tonic-gate  *
18290Sstevel@tonic-gate  * getq could be built on top of rmvq, but isn't because
18300Sstevel@tonic-gate  * of performance considerations.
18310Sstevel@tonic-gate  *
18320Sstevel@tonic-gate  * A note on the use of q_count and q_mblkcnt:
18330Sstevel@tonic-gate  *   q_count is the traditional byte count for messages that
18340Sstevel@tonic-gate  *   have been put on a queue.  Documentation tells us that
18350Sstevel@tonic-gate  *   we shouldn't rely on that count, but some drivers/modules
18360Sstevel@tonic-gate  *   do.  What was needed, however, is a mechanism to prevent
18370Sstevel@tonic-gate  *   runaway streams from consuming all of the resources,
18380Sstevel@tonic-gate  *   and particularly be able to flow control zero-length
18390Sstevel@tonic-gate  *   messages.  q_mblkcnt is used for this purpose.  It
18400Sstevel@tonic-gate  *   counts the number of mblk's that are being put on
18410Sstevel@tonic-gate  *   the queue.  The intention here, is that each mblk should
18420Sstevel@tonic-gate  *   contain one byte of data and, for the purpose of
18430Sstevel@tonic-gate  *   flow-control, logically does.  A queue will become
18440Sstevel@tonic-gate  *   full when EITHER of these values (q_count and q_mblkcnt)
18450Sstevel@tonic-gate  *   reach the highwater mark.  It will clear when BOTH
18460Sstevel@tonic-gate  *   of them drop below the highwater mark.  And it will
18470Sstevel@tonic-gate  *   backenable when BOTH of them drop below the lowwater
18480Sstevel@tonic-gate  *   mark.
18490Sstevel@tonic-gate  *   With this algorithm, a driver/module might be able
18500Sstevel@tonic-gate  *   to find a reasonably accurate q_count, and the
18510Sstevel@tonic-gate  *   framework can still try and limit resource usage.
18520Sstevel@tonic-gate  */
18530Sstevel@tonic-gate mblk_t *
18540Sstevel@tonic-gate getq(queue_t *q)
18550Sstevel@tonic-gate {
18560Sstevel@tonic-gate 	mblk_t *bp;
1857235Smicheng 	uchar_t band = 0;
18580Sstevel@tonic-gate 
18596769Sja97890 	bp = getq_noenab(q, 0);
18600Sstevel@tonic-gate 	if (bp != NULL)
18610Sstevel@tonic-gate 		band = bp->b_band;
18620Sstevel@tonic-gate 
18630Sstevel@tonic-gate 	/*
18640Sstevel@tonic-gate 	 * Inlined from qbackenable().
18650Sstevel@tonic-gate 	 * Quick check without holding the lock.
18660Sstevel@tonic-gate 	 */
18670Sstevel@tonic-gate 	if (band == 0 && (q->q_flag & (QWANTW|QWANTWSYNC)) == 0)
18680Sstevel@tonic-gate 		return (bp);
18690Sstevel@tonic-gate 
18700Sstevel@tonic-gate 	qbackenable(q, band);
18710Sstevel@tonic-gate 	return (bp);
18720Sstevel@tonic-gate }
18730Sstevel@tonic-gate 
/*
 * Calculate number of data bytes in a single data message block taking
 * multidata messages into account.
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one
 * statement and is safe to use in unbraced if/else bodies; the bare
 * if/else form would otherwise break such call sites.
 */

#define	ADD_MBLK_SIZE(mp, size)						\
	do {								\
		if (DB_TYPE(mp) != M_MULTIDATA) {			\
			(size) += MBLKL(mp);				\
		} else {						\
			uint_t	pinuse;					\
									\
			mmd_getsize(mmd_getmultidata(mp), NULL, &pinuse); \
			(size) += pinuse;				\
		}							\
	} while (0)
1888741Smasputra 
1889741Smasputra /*
18906769Sja97890  * Returns the number of bytes in a message (a message is defined as a
18916769Sja97890  * chain of mblks linked by b_cont). If a non-NULL mblkcnt is supplied we
18926769Sja97890  * also return the number of distinct mblks in the message.
18936769Sja97890  */
18946769Sja97890 int
18956769Sja97890 mp_cont_len(mblk_t *bp, int *mblkcnt)
18966769Sja97890 {
18976769Sja97890 	mblk_t	*mp;
18986769Sja97890 	int	mblks = 0;
18996769Sja97890 	int	bytes = 0;
19006769Sja97890 
19016769Sja97890 	for (mp = bp; mp != NULL; mp = mp->b_cont) {
19026769Sja97890 		ADD_MBLK_SIZE(mp, bytes);
19036769Sja97890 		mblks++;
19046769Sja97890 	}
19056769Sja97890 
19066769Sja97890 	if (mblkcnt != NULL)
19076769Sja97890 		*mblkcnt = mblks;
19086769Sja97890 
19096769Sja97890 	return (bytes);
19106769Sja97890 }
19116769Sja97890 
19126769Sja97890 /*
19130Sstevel@tonic-gate  * Like getq() but does not backenable.  This is used by the stream
19140Sstevel@tonic-gate  * head when a putback() is likely.  The caller must call qbackenable()
19150Sstevel@tonic-gate  * after it is done with accessing the queue.
19166769Sja97890  * The rbytes arguments to getq_noneab() allows callers to specify a
19176769Sja97890  * the maximum number of bytes to return. If the current amount on the
19186769Sja97890  * queue is less than this then the entire message will be returned.
19196769Sja97890  * A value of 0 returns the entire message and is equivalent to the old
19206769Sja97890  * default behaviour prior to the addition of the rbytes argument.
19210Sstevel@tonic-gate  */
19220Sstevel@tonic-gate mblk_t *
19236769Sja97890 getq_noenab(queue_t *q, ssize_t rbytes)
19240Sstevel@tonic-gate {
19256769Sja97890 	mblk_t *bp, *mp1;
19266769Sja97890 	mblk_t *mp2 = NULL;
19270Sstevel@tonic-gate 	qband_t *qbp;
19280Sstevel@tonic-gate 	kthread_id_t freezer;
19290Sstevel@tonic-gate 	int	bytecnt = 0, mblkcnt = 0;
19300Sstevel@tonic-gate 
19310Sstevel@tonic-gate 	/* freezestr should allow its caller to call getq/putq */
19320Sstevel@tonic-gate 	freezer = STREAM(q)->sd_freezer;
19330Sstevel@tonic-gate 	if (freezer == curthread) {
19340Sstevel@tonic-gate 		ASSERT(frozenstr(q));
19350Sstevel@tonic-gate 		ASSERT(MUTEX_HELD(QLOCK(q)));
19360Sstevel@tonic-gate 	} else
19370Sstevel@tonic-gate 		mutex_enter(QLOCK(q));
19380Sstevel@tonic-gate 
19390Sstevel@tonic-gate 	if ((bp = q->q_first) == 0) {
19400Sstevel@tonic-gate 		q->q_flag |= QWANTR;
19410Sstevel@tonic-gate 	} else {
19426769Sja97890 		/*
19436769Sja97890 		 * If the caller supplied a byte threshold and there is
19446769Sja97890 		 * more than this amount on the queue then break up the
19456769Sja97890 		 * the message appropriately.  We can only safely do
19466769Sja97890 		 * this for M_DATA messages.
19476769Sja97890 		 */
19486769Sja97890 		if ((DB_TYPE(bp) == M_DATA) && (rbytes > 0) &&
19496769Sja97890 		    (q->q_count > rbytes)) {
19506769Sja97890 			/*
19516769Sja97890 			 * Inline version of mp_cont_len() which terminates
19526769Sja97890 			 * when we meet or exceed rbytes.
19536769Sja97890 			 */
19546769Sja97890 			for (mp1 = bp; mp1 != NULL; mp1 = mp1->b_cont) {
19556769Sja97890 				mblkcnt++;
19566769Sja97890 				ADD_MBLK_SIZE(mp1, bytecnt);
19576769Sja97890 				if (bytecnt  >= rbytes)
19586769Sja97890 					break;
19596769Sja97890 			}
19606769Sja97890 			/*
19616769Sja97890 			 * We need to account for the following scenarios:
19626769Sja97890 			 *
19636769Sja97890 			 * 1) Too much data in the first message:
19646769Sja97890 			 *	mp1 will be the mblk which puts us over our
19656769Sja97890 			 *	byte limit.
19666769Sja97890 			 * 2) Not enough data in the first message:
19676769Sja97890 			 *	mp1 will be NULL.
19686769Sja97890 			 * 3) Exactly the right amount of data contained within
19696769Sja97890 			 *    whole mblks:
19706769Sja97890 			 *	mp1->b_cont will be where we break the message.
19716769Sja97890 			 */
19726769Sja97890 			if (bytecnt > rbytes) {
19736769Sja97890 				/*
19746769Sja97890 				 * Dup/copy mp1 and put what we don't need
19756769Sja97890 				 * back onto the queue. Adjust the read/write
19766769Sja97890 				 * and continuation pointers appropriately
19776769Sja97890 				 * and decrement the current mblk count to
19786769Sja97890 				 * reflect we are putting an mblk back onto
19796769Sja97890 				 * the queue.
19806769Sja97890 				 * When adjusting the message pointers, it's
19816769Sja97890 				 * OK to use the existing bytecnt and the
19826769Sja97890 				 * requested amount (rbytes) to calculate the
19836769Sja97890 				 * the new write offset (b_wptr) of what we
19846769Sja97890 				 * are taking. However, we  cannot use these
19856769Sja97890 				 * values when calculating the read offset of
19866769Sja97890 				 * the mblk we are putting back on the queue.
19876769Sja97890 				 * This is because the begining (b_rptr) of the
19886769Sja97890 				 * mblk represents some arbitrary point within
19896769Sja97890 				 * the message.
19906769Sja97890 				 * It's simplest to do this by advancing b_rptr
19916769Sja97890 				 * by the new length of mp1 as we don't have to
19926769Sja97890 				 * remember any intermediate state.
19936769Sja97890 				 */
19946769Sja97890 				ASSERT(mp1 != NULL);
19956769Sja97890 				mblkcnt--;
19966769Sja97890 				if ((mp2 = dupb(mp1)) == NULL &&
19976769Sja97890 				    (mp2 = copyb(mp1)) == NULL) {
19986769Sja97890 					bytecnt = mblkcnt = 0;
19996769Sja97890 					goto dup_failed;
20006769Sja97890 				}
20016769Sja97890 				mp2->b_cont = mp1->b_cont;
20026769Sja97890 				mp1->b_wptr -= bytecnt - rbytes;
20036769Sja97890 				mp2->b_rptr += mp1->b_wptr - mp1->b_rptr;
20046769Sja97890 				mp1->b_cont = NULL;
20056769Sja97890 				bytecnt = rbytes;
20066769Sja97890 			} else {
20076769Sja97890 				/*
20086769Sja97890 				 * Either there is not enough data in the first
20096769Sja97890 				 * message or there is no excess data to deal
20106769Sja97890 				 * with. If mp1 is NULL, we are taking the
20116769Sja97890 				 * whole message. No need to do anything.
20126769Sja97890 				 * Otherwise we assign mp1->b_cont to mp2 as
20136769Sja97890 				 * we will be putting this back onto the head of
20146769Sja97890 				 * the queue.
20156769Sja97890 				 */
20166769Sja97890 				if (mp1 != NULL) {
20176769Sja97890 					mp2 = mp1->b_cont;
20186769Sja97890 					mp1->b_cont = NULL;
20196769Sja97890 				}
20206769Sja97890 			}
20216769Sja97890 			/*
20226769Sja97890 			 * If mp2 is not NULL then we have part of the message
20236769Sja97890 			 * to put back onto the queue.
20246769Sja97890 			 */
20256769Sja97890 			if (mp2 != NULL) {
20266769Sja97890 				if ((mp2->b_next = bp->b_next) == NULL)
20276769Sja97890 					q->q_last = mp2;
20286769Sja97890 				else
20296769Sja97890 					bp->b_next->b_prev = mp2;
20306769Sja97890 				q->q_first = mp2;
20316769Sja97890 			} else {
20326769Sja97890 				if ((q->q_first = bp->b_next) == NULL)
20336769Sja97890 					q->q_last = NULL;
20346769Sja97890 				else
20356769Sja97890 					q->q_first->b_prev = NULL;
20366769Sja97890 			}
20376769Sja97890 		} else {
20386769Sja97890 			/*
20396769Sja97890 			 * Either no byte threshold was supplied, there is
20406769Sja97890 			 * not enough on the queue or we failed to
20416769Sja97890 			 * duplicate/copy a data block. In these cases we
20426769Sja97890 			 * just take the entire first message.
20436769Sja97890 			 */
20446769Sja97890 dup_failed:
20456769Sja97890 			bytecnt = mp_cont_len(bp, &mblkcnt);
20466769Sja97890 			if ((q->q_first = bp->b_next) == NULL)
20476769Sja97890 				q->q_last = NULL;
20486769Sja97890 			else
20496769Sja97890 				q->q_first->b_prev = NULL;
20500Sstevel@tonic-gate 		}
20510Sstevel@tonic-gate 		if (bp->b_band == 0) {
20520Sstevel@tonic-gate 			q->q_count -= bytecnt;
20530Sstevel@tonic-gate 			q->q_mblkcnt -= mblkcnt;
20545360Srk129064 			if (q->q_mblkcnt == 0 || ((q->q_count < q->q_hiwat) &&
20555360Srk129064 			    (q->q_mblkcnt < q->q_hiwat))) {
20560Sstevel@tonic-gate 				q->q_flag &= ~QFULL;
20570Sstevel@tonic-gate 			}
20580Sstevel@tonic-gate 		} else {
20590Sstevel@tonic-gate 			int i;
20600Sstevel@tonic-gate 
20610Sstevel@tonic-gate 			ASSERT(bp->b_band <= q->q_nband);
20620Sstevel@tonic-gate 			ASSERT(q->q_bandp != NULL);
20630Sstevel@tonic-gate 			ASSERT(MUTEX_HELD(QLOCK(q)));
20640Sstevel@tonic-gate 			qbp = q->q_bandp;
20650Sstevel@tonic-gate 			i = bp->b_band;
20660Sstevel@tonic-gate 			while (--i > 0)
20670Sstevel@tonic-gate 				qbp = qbp->qb_next;
20680Sstevel@tonic-gate 			if (qbp->qb_first == qbp->qb_last) {
20690Sstevel@tonic-gate 				qbp->qb_first = NULL;
20700Sstevel@tonic-gate 				qbp->qb_last = NULL;
20710Sstevel@tonic-gate 			} else {
20720Sstevel@tonic-gate 				qbp->qb_first = bp->b_next;
20730Sstevel@tonic-gate 			}
20740Sstevel@tonic-gate 			qbp->qb_count -= bytecnt;
20750Sstevel@tonic-gate 			qbp->qb_mblkcnt -= mblkcnt;
20765360Srk129064 			if (qbp->qb_mblkcnt == 0 ||
20775360Srk129064 			    ((qbp->qb_count < qbp->qb_hiwat) &&
20785360Srk129064 			    (qbp->qb_mblkcnt < qbp->qb_hiwat))) {
20790Sstevel@tonic-gate 				qbp->qb_flag &= ~QB_FULL;
20800Sstevel@tonic-gate 			}
20810Sstevel@tonic-gate 		}
20820Sstevel@tonic-gate 		q->q_flag &= ~QWANTR;
20830Sstevel@tonic-gate 		bp->b_next = NULL;
20840Sstevel@tonic-gate 		bp->b_prev = NULL;
20850Sstevel@tonic-gate 	}
20860Sstevel@tonic-gate 	if (freezer != curthread)
20870Sstevel@tonic-gate 		mutex_exit(QLOCK(q));
20880Sstevel@tonic-gate 
20890Sstevel@tonic-gate 	STR_FTEVENT_MSG(bp, q, FTEV_GETQ, NULL);
20900Sstevel@tonic-gate 
20910Sstevel@tonic-gate 	return (bp);
20920Sstevel@tonic-gate }
20930Sstevel@tonic-gate 
20940Sstevel@tonic-gate /*
20950Sstevel@tonic-gate  * Determine if a backenable is needed after removing a message in the
20960Sstevel@tonic-gate  * specified band.
20970Sstevel@tonic-gate  * NOTE: This routine assumes that something like getq_noenab() has been
20980Sstevel@tonic-gate  * already called.
20990Sstevel@tonic-gate  *
21000Sstevel@tonic-gate  * For the read side it is ok to hold sd_lock across calling this (and the
21010Sstevel@tonic-gate  * stream head often does).
21020Sstevel@tonic-gate  * But for the write side strwakeq might be invoked and it acquires sd_lock.
21030Sstevel@tonic-gate  */
21040Sstevel@tonic-gate void
2105235Smicheng qbackenable(queue_t *q, uchar_t band)
21060Sstevel@tonic-gate {
21070Sstevel@tonic-gate 	int backenab = 0;
21080Sstevel@tonic-gate 	qband_t *qbp;
21090Sstevel@tonic-gate 	kthread_id_t freezer;
21100Sstevel@tonic-gate 
21110Sstevel@tonic-gate 	ASSERT(q);
21120Sstevel@tonic-gate 	ASSERT((q->q_flag & QREADR) || MUTEX_NOT_HELD(&STREAM(q)->sd_lock));
21130Sstevel@tonic-gate 
21140Sstevel@tonic-gate 	/*
21150Sstevel@tonic-gate 	 * Quick check without holding the lock.
21160Sstevel@tonic-gate 	 * OK since after getq() has lowered the q_count these flags
21170Sstevel@tonic-gate 	 * would not change unless either the qbackenable() is done by
21180Sstevel@tonic-gate 	 * another thread (which is ok) or the queue has gotten QFULL
21190Sstevel@tonic-gate 	 * in which case another backenable will take place when the queue
21200Sstevel@tonic-gate 	 * drops below q_lowat.
21210Sstevel@tonic-gate 	 */
21220Sstevel@tonic-gate 	if (band == 0 && (q->q_flag & (QWANTW|QWANTWSYNC)) == 0)
21230Sstevel@tonic-gate 		return;
21240Sstevel@tonic-gate 
21250Sstevel@tonic-gate 	/* freezestr should allow its caller to call getq/putq */
21260Sstevel@tonic-gate 	freezer = STREAM(q)->sd_freezer;
21270Sstevel@tonic-gate 	if (freezer == curthread) {
21280Sstevel@tonic-gate 		ASSERT(frozenstr(q));
21290Sstevel@tonic-gate 		ASSERT(MUTEX_HELD(QLOCK(q)));
21300Sstevel@tonic-gate 	} else
21310Sstevel@tonic-gate 		mutex_enter(QLOCK(q));
21320Sstevel@tonic-gate 
21330Sstevel@tonic-gate 	if (band == 0) {
21340Sstevel@tonic-gate 		if (q->q_lowat == 0 || (q->q_count < q->q_lowat &&
21350Sstevel@tonic-gate 		    q->q_mblkcnt < q->q_lowat)) {
21360Sstevel@tonic-gate 			backenab = q->q_flag & (QWANTW|QWANTWSYNC);
21370Sstevel@tonic-gate 		}
21380Sstevel@tonic-gate 	} else {
21390Sstevel@tonic-gate 		int i;
21400Sstevel@tonic-gate 
21410Sstevel@tonic-gate 		ASSERT((unsigned)band <= q->q_nband);
21420Sstevel@tonic-gate 		ASSERT(q->q_bandp != NULL);
21430Sstevel@tonic-gate 
21440Sstevel@tonic-gate 		qbp = q->q_bandp;
21450Sstevel@tonic-gate 		i = band;
21460Sstevel@tonic-gate 		while (--i > 0)
21470Sstevel@tonic-gate 			qbp = qbp->qb_next;
21480Sstevel@tonic-gate 
21490Sstevel@tonic-gate 		if (qbp->qb_lowat == 0 || (qbp->qb_count < qbp->qb_lowat &&
21500Sstevel@tonic-gate 		    qbp->qb_mblkcnt < qbp->qb_lowat)) {
21510Sstevel@tonic-gate 			backenab = qbp->qb_flag & QB_WANTW;
21520Sstevel@tonic-gate 		}
21530Sstevel@tonic-gate 	}
21540Sstevel@tonic-gate 
21550Sstevel@tonic-gate 	if (backenab == 0) {
21560Sstevel@tonic-gate 		if (freezer != curthread)
21570Sstevel@tonic-gate 			mutex_exit(QLOCK(q));
21580Sstevel@tonic-gate 		return;
21590Sstevel@tonic-gate 	}
21600Sstevel@tonic-gate 
21610Sstevel@tonic-gate 	/* Have to drop the lock across strwakeq and backenable */
21620Sstevel@tonic-gate 	if (backenab & QWANTWSYNC)
21630Sstevel@tonic-gate 		q->q_flag &= ~QWANTWSYNC;
21640Sstevel@tonic-gate 	if (backenab & (QWANTW|QB_WANTW)) {
21650Sstevel@tonic-gate 		if (band != 0)
21660Sstevel@tonic-gate 			qbp->qb_flag &= ~QB_WANTW;
21670Sstevel@tonic-gate 		else {
21680Sstevel@tonic-gate 			q->q_flag &= ~QWANTW;
21690Sstevel@tonic-gate 		}
21700Sstevel@tonic-gate 	}
21710Sstevel@tonic-gate 
21720Sstevel@tonic-gate 	if (freezer != curthread)
21730Sstevel@tonic-gate 		mutex_exit(QLOCK(q));
21740Sstevel@tonic-gate 
21750Sstevel@tonic-gate 	if (backenab & QWANTWSYNC)
21760Sstevel@tonic-gate 		strwakeq(q, QWANTWSYNC);
21770Sstevel@tonic-gate 	if (backenab & (QWANTW|QB_WANTW))
21780Sstevel@tonic-gate 		backenable(q, band);
21790Sstevel@tonic-gate }
21800Sstevel@tonic-gate 
21810Sstevel@tonic-gate /*
21820Sstevel@tonic-gate  * Remove a message from a queue.  The queue count and other
21830Sstevel@tonic-gate  * flow control parameters are adjusted and the back queue
21840Sstevel@tonic-gate  * enabled if necessary.
21850Sstevel@tonic-gate  *
21860Sstevel@tonic-gate  * rmvq can be called with the stream frozen, but other utility functions
21870Sstevel@tonic-gate  * holding QLOCK, and by streams modules without any locks/frozen.
21880Sstevel@tonic-gate  */
21890Sstevel@tonic-gate void
21900Sstevel@tonic-gate rmvq(queue_t *q, mblk_t *mp)
21910Sstevel@tonic-gate {
21920Sstevel@tonic-gate 	ASSERT(mp != NULL);
21930Sstevel@tonic-gate 
21940Sstevel@tonic-gate 	rmvq_noenab(q, mp);
21950Sstevel@tonic-gate 	if (curthread != STREAM(q)->sd_freezer && MUTEX_HELD(QLOCK(q))) {
21960Sstevel@tonic-gate 		/*
21970Sstevel@tonic-gate 		 * qbackenable can handle a frozen stream but not a "random"
21980Sstevel@tonic-gate 		 * qlock being held. Drop lock across qbackenable.
21990Sstevel@tonic-gate 		 */
22000Sstevel@tonic-gate 		mutex_exit(QLOCK(q));
22010Sstevel@tonic-gate 		qbackenable(q, mp->b_band);
22020Sstevel@tonic-gate 		mutex_enter(QLOCK(q));
22030Sstevel@tonic-gate 	} else {
22040Sstevel@tonic-gate 		qbackenable(q, mp->b_band);
22050Sstevel@tonic-gate 	}
22060Sstevel@tonic-gate }
22070Sstevel@tonic-gate 
22080Sstevel@tonic-gate /*
22090Sstevel@tonic-gate  * Like rmvq() but without any backenabling.
22100Sstevel@tonic-gate  * This exists to handle SR_CONSOL_DATA in strrput().
22110Sstevel@tonic-gate  */
22120Sstevel@tonic-gate void
22130Sstevel@tonic-gate rmvq_noenab(queue_t *q, mblk_t *mp)
22140Sstevel@tonic-gate {
22150Sstevel@tonic-gate 	int i;
22160Sstevel@tonic-gate 	qband_t *qbp = NULL;
22170Sstevel@tonic-gate 	kthread_id_t freezer;
22180Sstevel@tonic-gate 	int	bytecnt = 0, mblkcnt = 0;
22190Sstevel@tonic-gate 
22200Sstevel@tonic-gate 	freezer = STREAM(q)->sd_freezer;
22210Sstevel@tonic-gate 	if (freezer == curthread) {
22220Sstevel@tonic-gate 		ASSERT(frozenstr(q));
22230Sstevel@tonic-gate 		ASSERT(MUTEX_HELD(QLOCK(q)));
22240Sstevel@tonic-gate 	} else if (MUTEX_HELD(QLOCK(q))) {
22250Sstevel@tonic-gate 		/* Don't drop lock on exit */
22260Sstevel@tonic-gate 		freezer = curthread;
22270Sstevel@tonic-gate 	} else
22280Sstevel@tonic-gate 		mutex_enter(QLOCK(q));
22290Sstevel@tonic-gate 
22300Sstevel@tonic-gate 	ASSERT(mp->b_band <= q->q_nband);
22310Sstevel@tonic-gate 	if (mp->b_band != 0) {		/* Adjust band pointers */
22320Sstevel@tonic-gate 		ASSERT(q->q_bandp != NULL);
22330Sstevel@tonic-gate 		qbp = q->q_bandp;
22340Sstevel@tonic-gate 		i = mp->b_band;
22350Sstevel@tonic-gate 		while (--i > 0)
22360Sstevel@tonic-gate 			qbp = qbp->qb_next;
22370Sstevel@tonic-gate 		if (mp == qbp->qb_first) {
22380Sstevel@tonic-gate 			if (mp->b_next && mp->b_band == mp->b_next->b_band)
22390Sstevel@tonic-gate 				qbp->qb_first = mp->b_next;
22400Sstevel@tonic-gate 			else
22410Sstevel@tonic-gate 				qbp->qb_first = NULL;
22420Sstevel@tonic-gate 		}
22430Sstevel@tonic-gate 		if (mp == qbp->qb_last) {
22440Sstevel@tonic-gate 			if (mp->b_prev && mp->b_band == mp->b_prev->b_band)
22450Sstevel@tonic-gate 				qbp->qb_last = mp->b_prev;
22460Sstevel@tonic-gate 			else
22470Sstevel@tonic-gate 				qbp->qb_last = NULL;
22480Sstevel@tonic-gate 		}
22490Sstevel@tonic-gate 	}
22500Sstevel@tonic-gate 
22510Sstevel@tonic-gate 	/*
22520Sstevel@tonic-gate 	 * Remove the message from the list.
22530Sstevel@tonic-gate 	 */
22540Sstevel@tonic-gate 	if (mp->b_prev)
22550Sstevel@tonic-gate 		mp->b_prev->b_next = mp->b_next;
22560Sstevel@tonic-gate 	else
22570Sstevel@tonic-gate 		q->q_first = mp->b_next;
22580Sstevel@tonic-gate 	if (mp->b_next)
22590Sstevel@tonic-gate 		mp->b_next->b_prev = mp->b_prev;
22600Sstevel@tonic-gate 	else
22610Sstevel@tonic-gate 		q->q_last = mp->b_prev;
22620Sstevel@tonic-gate 	mp->b_next = NULL;
22630Sstevel@tonic-gate 	mp->b_prev = NULL;
22640Sstevel@tonic-gate 
22650Sstevel@tonic-gate 	/* Get the size of the message for q_count accounting */
22666769Sja97890 	bytecnt = mp_cont_len(mp, &mblkcnt);
22670Sstevel@tonic-gate 
22680Sstevel@tonic-gate 	if (mp->b_band == 0) {		/* Perform q_count accounting */
22690Sstevel@tonic-gate 		q->q_count -= bytecnt;
22700Sstevel@tonic-gate 		q->q_mblkcnt -= mblkcnt;
22715360Srk129064 		if (q->q_mblkcnt == 0 || ((q->q_count < q->q_hiwat) &&
22725360Srk129064 		    (q->q_mblkcnt < q->q_hiwat))) {
22730Sstevel@tonic-gate 			q->q_flag &= ~QFULL;
22740Sstevel@tonic-gate 		}
22750Sstevel@tonic-gate 	} else {			/* Perform qb_count accounting */
22760Sstevel@tonic-gate 		qbp->qb_count -= bytecnt;
22770Sstevel@tonic-gate 		qbp->qb_mblkcnt -= mblkcnt;
22785360Srk129064 		if (qbp->qb_mblkcnt == 0 || ((qbp->qb_count < qbp->qb_hiwat) &&
22795360Srk129064 		    (qbp->qb_mblkcnt < qbp->qb_hiwat))) {
22800Sstevel@tonic-gate 			qbp->qb_flag &= ~QB_FULL;
22810Sstevel@tonic-gate 		}
22820Sstevel@tonic-gate 	}
22830Sstevel@tonic-gate 	if (freezer != curthread)
22840Sstevel@tonic-gate 		mutex_exit(QLOCK(q));
22850Sstevel@tonic-gate 
22860Sstevel@tonic-gate 	STR_FTEVENT_MSG(mp, q, FTEV_RMVQ, NULL);
22870Sstevel@tonic-gate }
22880Sstevel@tonic-gate 
22890Sstevel@tonic-gate /*
22900Sstevel@tonic-gate  * Empty a queue.
22910Sstevel@tonic-gate  * If flag is set, remove all messages.  Otherwise, remove
22920Sstevel@tonic-gate  * only non-control messages.  If queue falls below its low
22930Sstevel@tonic-gate  * water mark, and QWANTW is set, enable the nearest upstream
22940Sstevel@tonic-gate  * service procedure.
22950Sstevel@tonic-gate  *
22960Sstevel@tonic-gate  * Historical note: when merging the M_FLUSH code in strrput with this
22970Sstevel@tonic-gate  * code one difference was discovered. flushq did not have a check
22980Sstevel@tonic-gate  * for q_lowat == 0 in the backenabling test.
22990Sstevel@tonic-gate  *
23000Sstevel@tonic-gate  * pcproto_flag specifies whether or not a M_PCPROTO message should be flushed
23010Sstevel@tonic-gate  * if one exists on the queue.
23020Sstevel@tonic-gate  */
23030Sstevel@tonic-gate void
23040Sstevel@tonic-gate flushq_common(queue_t *q, int flag, int pcproto_flag)
23050Sstevel@tonic-gate {
23060Sstevel@tonic-gate 	mblk_t *mp, *nmp;
23070Sstevel@tonic-gate 	qband_t *qbp;
23080Sstevel@tonic-gate 	int backenab = 0;
23090Sstevel@tonic-gate 	unsigned char bpri;
23100Sstevel@tonic-gate 	unsigned char	qbf[NBAND];	/* band flushing backenable flags */
23110Sstevel@tonic-gate 
23120Sstevel@tonic-gate 	if (q->q_first == NULL)
23130Sstevel@tonic-gate 		return;
23140Sstevel@tonic-gate 
23150Sstevel@tonic-gate 	mutex_enter(QLOCK(q));
23160Sstevel@tonic-gate 	mp = q->q_first;
23170Sstevel@tonic-gate 	q->q_first = NULL;
23180Sstevel@tonic-gate 	q->q_last = NULL;
23190Sstevel@tonic-gate 	q->q_count = 0;
23200Sstevel@tonic-gate 	q->q_mblkcnt = 0;
23210Sstevel@tonic-gate 	for (qbp = q->q_bandp; qbp; qbp = qbp->qb_next) {
23220Sstevel@tonic-gate 		qbp->qb_first = NULL;
23230Sstevel@tonic-gate 		qbp->qb_last = NULL;
23240Sstevel@tonic-gate 		qbp->qb_count = 0;
23250Sstevel@tonic-gate 		qbp->qb_mblkcnt = 0;
23260Sstevel@tonic-gate 		qbp->qb_flag &= ~QB_FULL;
23270Sstevel@tonic-gate 	}
23280Sstevel@tonic-gate 	q->q_flag &= ~QFULL;
23290Sstevel@tonic-gate 	mutex_exit(QLOCK(q));
23300Sstevel@tonic-gate 	while (mp) {
23310Sstevel@tonic-gate 		nmp = mp->b_next;
23320Sstevel@tonic-gate 		mp->b_next = mp->b_prev = NULL;
23330Sstevel@tonic-gate 
23340Sstevel@tonic-gate 		STR_FTEVENT_MBLK(mp, q, FTEV_FLUSHQ, NULL);
23350Sstevel@tonic-gate 
23360Sstevel@tonic-gate 		if (pcproto_flag && (mp->b_datap->db_type == M_PCPROTO))
23370Sstevel@tonic-gate 			(void) putq(q, mp);
23380Sstevel@tonic-gate 		else if (flag || datamsg(mp->b_datap->db_type))
23390Sstevel@tonic-gate 			freemsg(mp);
23400Sstevel@tonic-gate 		else
23410Sstevel@tonic-gate 			(void) putq(q, mp);
23420Sstevel@tonic-gate 		mp = nmp;
23430Sstevel@tonic-gate 	}
23440Sstevel@tonic-gate 	bpri = 1;
23450Sstevel@tonic-gate 	mutex_enter(QLOCK(q));
23460Sstevel@tonic-gate 	for (qbp = q->q_bandp; qbp; qbp = qbp->qb_next) {
23470Sstevel@tonic-gate 		if ((qbp->qb_flag & QB_WANTW) &&
23480Sstevel@tonic-gate 		    (((qbp->qb_count < qbp->qb_lowat) &&
23490Sstevel@tonic-gate 		    (qbp->qb_mblkcnt < qbp->qb_lowat)) ||
23500Sstevel@tonic-gate 		    qbp->qb_lowat == 0)) {
23510Sstevel@tonic-gate 			qbp->qb_flag &= ~QB_WANTW;
23520Sstevel@tonic-gate 			backenab = 1;
23530Sstevel@tonic-gate 			qbf[bpri] = 1;
23540Sstevel@tonic-gate 		} else
23550Sstevel@tonic-gate 			qbf[bpri] = 0;
23560Sstevel@tonic-gate 		bpri++;
23570Sstevel@tonic-gate 	}
23580Sstevel@tonic-gate 	ASSERT(bpri == (unsigned char)(q->q_nband + 1));
23590Sstevel@tonic-gate 	if ((q->q_flag & QWANTW) &&
23600Sstevel@tonic-gate 	    (((q->q_count < q->q_lowat) &&
23610Sstevel@tonic-gate 	    (q->q_mblkcnt < q->q_lowat)) || q->q_lowat == 0)) {
23620Sstevel@tonic-gate 		q->q_flag &= ~QWANTW;
23630Sstevel@tonic-gate 		backenab = 1;
23640Sstevel@tonic-gate 		qbf[0] = 1;
23650Sstevel@tonic-gate 	} else
23660Sstevel@tonic-gate 		qbf[0] = 0;
23670Sstevel@tonic-gate 
23680Sstevel@tonic-gate 	/*
23690Sstevel@tonic-gate 	 * If any band can now be written to, and there is a writer
23700Sstevel@tonic-gate 	 * for that band, then backenable the closest service procedure.
23710Sstevel@tonic-gate 	 */
23720Sstevel@tonic-gate 	if (backenab) {
23730Sstevel@tonic-gate 		mutex_exit(QLOCK(q));
23740Sstevel@tonic-gate 		for (bpri = q->q_nband; bpri != 0; bpri--)
23750Sstevel@tonic-gate 			if (qbf[bpri])
2376235Smicheng 				backenable(q, bpri);
23770Sstevel@tonic-gate 		if (qbf[0])
23780Sstevel@tonic-gate 			backenable(q, 0);
23790Sstevel@tonic-gate 	} else
23800Sstevel@tonic-gate 		mutex_exit(QLOCK(q));
23810Sstevel@tonic-gate }
23820Sstevel@tonic-gate 
23830Sstevel@tonic-gate /*
23840Sstevel@tonic-gate  * The real flushing takes place in flushq_common. This is done so that
23850Sstevel@tonic-gate  * a flag which specifies whether or not M_PCPROTO messages should be flushed
23860Sstevel@tonic-gate  * or not. Currently the only place that uses this flag is the stream head.
23870Sstevel@tonic-gate  */
23880Sstevel@tonic-gate void
23890Sstevel@tonic-gate flushq(queue_t *q, int flag)
23900Sstevel@tonic-gate {
23910Sstevel@tonic-gate 	flushq_common(q, flag, 0);
23920Sstevel@tonic-gate }
23930Sstevel@tonic-gate 
23940Sstevel@tonic-gate /*
23950Sstevel@tonic-gate  * Flush the queue of messages of the given priority band.
23960Sstevel@tonic-gate  * There is some duplication of code between flushq and flushband.
23970Sstevel@tonic-gate  * This is because we want to optimize the code as much as possible.
23980Sstevel@tonic-gate  * The assumption is that there will be more messages in the normal
23990Sstevel@tonic-gate  * (priority 0) band than in any other.
24000Sstevel@tonic-gate  *
24010Sstevel@tonic-gate  * Historical note: when merging the M_FLUSH code in strrput with this
24020Sstevel@tonic-gate  * code one difference was discovered. flushband had an extra check for
24030Sstevel@tonic-gate  * did not have a check for (mp->b_datap->db_type < QPCTL) in the band 0
24040Sstevel@tonic-gate  * case. That check does not match the man page for flushband and was not
24050Sstevel@tonic-gate  * in the strrput flush code hence it was removed.
24060Sstevel@tonic-gate  */
24070Sstevel@tonic-gate void
24080Sstevel@tonic-gate flushband(queue_t *q, unsigned char pri, int flag)
24090Sstevel@tonic-gate {
24100Sstevel@tonic-gate 	mblk_t *mp;
24110Sstevel@tonic-gate 	mblk_t *nmp;
24120Sstevel@tonic-gate 	mblk_t *last;
24130Sstevel@tonic-gate 	qband_t *qbp;
24140Sstevel@tonic-gate 	int band;
24150Sstevel@tonic-gate 
24160Sstevel@tonic-gate 	ASSERT((flag == FLUSHDATA) || (flag == FLUSHALL));
24170Sstevel@tonic-gate 	if (pri > q->q_nband) {
24180Sstevel@tonic-gate 		return;
24190Sstevel@tonic-gate 	}
24200Sstevel@tonic-gate 	mutex_enter(QLOCK(q));
24210Sstevel@tonic-gate 	if (pri == 0) {
24220Sstevel@tonic-gate 		mp = q->q_first;
24230Sstevel@tonic-gate 		q->q_first = NULL;
24240Sstevel@tonic-gate 		q->q_last = NULL;
24250Sstevel@tonic-gate 		q->q_count = 0;
24260Sstevel@tonic-gate 		q->q_mblkcnt = 0;
24270Sstevel@tonic-gate 		for (qbp = q->q_bandp; qbp; qbp = qbp->qb_next) {
24280Sstevel@tonic-gate 			qbp->qb_first = NULL;
24290Sstevel@tonic-gate 			qbp->qb_last = NULL;
24300Sstevel@tonic-gate 			qbp->qb_count = 0;
24310Sstevel@tonic-gate 			qbp->qb_mblkcnt = 0;
24320Sstevel@tonic-gate 			qbp->qb_flag &= ~QB_FULL;
24330Sstevel@tonic-gate 		}
24340Sstevel@tonic-gate 		q->q_flag &= ~QFULL;
24350Sstevel@tonic-gate 		mutex_exit(QLOCK(q));
24360Sstevel@tonic-gate 		while (mp) {
24370Sstevel@tonic-gate 			nmp = mp->b_next;
24380Sstevel@tonic-gate 			mp->b_next = mp->b_prev = NULL;
24390Sstevel@tonic-gate 			if ((mp->b_band == 0) &&
24406707Sbrutus 			    ((flag == FLUSHALL) ||
24416707Sbrutus 			    datamsg(mp->b_datap->db_type)))
24420Sstevel@tonic-gate 				freemsg(mp);
24430Sstevel@tonic-gate 			else
24440Sstevel@tonic-gate 				(void) putq(q, mp);
24450Sstevel@tonic-gate 			mp = nmp;
24460Sstevel@tonic-gate 		}
24470Sstevel@tonic-gate 		mutex_enter(QLOCK(q));
24480Sstevel@tonic-gate 		if ((q->q_flag & QWANTW) &&
24490Sstevel@tonic-gate 		    (((q->q_count < q->q_lowat) &&
24500Sstevel@tonic-gate 		    (q->q_mblkcnt < q->q_lowat)) || q->q_lowat == 0)) {
24510Sstevel@tonic-gate 			q->q_flag &= ~QWANTW;
24520Sstevel@tonic-gate 			mutex_exit(QLOCK(q));
24530Sstevel@tonic-gate 
2454235Smicheng 			backenable(q, pri);
24550Sstevel@tonic-gate 		} else
24560Sstevel@tonic-gate 			mutex_exit(QLOCK(q));
24570Sstevel@tonic-gate 	} else {	/* pri != 0 */
24580Sstevel@tonic-gate 		boolean_t flushed = B_FALSE;
24590Sstevel@tonic-gate 		band = pri;
24600Sstevel@tonic-gate 
24610Sstevel@tonic-gate 		ASSERT(MUTEX_HELD(QLOCK(q)));
24620Sstevel@tonic-gate 		qbp = q->q_bandp;
24630Sstevel@tonic-gate 		while (--band > 0)
24640Sstevel@tonic-gate 			qbp = qbp->qb_next;
24650Sstevel@tonic-gate 		mp = qbp->qb_first;
24660Sstevel@tonic-gate 		if (mp == NULL) {
24670Sstevel@tonic-gate 			mutex_exit(QLOCK(q));
24680Sstevel@tonic-gate 			return;
24690Sstevel@tonic-gate 		}
24700Sstevel@tonic-gate 		last = qbp->qb_last->b_next;
24710Sstevel@tonic-gate 		/*
24720Sstevel@tonic-gate 		 * rmvq_noenab() and freemsg() are called for each mblk that
24730Sstevel@tonic-gate 		 * meets the criteria.  The loop is executed until the last
24740Sstevel@tonic-gate 		 * mblk has been processed.
24750Sstevel@tonic-gate 		 */
24760Sstevel@tonic-gate 		while (mp != last) {
24770Sstevel@tonic-gate 			ASSERT(mp->b_band == pri);
24780Sstevel@tonic-gate 			nmp = mp->b_next;
24790Sstevel@tonic-gate 			if (flag == FLUSHALL || datamsg(mp->b_datap->db_type)) {
24800Sstevel@tonic-gate 				rmvq_noenab(q, mp);
24810Sstevel@tonic-gate 				freemsg(mp);
24820Sstevel@tonic-gate 				flushed = B_TRUE;
24830Sstevel@tonic-gate 			}
24840Sstevel@tonic-gate 			mp = nmp;
24850Sstevel@tonic-gate 		}
24860Sstevel@tonic-gate 		mutex_exit(QLOCK(q));
24870Sstevel@tonic-gate 
24880Sstevel@tonic-gate 		/*
24890Sstevel@tonic-gate 		 * If any mblk(s) has been freed, we know that qbackenable()
24900Sstevel@tonic-gate 		 * will need to be called.
24910Sstevel@tonic-gate 		 */
24920Sstevel@tonic-gate 		if (flushed)
2493235Smicheng 			qbackenable(q, pri);
24940Sstevel@tonic-gate 	}
24950Sstevel@tonic-gate }
24960Sstevel@tonic-gate 
24970Sstevel@tonic-gate /*
24980Sstevel@tonic-gate  * Return 1 if the queue is not full.  If the queue is full, return
24990Sstevel@tonic-gate  * 0 (may not put message) and set QWANTW flag (caller wants to write
25000Sstevel@tonic-gate  * to the queue).
25010Sstevel@tonic-gate  */
25020Sstevel@tonic-gate int
25030Sstevel@tonic-gate canput(queue_t *q)
25040Sstevel@tonic-gate {
25050Sstevel@tonic-gate 	TRACE_1(TR_FAC_STREAMS_FR, TR_CANPUT_IN, "canput:%p", q);
25060Sstevel@tonic-gate 
25070Sstevel@tonic-gate 	/* this is for loopback transports, they should not do a canput */
25080Sstevel@tonic-gate 	ASSERT(STRMATED(q->q_stream) || STREAM(q) == STREAM(q->q_nfsrv));
25090Sstevel@tonic-gate 
25100Sstevel@tonic-gate 	/* Find next forward module that has a service procedure */
25110Sstevel@tonic-gate 	q = q->q_nfsrv;
25120Sstevel@tonic-gate 
25130Sstevel@tonic-gate 	if (!(q->q_flag & QFULL)) {
25140Sstevel@tonic-gate 		TRACE_2(TR_FAC_STREAMS_FR, TR_CANPUT_OUT, "canput:%p %d", q, 1);
25150Sstevel@tonic-gate 		return (1);
25160Sstevel@tonic-gate 	}
25170Sstevel@tonic-gate 	mutex_enter(QLOCK(q));
25180Sstevel@tonic-gate 	if (q->q_flag & QFULL) {
25190Sstevel@tonic-gate 		q->q_flag |= QWANTW;
25200Sstevel@tonic-gate 		mutex_exit(QLOCK(q));
25210Sstevel@tonic-gate 		TRACE_2(TR_FAC_STREAMS_FR, TR_CANPUT_OUT, "canput:%p %d", q, 0);
25220Sstevel@tonic-gate 		return (0);
25230Sstevel@tonic-gate 	}
25240Sstevel@tonic-gate 	mutex_exit(QLOCK(q));
25250Sstevel@tonic-gate 	TRACE_2(TR_FAC_STREAMS_FR, TR_CANPUT_OUT, "canput:%p %d", q, 1);
25260Sstevel@tonic-gate 	return (1);
25270Sstevel@tonic-gate }
25280Sstevel@tonic-gate 
25290Sstevel@tonic-gate /*
25300Sstevel@tonic-gate  * This is the new canput for use with priority bands.  Return 1 if the
25310Sstevel@tonic-gate  * band is not full.  If the band is full, return 0 (may not put message)
25320Sstevel@tonic-gate  * and set QWANTW(QB_WANTW) flag for zero(non-zero) band (caller wants to
25330Sstevel@tonic-gate  * write to the queue).
25340Sstevel@tonic-gate  */
25350Sstevel@tonic-gate int
25360Sstevel@tonic-gate bcanput(queue_t *q, unsigned char pri)
25370Sstevel@tonic-gate {
25380Sstevel@tonic-gate 	qband_t *qbp;
25390Sstevel@tonic-gate 
25400Sstevel@tonic-gate 	TRACE_2(TR_FAC_STREAMS_FR, TR_BCANPUT_IN, "bcanput:%p %p", q, pri);
25410Sstevel@tonic-gate 	if (!q)
25420Sstevel@tonic-gate 		return (0);
25430Sstevel@tonic-gate 
25440Sstevel@tonic-gate 	/* Find next forward module that has a service procedure */
25450Sstevel@tonic-gate 	q = q->q_nfsrv;
25460Sstevel@tonic-gate 
25470Sstevel@tonic-gate 	mutex_enter(QLOCK(q));
25480Sstevel@tonic-gate 	if (pri == 0) {
25490Sstevel@tonic-gate 		if (q->q_flag & QFULL) {
25500Sstevel@tonic-gate 			q->q_flag |= QWANTW;
25510Sstevel@tonic-gate 			mutex_exit(QLOCK(q));
25520Sstevel@tonic-gate 			TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT,
25536707Sbrutus 			    "bcanput:%p %X %d", q, pri, 0);
25540Sstevel@tonic-gate 			return (0);
25550Sstevel@tonic-gate 		}
25560Sstevel@tonic-gate 	} else {	/* pri != 0 */
25570Sstevel@tonic-gate 		if (pri > q->q_nband) {
25580Sstevel@tonic-gate 			/*
25590Sstevel@tonic-gate 			 * No band exists yet, so return success.
25600Sstevel@tonic-gate 			 */
25610Sstevel@tonic-gate 			mutex_exit(QLOCK(q));
25620Sstevel@tonic-gate 			TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT,
25636707Sbrutus 			    "bcanput:%p %X %d", q, pri, 1);
25640Sstevel@tonic-gate 			return (1);
25650Sstevel@tonic-gate 		}
25660Sstevel@tonic-gate 		qbp = q->q_bandp;
25670Sstevel@tonic-gate 		while (--pri)
25680Sstevel@tonic-gate 			qbp = qbp->qb_next;
25690Sstevel@tonic-gate 		if (qbp->qb_flag & QB_FULL) {
25700Sstevel@tonic-gate 			qbp->qb_flag |= QB_WANTW;
25710Sstevel@tonic-gate 			mutex_exit(QLOCK(q));
25720Sstevel@tonic-gate 			TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT,
25736707Sbrutus 			    "bcanput:%p %X %d", q, pri, 0);
25740Sstevel@tonic-gate 			return (0);
25750Sstevel@tonic-gate 		}
25760Sstevel@tonic-gate 	}
25770Sstevel@tonic-gate 	mutex_exit(QLOCK(q));
25780Sstevel@tonic-gate 	TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT,
25796707Sbrutus 	    "bcanput:%p %X %d", q, pri, 1);
25800Sstevel@tonic-gate 	return (1);
25810Sstevel@tonic-gate }
25820Sstevel@tonic-gate 
25830Sstevel@tonic-gate /*
25840Sstevel@tonic-gate  * Put a message on a queue.
25850Sstevel@tonic-gate  *
25860Sstevel@tonic-gate  * Messages are enqueued on a priority basis.  The priority classes
25870Sstevel@tonic-gate  * are HIGH PRIORITY (type >= QPCTL), PRIORITY (type < QPCTL && band > 0),
25880Sstevel@tonic-gate  * and B_NORMAL (type < QPCTL && band == 0).
25890Sstevel@tonic-gate  *
25900Sstevel@tonic-gate  * Add appropriate weighted data block sizes to queue count.
25910Sstevel@tonic-gate  * If queue hits high water mark then set QFULL flag.
25920Sstevel@tonic-gate  *
25930Sstevel@tonic-gate  * If QNOENAB is not set (putq is allowed to enable the queue),
25940Sstevel@tonic-gate  * enable the queue only if the message is PRIORITY,
25950Sstevel@tonic-gate  * or the QWANTR flag is set (indicating that the service procedure
25960Sstevel@tonic-gate  * is ready to read the queue.  This implies that a service
25970Sstevel@tonic-gate  * procedure must NEVER put a high priority message back on its own
25980Sstevel@tonic-gate  * queue, as this would result in an infinite loop (!).
25990Sstevel@tonic-gate  */
int
putq(queue_t *q, mblk_t *bp)
{
	mblk_t *tmp;
	qband_t *qbp = NULL;
	int mcls = (int)queclass(bp);	/* QPCTL, QNORM, ... message class */
	kthread_id_t freezer;
	int	bytecnt = 0, mblkcnt = 0;

	/*
	 * If this thread has frozen the stream (strqget/strqset style
	 * access) it already holds QLOCK; otherwise acquire it here and
	 * release it on every exit path below.
	 */
	freezer = STREAM(q)->sd_freezer;
	if (freezer == curthread) {
		ASSERT(frozenstr(q));
		ASSERT(MUTEX_HELD(QLOCK(q)));
	} else
		mutex_enter(QLOCK(q));

	/*
	 * Make sanity checks and if qband structure is not yet
	 * allocated, do so.
	 */
	if (mcls == QPCTL) {
		/* High-priority messages always travel in band 0. */
		if (bp->b_band != 0)
			bp->b_band = 0;		/* force to be correct */
	} else if (bp->b_band != 0) {
		int i;
		qband_t **qbpp;

		if (bp->b_band > q->q_nband) {

			/*
			 * The qband structure for this priority band is
			 * not on the queue yet, so we have to allocate
			 * one on the fly.  It would be wasteful to
			 * associate the qband structures with every
			 * queue when the queues are allocated.  This is
			 * because most queues will only need the normal
			 * band of flow which can be described entirely
			 * by the queue itself.
			 */
			qbpp = &q->q_bandp;
			while (*qbpp)
				qbpp = &(*qbpp)->qb_next;
			/*
			 * Extend the singly-linked qband list until it
			 * covers bp->b_band, inheriting the queue's
			 * water marks.  Allocation failure fails the
			 * whole putq with 0.
			 */
			while (bp->b_band > q->q_nband) {
				if ((*qbpp = allocband()) == NULL) {
					if (freezer != curthread)
						mutex_exit(QLOCK(q));
					return (0);
				}
				(*qbpp)->qb_hiwat = q->q_hiwat;
				(*qbpp)->qb_lowat = q->q_lowat;
				q->q_nband++;
				qbpp = &(*qbpp)->qb_next;
			}
		}
		ASSERT(MUTEX_HELD(QLOCK(q)));
		/* Walk the list to the qband for band bp->b_band. */
		qbp = q->q_bandp;
		i = bp->b_band;
		while (--i)
			qbp = qbp->qb_next;
	}

	/*
	 * If queue is empty, add the message and initialize the pointers.
	 * Otherwise, adjust message pointers and queue pointers based on
	 * the type of the message and where it belongs on the queue.  Some
	 * code is duplicated to minimize the number of conditionals and
	 * hopefully minimize the amount of time this routine takes.
	 */
	if (!q->q_first) {
		bp->b_next = NULL;
		bp->b_prev = NULL;
		q->q_first = bp;
		q->q_last = bp;
		if (qbp) {
			qbp->qb_first = bp;
			qbp->qb_last = bp;
		}
	} else if (!qbp) {	/* bp->b_band == 0 */

		/*
		 * If queue class of message is less than or equal to
		 * that of the last one on the queue, tack on to the end.
		 */
		tmp = q->q_last;
		if (mcls <= (int)queclass(tmp)) {
			bp->b_next = NULL;
			bp->b_prev = tmp;
			tmp->b_next = bp;
			q->q_last = bp;
		} else {
			/*
			 * Message is higher class than the tail (i.e. it
			 * is high priority); skip past all messages of
			 * equal-or-higher class from the front.
			 */
			tmp = q->q_first;
			while ((int)queclass(tmp) >= mcls)
				tmp = tmp->b_next;

			/*
			 * Insert bp before tmp.
			 */
			bp->b_next = tmp;
			bp->b_prev = tmp->b_prev;
			if (tmp->b_prev)
				tmp->b_prev->b_next = bp;
			else
				q->q_first = bp;
			tmp->b_prev = bp;
		}
	} else {		/* bp->b_band != 0 */
		if (qbp->qb_first) {
			tmp = qbp->qb_last;

			/*
			 * Insert bp after the last message in this band.
			 */
			bp->b_next = tmp->b_next;
			if (tmp->b_next)
				tmp->b_next->b_prev = bp;
			else
				q->q_last = bp;
			bp->b_prev = tmp;
			tmp->b_next = bp;
		} else {
			/*
			 * First message in this band: find the insertion
			 * point by class/band ordering.
			 */
			tmp = q->q_last;
			if ((mcls < (int)queclass(tmp)) ||
			    (bp->b_band <= tmp->b_band)) {

				/*
				 * Tack bp on end of queue.
				 */
				bp->b_next = NULL;
				bp->b_prev = tmp;
				tmp->b_next = bp;
				q->q_last = bp;
			} else {
				/* Skip high-priority messages ... */
				tmp = q->q_first;
				while (tmp->b_datap->db_type >= QPCTL)
					tmp = tmp->b_next;
				/* ... then messages of equal/higher band. */
				while (tmp->b_band >= bp->b_band)
					tmp = tmp->b_next;

				/*
				 * Insert bp before tmp.
				 */
				bp->b_next = tmp;
				bp->b_prev = tmp->b_prev;
				if (tmp->b_prev)
					tmp->b_prev->b_next = bp;
				else
					q->q_first = bp;
				tmp->b_prev = bp;
			}
			qbp->qb_first = bp;
		}
		qbp->qb_last = bp;
	}

	/* Get message byte count for q_count accounting */
	bytecnt = mp_cont_len(bp, &mblkcnt);

	/*
	 * Charge the byte and mblk counts to the band (if any) or to the
	 * queue proper, and raise the corresponding full flag when either
	 * count reaches the high-water mark.
	 */
	if (qbp) {
		qbp->qb_count += bytecnt;
		qbp->qb_mblkcnt += mblkcnt;
		if ((qbp->qb_count >= qbp->qb_hiwat) ||
		    (qbp->qb_mblkcnt >= qbp->qb_hiwat)) {
			qbp->qb_flag |= QB_FULL;
		}
	} else {
		q->q_count += bytecnt;
		q->q_mblkcnt += mblkcnt;
		if ((q->q_count >= q->q_hiwat) ||
		    (q->q_mblkcnt >= q->q_hiwat)) {
			q->q_flag |= QFULL;
		}
	}

	STR_FTEVENT_MSG(bp, q, FTEV_PUTQ, NULL);

	/*
	 * Enable the queue for high-priority messages, or -- when enabling
	 * is permitted -- when the service procedure wants data (QWANTR)
	 * or the message is banded.
	 */
	if ((mcls > QNORM) ||
	    (canenable(q) && (q->q_flag & QWANTR || bp->b_band)))
		qenable_locked(q);
	ASSERT(MUTEX_HELD(QLOCK(q)));
	if (freezer != curthread)
		mutex_exit(QLOCK(q));

	return (1);
}
27840Sstevel@tonic-gate 
27850Sstevel@tonic-gate /*
27860Sstevel@tonic-gate  * Put stuff back at beginning of Q according to priority order.
27870Sstevel@tonic-gate  * See comment on putq above for details.
27880Sstevel@tonic-gate  */
int
putbq(queue_t *q, mblk_t *bp)
{
	mblk_t *tmp;
	qband_t *qbp = NULL;
	int mcls = (int)queclass(bp);	/* QPCTL, QNORM, ... message class */
	kthread_id_t freezer;
	int	bytecnt = 0, mblkcnt = 0;

	ASSERT(q && bp);
	/* bp must not still be linked into a message list. */
	ASSERT(bp->b_next == NULL);
	/*
	 * As in putq(): if this thread froze the stream it already holds
	 * QLOCK; otherwise take it here.
	 */
	freezer = STREAM(q)->sd_freezer;
	if (freezer == curthread) {
		ASSERT(frozenstr(q));
		ASSERT(MUTEX_HELD(QLOCK(q)));
	} else
		mutex_enter(QLOCK(q));

	/*
	 * Make sanity checks and if qband structure is not yet
	 * allocated, do so.
	 */
	if (mcls == QPCTL) {
		/* High-priority messages always travel in band 0. */
		if (bp->b_band != 0)
			bp->b_band = 0;		/* force to be correct */
	} else if (bp->b_band != 0) {
		int i;
		qband_t **qbpp;

		if (bp->b_band > q->q_nband) {
			/*
			 * Extend the qband list on the fly to cover
			 * bp->b_band (same scheme as putq()).
			 */
			qbpp = &q->q_bandp;
			while (*qbpp)
				qbpp = &(*qbpp)->qb_next;
			while (bp->b_band > q->q_nband) {
				if ((*qbpp = allocband()) == NULL) {
					if (freezer != curthread)
						mutex_exit(QLOCK(q));
					return (0);
				}
				(*qbpp)->qb_hiwat = q->q_hiwat;
				(*qbpp)->qb_lowat = q->q_lowat;
				q->q_nband++;
				qbpp = &(*qbpp)->qb_next;
			}
		}
		/* Walk the list to the qband for band bp->b_band. */
		qbp = q->q_bandp;
		i = bp->b_band;
		while (--i)
			qbp = qbp->qb_next;
	}

	/*
	 * If queue is empty or if message is high priority,
	 * place on the front of the queue.
	 */
	tmp = q->q_first;
	if ((!tmp) || (mcls == QPCTL)) {
		bp->b_next = tmp;
		if (tmp)
			tmp->b_prev = bp;
		else
			q->q_last = bp;
		q->q_first = bp;
		bp->b_prev = NULL;
		if (qbp) {
			qbp->qb_first = bp;
			qbp->qb_last = bp;
		}
	} else if (qbp) {	/* bp->b_band != 0 */
		tmp = qbp->qb_first;
		if (tmp) {

			/*
			 * Insert bp before the first message in this band.
			 */
			bp->b_next = tmp;
			bp->b_prev = tmp->b_prev;
			if (tmp->b_prev)
				tmp->b_prev->b_next = bp;
			else
				q->q_first = bp;
			tmp->b_prev = bp;
		} else {
			/*
			 * Band is currently empty: find the insertion
			 * point by class/band ordering.
			 */
			tmp = q->q_last;
			if ((mcls < (int)queclass(tmp)) ||
			    (bp->b_band < tmp->b_band)) {

				/*
				 * Tack bp on end of queue.
				 */
				bp->b_next = NULL;
				bp->b_prev = tmp;
				tmp->b_next = bp;
				q->q_last = bp;
			} else {
				/* Skip high-priority messages ... */
				tmp = q->q_first;
				while (tmp->b_datap->db_type >= QPCTL)
					tmp = tmp->b_next;
				/* ... then messages of a higher band. */
				while (tmp->b_band > bp->b_band)
					tmp = tmp->b_next;

				/*
				 * Insert bp before tmp.
				 */
				bp->b_next = tmp;
				bp->b_prev = tmp->b_prev;
				if (tmp->b_prev)
					tmp->b_prev->b_next = bp;
				else
					q->q_first = bp;
				tmp->b_prev = bp;
			}
			qbp->qb_last = bp;
		}
		/* bp is now the first (possibly only) message in the band. */
		qbp->qb_first = bp;
	} else {		/* bp->b_band == 0 && !QPCTL */

		/*
		 * If the queue class or band is less than that of the last
		 * message on the queue, tack bp on the end of the queue.
		 */
		tmp = q->q_last;
		if ((mcls < (int)queclass(tmp)) || (bp->b_band < tmp->b_band)) {
			bp->b_next = NULL;
			bp->b_prev = tmp;
			tmp->b_next = bp;
			q->q_last = bp;
		} else {
			/* Skip high-priority, then higher-band messages. */
			tmp = q->q_first;
			while (tmp->b_datap->db_type >= QPCTL)
				tmp = tmp->b_next;
			while (tmp->b_band > bp->b_band)
				tmp = tmp->b_next;

			/*
			 * Insert bp before tmp.
			 */
			bp->b_next = tmp;
			bp->b_prev = tmp->b_prev;
			if (tmp->b_prev)
				tmp->b_prev->b_next = bp;
			else
				q->q_first = bp;
			tmp->b_prev = bp;
		}
	}

	/* Get message byte count for q_count accounting */
	bytecnt = mp_cont_len(bp, &mblkcnt);

	/*
	 * Charge the counts to the band or queue and raise the full flag
	 * at the high-water mark, exactly as putq() does.
	 */
	if (qbp) {
		qbp->qb_count += bytecnt;
		qbp->qb_mblkcnt += mblkcnt;
		if ((qbp->qb_count >= qbp->qb_hiwat) ||
		    (qbp->qb_mblkcnt >= qbp->qb_hiwat)) {
			qbp->qb_flag |= QB_FULL;
		}
	} else {
		q->q_count += bytecnt;
		q->q_mblkcnt += mblkcnt;
		if ((q->q_count >= q->q_hiwat) ||
		    (q->q_mblkcnt >= q->q_hiwat)) {
			q->q_flag |= QFULL;
		}
	}

	STR_FTEVENT_MSG(bp, q, FTEV_PUTBQ, NULL);

	/*
	 * Enable for high-priority messages, or when enabling is permitted
	 * and the service procedure wants data (QWANTR).
	 */
	if ((mcls > QNORM) || (canenable(q) && (q->q_flag & QWANTR)))
		qenable_locked(q);
	ASSERT(MUTEX_HELD(QLOCK(q)));
	if (freezer != curthread)
		mutex_exit(QLOCK(q));

	return (1);
}
29650Sstevel@tonic-gate 
29660Sstevel@tonic-gate /*
29670Sstevel@tonic-gate  * Insert a message before an existing message on the queue.  If the
29680Sstevel@tonic-gate  * existing message is NULL, the new messages is placed on the end of
29690Sstevel@tonic-gate  * the queue.  The queue class of the new message is ignored.  However,
29700Sstevel@tonic-gate  * the priority band of the new message must adhere to the following
29710Sstevel@tonic-gate  * ordering:
29720Sstevel@tonic-gate  *
29730Sstevel@tonic-gate  *	emp->b_prev->b_band >= mp->b_band >= emp->b_band.
29740Sstevel@tonic-gate  *
29750Sstevel@tonic-gate  * All flow control parameters are updated.
29760Sstevel@tonic-gate  *
29770Sstevel@tonic-gate  * insq can be called with the stream frozen, but other utility functions
29780Sstevel@tonic-gate  * holding QLOCK, and by streams modules without any locks/frozen.
29790Sstevel@tonic-gate  */
29800Sstevel@tonic-gate int
29810Sstevel@tonic-gate insq(queue_t *q, mblk_t *emp, mblk_t *mp)
29820Sstevel@tonic-gate {
29830Sstevel@tonic-gate 	mblk_t *tmp;
29840Sstevel@tonic-gate 	qband_t *qbp = NULL;
29850Sstevel@tonic-gate 	int mcls = (int)queclass(mp);
29860Sstevel@tonic-gate 	kthread_id_t freezer;
29870Sstevel@tonic-gate 	int	bytecnt = 0, mblkcnt = 0;
29880Sstevel@tonic-gate 
29890Sstevel@tonic-gate 	freezer = STREAM(q)->sd_freezer;
29900Sstevel@tonic-gate 	if (freezer == curthread) {
29910Sstevel@tonic-gate 		ASSERT(frozenstr(q));
29920Sstevel@tonic-gate 		ASSERT(MUTEX_HELD(QLOCK(q)));
29930Sstevel@tonic-gate 	} else if (MUTEX_HELD(QLOCK(q))) {
29940Sstevel@tonic-gate 		/* Don't drop lock on exit */
29950Sstevel@tonic-gate 		freezer = curthread;
29960Sstevel@tonic-gate 	} else
29970Sstevel@tonic-gate 		mutex_enter(QLOCK(q));
29980Sstevel@tonic-gate 
29990Sstevel@tonic-gate 	if (mcls == QPCTL) {
30000Sstevel@tonic-gate 		if (mp->b_band != 0)
30010Sstevel@tonic-gate 			mp->b_band = 0;		/* force to be correct */
30020Sstevel@tonic-gate 		if (emp && emp->b_prev &&
30030Sstevel@tonic-gate 		    (emp->b_prev->b_datap->db_type < QPCTL))
30040Sstevel@tonic-gate 			goto badord;
30050Sstevel@tonic-gate 	}
30060Sstevel@tonic-gate 	if (emp) {
30070Sstevel@tonic-gate 		if (((mcls == QNORM) && (mp->b_band < emp->b_band)) ||
30080Sstevel@tonic-gate 		    (emp->b_prev && (emp->b_prev->b_datap->db_type < QPCTL) &&
30090Sstevel@tonic-gate 		    (emp->b_prev->b_band < mp->b_band))) {
30100Sstevel@tonic-gate 			goto badord;
30110Sstevel@tonic-gate 		}
30120Sstevel@tonic-gate 	} else {
30130Sstevel@tonic-gate 		tmp = q->q_last;
30140Sstevel@tonic-gate 		if (tmp && (mcls == QNORM) && (mp->b_band > tmp->b_band)) {
30150Sstevel@tonic-gate badord:
30160Sstevel@tonic-gate 			cmn_err(CE_WARN,
30170Sstevel@tonic-gate 			    "insq: attempt to insert message out of order "
30180Sstevel@tonic-gate 			    "on q %p", (void *)q);
30190Sstevel@tonic-gate 			if (freezer != curthread)
30200Sstevel@tonic-gate 				mutex_exit(QLOCK(q));
30210Sstevel@tonic-gate 			return (0);
30220Sstevel@tonic-gate 		}
30230Sstevel@tonic-gate 	}
30240Sstevel@tonic-gate 
30250Sstevel@tonic-gate 	if (mp->b_band != 0) {
30260Sstevel@tonic-gate 		int i;
30270Sstevel@tonic-gate 		qband_t **qbpp;
30280Sstevel@tonic-gate 
30290Sstevel@tonic-gate 		if (mp->b_band > q->q_nband) {
30300Sstevel@tonic-gate 			qbpp = &q->q_bandp;
30310Sstevel@tonic-gate 			while (*qbpp)
30320Sstevel@tonic-gate 				qbpp = &(*qbpp)->qb_next;
30330Sstevel@tonic-gate 			while (mp->b_band > q->q_nband) {
30340Sstevel@tonic-gate 				if ((*qbpp = allocband()) == NULL) {
30350Sstevel@tonic-gate 					if (freezer != curthread)
30360Sstevel@tonic-gate 						mutex_exit(QLOCK(q));
30370Sstevel@tonic-gate 					return (0);
30380Sstevel@tonic-gate 				}
30390Sstevel@tonic-gate 				(*qbpp)->qb_hiwat = q->q_hiwat;
30400Sstevel@tonic-gate 				(*qbpp)->qb_lowat = q->q_lowat;
30410Sstevel@tonic-gate 				q->q_nband++;
30420Sstevel@tonic-gate 				qbpp = &(*qbpp)->qb_next;
30430Sstevel@tonic-gate 			}
30440Sstevel@tonic-gate 		}
30450Sstevel@tonic-gate 		qbp = q->q_bandp;
30460Sstevel@tonic-gate 		i = mp->b_band;
30470Sstevel@tonic-gate 		while (--i)
30480Sstevel@tonic-gate 			qbp = qbp->qb_next;
30490Sstevel@tonic-gate 	}
30500Sstevel@tonic-gate 
30510Sstevel@tonic-gate 	if ((mp->b_next = emp) != NULL) {
30520Sstevel@tonic-gate 		if ((mp->b_prev = emp->b_prev) != NULL)
30530Sstevel@tonic-gate 			emp->b_prev->b_next = mp;
30540Sstevel@tonic-gate 		else
30550Sstevel@tonic-gate 			q->q_first = mp;
30560Sstevel@tonic-gate 		emp->b_prev = mp;
30570Sstevel@tonic-gate 	} else {
30580Sstevel@tonic-gate 		if ((mp->b_prev = q->q_last) != NULL)
30590Sstevel@tonic-gate 			q->q_last->b_next = mp;
30600Sstevel@tonic-gate 		else
30610Sstevel@tonic-gate 			q->q_first = mp;
30620Sstevel@tonic-gate 		q->q_last = mp;
30630Sstevel@tonic-gate 	}
30640Sstevel@tonic-gate 
30650Sstevel@tonic-gate 	/* Get mblk and byte count for q_count accounting */
30666769Sja97890 	bytecnt = mp_cont_len(mp, &mblkcnt);
30670Sstevel@tonic-gate 
30680Sstevel@tonic-gate 	if (qbp) {	/* adjust qband pointers and count */
30690Sstevel@tonic-gate 		if (!qbp->qb_first) {
30700Sstevel@tonic-gate 			qbp->qb_first = mp;
30710Sstevel@tonic-gate 			qbp->qb_last = mp;
30720Sstevel@tonic-gate 		} else {
30730Sstevel@tonic-gate 			if (mp->b_prev == NULL || (mp->b_prev != NULL &&
30740Sstevel@tonic-gate 			    (mp->b_prev->b_band != mp->b_band)))
30750Sstevel@tonic-gate 				qbp->qb_first = mp;
30760Sstevel@tonic-gate 			else if (mp->b_next == NULL || (mp->b_next != NULL &&
30770Sstevel@tonic-gate 			    (mp->b_next->b_band != mp->b_band)))
30780Sstevel@tonic-gate 				qbp->qb_last = mp;
30790Sstevel@tonic-gate 		}
30800Sstevel@tonic-gate 		qbp->qb_count += bytecnt;
30810Sstevel@tonic-gate 		qbp->qb_mblkcnt += mblkcnt;
30820Sstevel@tonic-gate 		if ((qbp->qb_count >= qbp->qb_hiwat) ||
30830Sstevel@tonic-gate 		    (qbp->qb_mblkcnt >= qbp->qb_hiwat)) {
30840Sstevel@tonic-gate 			qbp->qb_flag |= QB_FULL;
30850Sstevel@tonic-gate 		}
30860Sstevel@tonic-gate 	} else {
30870Sstevel@tonic-gate 		q->q_count += bytecnt;
30880Sstevel@tonic-gate 		q->q_mblkcnt += mblkcnt;
30890Sstevel@tonic-gate 		if ((q->q_count >= q->q_hiwat) ||
30900Sstevel@tonic-gate 		    (q->q_mblkcnt >= q->q_hiwat)) {
30910Sstevel@tonic-gate 			q->q_flag |= QFULL;
30920Sstevel@tonic-gate 		}
30930Sstevel@tonic-gate 	}
30940Sstevel@tonic-gate 
30950Sstevel@tonic-gate 	STR_FTEVENT_MSG(mp, q, FTEV_INSQ, NULL);
30960Sstevel@tonic-gate 
30970Sstevel@tonic-gate 	if (canenable(q) && (q->q_flag & QWANTR))
30980Sstevel@tonic-gate 		qenable_locked(q);
30990Sstevel@tonic-gate 
31000Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(QLOCK(q)));
31010Sstevel@tonic-gate 	if (freezer != curthread)
31020Sstevel@tonic-gate 		mutex_exit(QLOCK(q));
31030Sstevel@tonic-gate 
31040Sstevel@tonic-gate 	return (1);
31050Sstevel@tonic-gate }
31060Sstevel@tonic-gate 
31070Sstevel@tonic-gate /*
31080Sstevel@tonic-gate  * Create and put a control message on queue.
31090Sstevel@tonic-gate  */
31100Sstevel@tonic-gate int
31110Sstevel@tonic-gate putctl(queue_t *q, int type)
31120Sstevel@tonic-gate {
31130Sstevel@tonic-gate 	mblk_t *bp;
31140Sstevel@tonic-gate 
31150Sstevel@tonic-gate 	if ((datamsg(type) && (type != M_DELAY)) ||
31160Sstevel@tonic-gate 	    (bp = allocb_tryhard(0)) == NULL)
31170Sstevel@tonic-gate 		return (0);
31180Sstevel@tonic-gate 	bp->b_datap->db_type = (unsigned char) type;
31190Sstevel@tonic-gate 
31200Sstevel@tonic-gate 	put(q, bp);
31210Sstevel@tonic-gate 
31220Sstevel@tonic-gate 	return (1);
31230Sstevel@tonic-gate }
31240Sstevel@tonic-gate 
31250Sstevel@tonic-gate /*
31260Sstevel@tonic-gate  * Control message with a single-byte parameter
31270Sstevel@tonic-gate  */
31280Sstevel@tonic-gate int
31290Sstevel@tonic-gate putctl1(queue_t *q, int type, int param)
31300Sstevel@tonic-gate {
31310Sstevel@tonic-gate 	mblk_t *bp;
31320Sstevel@tonic-gate 
31330Sstevel@tonic-gate 	if ((datamsg(type) && (type != M_DELAY)) ||
31340Sstevel@tonic-gate 	    (bp = allocb_tryhard(1)) == NULL)
31350Sstevel@tonic-gate 		return (0);
31360Sstevel@tonic-gate 	bp->b_datap->db_type = (unsigned char)type;
31370Sstevel@tonic-gate 	*bp->b_wptr++ = (unsigned char)param;
31380Sstevel@tonic-gate 
31390Sstevel@tonic-gate 	put(q, bp);
31400Sstevel@tonic-gate 
31410Sstevel@tonic-gate 	return (1);
31420Sstevel@tonic-gate }
31430Sstevel@tonic-gate 
31440Sstevel@tonic-gate int
31450Sstevel@tonic-gate putnextctl1(queue_t *q, int type, int param)
31460Sstevel@tonic-gate {
31470Sstevel@tonic-gate 	mblk_t *bp;
31480Sstevel@tonic-gate 
31490Sstevel@tonic-gate 	if ((datamsg(type) && (type != M_DELAY)) ||
31506707Sbrutus 	    ((bp = allocb_tryhard(1)) == NULL))
31510Sstevel@tonic-gate 		return (0);
31520Sstevel@tonic-gate 
31530Sstevel@tonic-gate 	bp->b_datap->db_type = (unsigned char)type;
31540Sstevel@tonic-gate 	*bp->b_wptr++ = (unsigned char)param;
31550Sstevel@tonic-gate 
31560Sstevel@tonic-gate 	putnext(q, bp);
31570Sstevel@tonic-gate 
31580Sstevel@tonic-gate 	return (1);
31590Sstevel@tonic-gate }
31600Sstevel@tonic-gate 
31610Sstevel@tonic-gate int
31620Sstevel@tonic-gate putnextctl(queue_t *q, int type)
31630Sstevel@tonic-gate {
31640Sstevel@tonic-gate 	mblk_t *bp;
31650Sstevel@tonic-gate 
31660Sstevel@tonic-gate 	if ((datamsg(type) && (type != M_DELAY)) ||
31676707Sbrutus 	    ((bp = allocb_tryhard(0)) == NULL))
31680Sstevel@tonic-gate 		return (0);
31690Sstevel@tonic-gate 	bp->b_datap->db_type = (unsigned char)type;
31700Sstevel@tonic-gate 
31710Sstevel@tonic-gate 	putnext(q, bp);
31720Sstevel@tonic-gate 
31730Sstevel@tonic-gate 	return (1);
31740Sstevel@tonic-gate }
31750Sstevel@tonic-gate 
31760Sstevel@tonic-gate /*
31770Sstevel@tonic-gate  * Return the queue upstream from this one
31780Sstevel@tonic-gate  */
31790Sstevel@tonic-gate queue_t *
31800Sstevel@tonic-gate backq(queue_t *q)
31810Sstevel@tonic-gate {
31820Sstevel@tonic-gate 	q = _OTHERQ(q);
31830Sstevel@tonic-gate 	if (q->q_next) {
31840Sstevel@tonic-gate 		q = q->q_next;
31850Sstevel@tonic-gate 		return (_OTHERQ(q));
31860Sstevel@tonic-gate 	}
31870Sstevel@tonic-gate 	return (NULL);
31880Sstevel@tonic-gate }
31890Sstevel@tonic-gate 
31900Sstevel@tonic-gate /*
31910Sstevel@tonic-gate  * Send a block back up the queue in reverse from this
31920Sstevel@tonic-gate  * one (e.g. to respond to ioctls)
31930Sstevel@tonic-gate  */
/*
 * Send a block back up the queue in reverse from this
 * one (e.g. to respond to ioctls): deliver bp to the queue
 * following this queue's partner.
 */
void
qreply(queue_t *q, mblk_t *bp)
{
	ASSERT(q && bp);

	putnext(_OTHERQ(q), bp);
}
32010Sstevel@tonic-gate 
32020Sstevel@tonic-gate /*
32030Sstevel@tonic-gate  * Streams Queue Scheduling
32040Sstevel@tonic-gate  *
32050Sstevel@tonic-gate  * Queues are enabled through qenable() when they have messages to
32060Sstevel@tonic-gate  * process.  They are serviced by queuerun(), which runs each enabled
32070Sstevel@tonic-gate  * queue's service procedure.  The call to queuerun() is processor
32080Sstevel@tonic-gate  * dependent - the general principle is that it be run whenever a queue
32090Sstevel@tonic-gate  * is enabled but before returning to user level.  For system calls,
32100Sstevel@tonic-gate  * the function runqueues() is called if their action causes a queue
32110Sstevel@tonic-gate  * to be enabled.  For device interrupts, queuerun() should be
32120Sstevel@tonic-gate  * called before returning from the last level of interrupt.  Beyond
32130Sstevel@tonic-gate  * this, no timing assumptions should be made about queue scheduling.
32140Sstevel@tonic-gate  */
32150Sstevel@tonic-gate 
32160Sstevel@tonic-gate /*
32170Sstevel@tonic-gate  * Enable a queue: put it on list of those whose service procedures are
32180Sstevel@tonic-gate  * ready to run and set up the scheduling mechanism.
32190Sstevel@tonic-gate  * The broadcast is done outside the mutex -> to avoid the woken thread
32200Sstevel@tonic-gate  * from contending with the mutex. This is OK 'cos the queue has been
32210Sstevel@tonic-gate  * enqueued on the runlist and flagged safely at this point.
32220Sstevel@tonic-gate  */
/*
 * Enable a queue's service procedure: simply the locked variant
 * wrapped with QLOCK acquisition (see qenable_locked()).
 */
void
qenable(queue_t *q)
{
	mutex_enter(QLOCK(q));
	qenable_locked(q);
	mutex_exit(QLOCK(q));
}
32300Sstevel@tonic-gate /*
32310Sstevel@tonic-gate  * Return number of messages on queue
32320Sstevel@tonic-gate  */
32330Sstevel@tonic-gate int
32340Sstevel@tonic-gate qsize(queue_t *qp)
32350Sstevel@tonic-gate {
32360Sstevel@tonic-gate 	int count = 0;
32370Sstevel@tonic-gate 	mblk_t *mp;
32380Sstevel@tonic-gate 
32390Sstevel@tonic-gate 	mutex_enter(QLOCK(qp));
32400Sstevel@tonic-gate 	for (mp = qp->q_first; mp; mp = mp->b_next)
32410Sstevel@tonic-gate 		count++;
32420Sstevel@tonic-gate 	mutex_exit(QLOCK(qp));
32430Sstevel@tonic-gate 	return (count);
32440Sstevel@tonic-gate }
32450Sstevel@tonic-gate 
32460Sstevel@tonic-gate /*
32470Sstevel@tonic-gate  * noenable - set queue so that putq() will not enable it.
32480Sstevel@tonic-gate  * enableok - set queue so that putq() can enable it.
32490Sstevel@tonic-gate  */
/*
 * noenable - set QNOENB under QLOCK so that putq() will not enable
 * this queue (see canenable() usage in putq()/putbq()).
 */
void
noenable(queue_t *q)
{
	mutex_enter(QLOCK(q));
	q->q_flag |= QNOENB;
	mutex_exit(QLOCK(q));
}
32570Sstevel@tonic-gate 
/*
 * enableok - clear QNOENB under QLOCK so that putq() can enable
 * this queue again.
 */
void
enableok(queue_t *q)
{
	mutex_enter(QLOCK(q));
	q->q_flag &= ~QNOENB;
	mutex_exit(QLOCK(q));
}
32650Sstevel@tonic-gate 
32660Sstevel@tonic-gate /*
32670Sstevel@tonic-gate  * Set queue fields.
32680Sstevel@tonic-gate  */
32690Sstevel@tonic-gate int
32700Sstevel@tonic-gate strqset(queue_t *q, qfields_t what, unsigned char pri, intptr_t val)
32710Sstevel@tonic-gate {
32720Sstevel@tonic-gate 	qband_t *qbp = NULL;
32730Sstevel@tonic-gate 	queue_t	*wrq;
32740Sstevel@tonic-gate 	int error = 0;
32750Sstevel@tonic-gate 	kthread_id_t freezer;
32760Sstevel@tonic-gate 
32770Sstevel@tonic-gate 	freezer = STREAM(q)->sd_freezer;
32780Sstevel@tonic-gate 	if (freezer == curthread) {
32790Sstevel@tonic-gate 		ASSERT(frozenstr(q));
32800Sstevel@tonic-gate 		ASSERT(MUTEX_HELD(QLOCK(q)));
32810Sstevel@tonic-gate 	} else
32820Sstevel@tonic-gate 		mutex_enter(QLOCK(q));
32830Sstevel@tonic-gate 
32840Sstevel@tonic-gate 	if (what >= QBAD) {
32850Sstevel@tonic-gate 		error = EINVAL;
32860Sstevel@tonic-gate 		goto done;
32870Sstevel@tonic-gate 	}
32880Sstevel@tonic-gate 	if (pri != 0) {
32890Sstevel@tonic-gate 		int i;
32900Sstevel@tonic-gate 		qband_t **qbpp;
32910Sstevel@tonic-gate 
32920Sstevel@tonic-gate 		if (pri > q->q_nband) {
32930Sstevel@tonic-gate 			qbpp = &q->q_bandp;
32940Sstevel@tonic-gate 			while (*qbpp)
32950Sstevel@tonic-gate 				qbpp = &(*qbpp)->qb_next;
32960Sstevel@tonic-gate 			while (pri > q->q_nband) {
32970Sstevel@tonic-gate 				if ((*qbpp = allocband()) == NULL) {
32980Sstevel@tonic-gate 					error = EAGAIN;
32990Sstevel@tonic-gate 					goto done;
33000Sstevel@tonic-gate 				}
33010Sstevel@tonic-gate 				(*qbpp)->qb_hiwat = q->q_hiwat;
33020Sstevel@tonic-gate 				(*qbpp)->qb_lowat = q->q_lowat;
33030Sstevel@tonic-gate 				q->q_nband++;
33040Sstevel@tonic-gate 				qbpp = &(*qbpp)->qb_next;
33050Sstevel@tonic-gate 			}
33060Sstevel@tonic-gate 		}
33070Sstevel@tonic-gate 		qbp = q->q_bandp;
33080Sstevel@tonic-gate 		i = pri;
33090Sstevel@tonic-gate 		while (--i)
33100Sstevel@tonic-gate 			qbp = qbp->qb_next;
33110Sstevel@tonic-gate 	}
33120Sstevel@tonic-gate 	switch (what) {
33130Sstevel@tonic-gate 
33140Sstevel@tonic-gate 	case QHIWAT:
33150Sstevel@tonic-gate 		if (qbp)
33160Sstevel@tonic-gate 			qbp->qb_hiwat = (size_t)val;
33170Sstevel@tonic-gate 		else
33180Sstevel@tonic-gate 			q->q_hiwat = (size_t)val;
33190Sstevel@tonic-gate 		break;
33200Sstevel@tonic-gate 
33210Sstevel@tonic-gate 	case QLOWAT:
33220Sstevel@tonic-gate 		if (qbp)
33230Sstevel@tonic-gate 			qbp->qb_lowat = (size_t)val;
33240Sstevel@tonic-gate 		else
33250Sstevel@tonic-gate 			q->q_lowat = (size_t)val;
33260Sstevel@tonic-gate 		break;
33270Sstevel@tonic-gate 
33280Sstevel@tonic-gate 	case QMAXPSZ:
33290Sstevel@tonic-gate 		if (qbp)
33300Sstevel@tonic-gate 			error = EINVAL;
33310Sstevel@tonic-gate 		else
33320Sstevel@tonic-gate 			q->q_maxpsz = (ssize_t)val;
33330Sstevel@tonic-gate 
33340Sstevel@tonic-gate 		/*
33350Sstevel@tonic-gate 		 * Performance concern, strwrite looks at the module below
33360Sstevel@tonic-gate 		 * the stream head for the maxpsz each time it does a write
33370Sstevel@tonic-gate 		 * we now cache it at the stream head.  Check to see if this
33380Sstevel@tonic-gate 		 * queue is sitting directly below the stream head.
33390Sstevel@tonic-gate 		 */
33400Sstevel@tonic-gate 		wrq = STREAM(q)->sd_wrq;
33410Sstevel@tonic-gate 		if (q != wrq->q_next)
33420Sstevel@tonic-gate 			break;
33430Sstevel@tonic-gate 
33440Sstevel@tonic-gate 		/*
33450Sstevel@tonic-gate 		 * If the stream is not frozen drop the current QLOCK and
33460Sstevel@tonic-gate 		 * acquire the sd_wrq QLOCK which protects sd_qn_*
33470Sstevel@tonic-gate 		 */
33480Sstevel@tonic-gate 		if (freezer != curthread) {
33490Sstevel@tonic-gate 			mutex_exit(QLOCK(q));
33500Sstevel@tonic-gate 			mutex_enter(QLOCK(wrq));
33510Sstevel@tonic-gate 		}
33520Sstevel@tonic-gate 		ASSERT(MUTEX_HELD(QLOCK(wrq)));
33530Sstevel@tonic-gate 
33540Sstevel@tonic-gate 		if (strmsgsz != 0) {
33550Sstevel@tonic-gate 			if (val == INFPSZ)
33560Sstevel@tonic-gate 				val = strmsgsz;
33570Sstevel@tonic-gate 			else  {
33580Sstevel@tonic-gate 				if (STREAM(q)->sd_vnode->v_type == VFIFO)
33590Sstevel@tonic-gate 					val = MIN(PIPE_BUF, val);
33600Sstevel@tonic-gate 				else
33610Sstevel@tonic-gate 					val = MIN(strmsgsz, val);
33620Sstevel@tonic-gate 			}
33630Sstevel@tonic-gate 		}
33640Sstevel@tonic-gate 		STREAM(q)->sd_qn_maxpsz = val;
33650Sstevel@tonic-gate 		if (freezer != curthread) {
33660Sstevel@tonic-gate 			mutex_exit(QLOCK(wrq));
33670Sstevel@tonic-gate 			mutex_enter(QLOCK(q));
33680Sstevel@tonic-gate 		}
33690Sstevel@tonic-gate 		break;
33700Sstevel@tonic-gate 
33710Sstevel@tonic-gate 	case QMINPSZ:
33720Sstevel@tonic-gate 		if (qbp)
33730Sstevel@tonic-gate 			error = EINVAL;
33740Sstevel@tonic-gate 		else
33750Sstevel@tonic-gate 			q->q_minpsz = (ssize_t)val;
33760Sstevel@tonic-gate 
33770Sstevel@tonic-gate 		/*
33780Sstevel@tonic-gate 		 * Performance concern, strwrite looks at the module below
33790Sstevel@tonic-gate 		 * the stream head for the maxpsz each time it does a write
33800Sstevel@tonic-gate 		 * we now cache it at the stream head.  Check to see if this
33810Sstevel@tonic-gate 		 * queue is sitting directly below the stream head.
33820Sstevel@tonic-gate 		 */
33830Sstevel@tonic-gate 		wrq = STREAM(q)->sd_wrq;
33840Sstevel@tonic-gate 		if (q != wrq->q_next)
33850Sstevel@tonic-gate 			break;
33860Sstevel@tonic-gate 
33870Sstevel@tonic-gate 		/*
33880Sstevel@tonic-gate 		 * If the stream is not frozen drop the current QLOCK and
33890Sstevel@tonic-gate 		 * acquire the sd_wrq QLOCK which protects sd_qn_*
33900Sstevel@tonic-gate 		 */
33910Sstevel@tonic-gate 		if (freezer != curthread) {
33920Sstevel@tonic-gate 			mutex_exit(QLOCK(q));
33930Sstevel@tonic-gate 			mutex_enter(QLOCK(wrq));
33940Sstevel@tonic-gate 		}
33950Sstevel@tonic-gate 		STREAM(q)->sd_qn_minpsz = (ssize_t)val;
33960Sstevel@tonic-gate 
33970Sstevel@tonic-gate 		if (freezer != curthread) {
33980Sstevel@tonic-gate 			mutex_exit(QLOCK(wrq));
33990Sstevel@tonic-gate 			mutex_enter(QLOCK(q));
34000Sstevel@tonic-gate 		}
34010Sstevel@tonic-gate 		break;
34020Sstevel@tonic-gate 
34030Sstevel@tonic-gate 	case QSTRUIOT:
34040Sstevel@tonic-gate 		if (qbp)
34050Sstevel@tonic-gate 			error = EINVAL;
34060Sstevel@tonic-gate 		else
34070Sstevel@tonic-gate 			q->q_struiot = (ushort_t)val;
34080Sstevel@tonic-gate 		break;
34090Sstevel@tonic-gate 
34100Sstevel@tonic-gate 	case QCOUNT:
34110Sstevel@tonic-gate 	case QFIRST:
34120Sstevel@tonic-gate 	case QLAST:
34130Sstevel@tonic-gate 	case QFLAG:
34140Sstevel@tonic-gate 		error = EPERM;
34150Sstevel@tonic-gate 		break;
34160Sstevel@tonic-gate 
34170Sstevel@tonic-gate 	default:
34180Sstevel@tonic-gate 		error = EINVAL;
34190Sstevel@tonic-gate 		break;
34200Sstevel@tonic-gate 	}
34210Sstevel@tonic-gate done:
34220Sstevel@tonic-gate 	if (freezer != curthread)
34230Sstevel@tonic-gate 		mutex_exit(QLOCK(q));
34240Sstevel@tonic-gate 	return (error);
34250Sstevel@tonic-gate }
34260Sstevel@tonic-gate 
34270Sstevel@tonic-gate /*
34280Sstevel@tonic-gate  * Get queue fields.
34290Sstevel@tonic-gate  */
34300Sstevel@tonic-gate int
34310Sstevel@tonic-gate strqget(queue_t *q, qfields_t what, unsigned char pri, void *valp)
34320Sstevel@tonic-gate {
34330Sstevel@tonic-gate 	qband_t 	*qbp = NULL;
34340Sstevel@tonic-gate 	int 		error = 0;
34350Sstevel@tonic-gate 	kthread_id_t 	freezer;
34360Sstevel@tonic-gate 
34370Sstevel@tonic-gate 	freezer = STREAM(q)->sd_freezer;
34380Sstevel@tonic-gate 	if (freezer == curthread) {
34390Sstevel@tonic-gate 		ASSERT(frozenstr(q));
34400Sstevel@tonic-gate 		ASSERT(MUTEX_HELD(QLOCK(q)));
34410Sstevel@tonic-gate 	} else
34420Sstevel@tonic-gate 		mutex_enter(QLOCK(q));
34430Sstevel@tonic-gate 	if (what >= QBAD) {
34440Sstevel@tonic-gate 		error = EINVAL;
34450Sstevel@tonic-gate 		goto done;
34460Sstevel@tonic-gate 	}
34470Sstevel@tonic-gate 	if (pri != 0) {
34480Sstevel@tonic-gate 		int i;
34490Sstevel@tonic-gate 		qband_t **qbpp;
34500Sstevel@tonic-gate 
34510Sstevel@tonic-gate 		if (pri > q->q_nband) {
34520Sstevel@tonic-gate 			qbpp = &q->q_bandp;
34530Sstevel@tonic-gate 			while (*qbpp)
34540Sstevel@tonic-gate 				qbpp = &(*qbpp)->qb_next;
34550Sstevel@tonic-gate 			while (pri > q->q_nband) {
34560Sstevel@tonic-gate 				if ((*qbpp = allocband()) == NULL) {
34570Sstevel@tonic-gate 					error = EAGAIN;
34580Sstevel@tonic-gate 					goto done;
34590Sstevel@tonic-gate 				}
34600Sstevel@tonic-gate 				(*qbpp)->qb_hiwat = q->q_hiwat;
34610Sstevel@tonic-gate 				(*qbpp)->qb_lowat = q->q_lowat;
34620Sstevel@tonic-gate 				q->q_nband++;
34630Sstevel@tonic-gate 				qbpp = &(*qbpp)->qb_next;
34640Sstevel@tonic-gate 			}
34650Sstevel@tonic-gate 		}
34660Sstevel@tonic-gate 		qbp = q->q_bandp;
34670Sstevel@tonic-gate 		i = pri;
34680Sstevel@tonic-gate 		while (--i)
34690Sstevel@tonic-gate 			qbp = qbp->qb_next;
34700Sstevel@tonic-gate 	}
34710Sstevel@tonic-gate 	switch (what) {
34720Sstevel@tonic-gate 	case QHIWAT:
34730Sstevel@tonic-gate 		if (qbp)
34740Sstevel@tonic-gate 			*(size_t *)valp = qbp->qb_hiwat;
34750Sstevel@tonic-gate 		else
34760Sstevel@tonic-gate 			*(size_t *)valp = q->q_hiwat;
34770Sstevel@tonic-gate 		break;
34780Sstevel@tonic-gate 
34790Sstevel@tonic-gate 	case QLOWAT:
34800Sstevel@tonic-gate 		if (qbp)
34810Sstevel@tonic-gate 			*(size_t *)valp = qbp->qb_lowat;
34820Sstevel@tonic-gate 		else
34830Sstevel@tonic-gate 			*(size_t *)valp = q->q_lowat;
34840Sstevel@tonic-gate 		break;
34850Sstevel@tonic-gate 
34860Sstevel@tonic-gate 	case QMAXPSZ:
34870Sstevel@tonic-gate 		if (qbp)
34880Sstevel@tonic-gate 			error = EINVAL;
34890Sstevel@tonic-gate 		else
34900Sstevel@tonic-gate 			*(ssize_t *)valp = q->q_maxpsz;
34910Sstevel@tonic-gate 		break;
34920Sstevel@tonic-gate 
34930Sstevel@tonic-gate 	case QMINPSZ:
34940Sstevel@tonic-gate 		if (qbp)
34950Sstevel@tonic-gate 			error = EINVAL;
34960Sstevel@tonic-gate 		else
34970Sstevel@tonic-gate 			*(ssize_t *)valp = q->q_minpsz;
34980Sstevel@tonic-gate 		break;
34990Sstevel@tonic-gate 
35000Sstevel@tonic-gate 	case QCOUNT:
35010Sstevel@tonic-gate 		if (qbp)
35020Sstevel@tonic-gate 			*(size_t *)valp = qbp->qb_count;
35030Sstevel@tonic-gate 		else
35040Sstevel@tonic-gate 			*(size_t *)valp = q->q_count;
35050Sstevel@tonic-gate 		break;
35060Sstevel@tonic-gate 
35070Sstevel@tonic-gate 	case QFIRST:
35080Sstevel@tonic-gate 		if (qbp)
35090Sstevel@tonic-gate 			*(mblk_t **)valp = qbp->qb_first;
35100Sstevel@tonic-gate 		else
35110Sstevel@tonic-gate 			*(mblk_t **)valp = q->q_first;
35120Sstevel@tonic-gate 		break;
35130Sstevel@tonic-gate 
35140Sstevel@tonic-gate 	case QLAST:
35150Sstevel@tonic-gate 		if (qbp)
35160Sstevel@tonic-gate 			*(mblk_t **)valp = qbp->qb_last;
35170Sstevel@tonic-gate 		else
35180Sstevel@tonic-gate 			*(mblk_t **)valp = q->q_last;
35190Sstevel@tonic-gate 		break;
35200Sstevel@tonic-gate 
35210Sstevel@tonic-gate 	case QFLAG:
35220Sstevel@tonic-gate 		if (qbp)
35230Sstevel@tonic-gate 			*(uint_t *)valp = qbp->qb_flag;
35240Sstevel@tonic-gate 		else
35250Sstevel@tonic-gate 			*(uint_t *)valp = q->q_flag;
35260Sstevel@tonic-gate 		break;
35270Sstevel@tonic-gate 
35280Sstevel@tonic-gate 	case QSTRUIOT:
35290Sstevel@tonic-gate 		if (qbp)
35300Sstevel@tonic-gate 			error = EINVAL;
35310Sstevel@tonic-gate 		else
35320Sstevel@tonic-gate 			*(short *)valp = q->q_struiot;
35330Sstevel@tonic-gate 		break;
35340Sstevel@tonic-gate 
35350Sstevel@tonic-gate 	default:
35360Sstevel@tonic-gate 		error = EINVAL;
35370Sstevel@tonic-gate 		break;
35380Sstevel@tonic-gate 	}
35390Sstevel@tonic-gate done:
35400Sstevel@tonic-gate 	if (freezer != curthread)
35410Sstevel@tonic-gate 		mutex_exit(QLOCK(q));
35420Sstevel@tonic-gate 	return (error);
35430Sstevel@tonic-gate }
35440Sstevel@tonic-gate 
35450Sstevel@tonic-gate /*
35460Sstevel@tonic-gate  * Function awakes all in cvwait/sigwait/pollwait, on one of:
35470Sstevel@tonic-gate  *	QWANTWSYNC or QWANTR or QWANTW,
35480Sstevel@tonic-gate  *
35490Sstevel@tonic-gate  * Note: for QWANTWSYNC/QWANTW and QWANTR, if no WSLEEPer or RSLEEPer then a
35500Sstevel@tonic-gate  *	 deferred wakeup will be done. Also if strpoll() in progress then a
35510Sstevel@tonic-gate  *	 deferred pollwakeup will be done.
35520Sstevel@tonic-gate  */
35530Sstevel@tonic-gate void
35540Sstevel@tonic-gate strwakeq(queue_t *q, int flag)
35550Sstevel@tonic-gate {
35560Sstevel@tonic-gate 	stdata_t 	*stp = STREAM(q);
35570Sstevel@tonic-gate 	pollhead_t 	*pl;
35580Sstevel@tonic-gate 
35590Sstevel@tonic-gate 	mutex_enter(&stp->sd_lock);
35600Sstevel@tonic-gate 	pl = &stp->sd_pollist;
35610Sstevel@tonic-gate 	if (flag & QWANTWSYNC) {
35620Sstevel@tonic-gate 		ASSERT(!(q->q_flag & QREADR));
35630Sstevel@tonic-gate 		if (stp->sd_flag & WSLEEP) {
35640Sstevel@tonic-gate 			stp->sd_flag &= ~WSLEEP;
35650Sstevel@tonic-gate 			cv_broadcast(&stp->sd_wrq->q_wait);
35660Sstevel@tonic-gate 		} else {
35670Sstevel@tonic-gate 			stp->sd_wakeq |= WSLEEP;
35680Sstevel@tonic-gate 		}
35690Sstevel@tonic-gate 
35700Sstevel@tonic-gate 		mutex_exit(&stp->sd_lock);
35710Sstevel@tonic-gate 		pollwakeup(pl, POLLWRNORM);
35720Sstevel@tonic-gate 		mutex_enter(&stp->sd_lock);
35730Sstevel@tonic-gate 
35740Sstevel@tonic-gate 		if (stp->sd_sigflags & S_WRNORM)
35750Sstevel@tonic-gate 			strsendsig(stp->sd_siglist, S_WRNORM, 0, 0);
35760Sstevel@tonic-gate 	} else if (flag & QWANTR) {
35770Sstevel@tonic-gate 		if (stp->sd_flag & RSLEEP) {
35780Sstevel@tonic-gate 			stp->sd_flag &= ~RSLEEP;
35790Sstevel@tonic-gate 			cv_broadcast(&_RD(stp->sd_wrq)->q_wait);
35800Sstevel@tonic-gate 		} else {
35810Sstevel@tonic-gate 			stp->sd_wakeq |= RSLEEP;
35820Sstevel@tonic-gate 		}
35830Sstevel@tonic-gate 
35840Sstevel@tonic-gate 		mutex_exit(&stp->sd_lock);
35850Sstevel@tonic-gate 		pollwakeup(pl, POLLIN | POLLRDNORM);
35860Sstevel@tonic-gate 		mutex_enter(&stp->sd_lock);
35870Sstevel@tonic-gate 
35880Sstevel@tonic-gate 		{
35890Sstevel@tonic-gate 			int events = stp->sd_sigflags & (S_INPUT | S_RDNORM);
35900Sstevel@tonic-gate 
35910Sstevel@tonic-gate 			if (events)
35920Sstevel@tonic-gate 				strsendsig(stp->sd_siglist, events, 0, 0);
35930Sstevel@tonic-gate 		}
35940Sstevel@tonic-gate 	} else {
35950Sstevel@tonic-gate 		if (stp->sd_flag & WSLEEP) {
35960Sstevel@tonic-gate 			stp->sd_flag &= ~WSLEEP;
35970Sstevel@tonic-gate 			cv_broadcast(&stp->sd_wrq->q_wait);
35980Sstevel@tonic-gate 		}
35990Sstevel@tonic-gate 
36000Sstevel@tonic-gate 		mutex_exit(&stp->sd_lock);
36010Sstevel@tonic-gate 		pollwakeup(pl, POLLWRNORM);
36020Sstevel@tonic-gate 		mutex_enter(&stp->sd_lock);
36030Sstevel@tonic-gate 
36040Sstevel@tonic-gate 		if (stp->sd_sigflags & S_WRNORM)
36050Sstevel@tonic-gate 			strsendsig(stp->sd_siglist, S_WRNORM, 0, 0);
36060Sstevel@tonic-gate 	}
36070Sstevel@tonic-gate 	mutex_exit(&stp->sd_lock);
36080Sstevel@tonic-gate }
36090Sstevel@tonic-gate 
36100Sstevel@tonic-gate int
36110Sstevel@tonic-gate struioget(queue_t *q, mblk_t *mp, struiod_t *dp, int noblock)
36120Sstevel@tonic-gate {
36130Sstevel@tonic-gate 	stdata_t *stp = STREAM(q);
36140Sstevel@tonic-gate 	int typ  = STRUIOT_STANDARD;
36150Sstevel@tonic-gate 	uio_t	 *uiop = &dp->d_uio;
36160Sstevel@tonic-gate 	dblk_t	 *dbp;
36170Sstevel@tonic-gate 	ssize_t	 uiocnt;
36180Sstevel@tonic-gate 	ssize_t	 cnt;
36190Sstevel@tonic-gate 	unsigned char *ptr;
36200Sstevel@tonic-gate 	ssize_t	 resid;
36210Sstevel@tonic-gate 	int	 error = 0;
36220Sstevel@tonic-gate 	on_trap_data_t otd;
36230Sstevel@tonic-gate 	queue_t	*stwrq;
36240Sstevel@tonic-gate 
36250Sstevel@tonic-gate 	/*
36260Sstevel@tonic-gate 	 * Plumbing may change while taking the type so store the
36270Sstevel@tonic-gate 	 * queue in a temporary variable. It doesn't matter even
36280Sstevel@tonic-gate 	 * if the we take the type from the previous plumbing,
36290Sstevel@tonic-gate 	 * that's because if the plumbing has changed when we were
36300Sstevel@tonic-gate 	 * holding the queue in a temporary variable, we can continue
36310Sstevel@tonic-gate 	 * processing the message the way it would have been processed
36320Sstevel@tonic-gate 	 * in the old plumbing, without any side effects but a bit
36330Sstevel@tonic-gate 	 * extra processing for partial ip header checksum.
36340Sstevel@tonic-gate 	 *
36350Sstevel@tonic-gate 	 * This has been done to avoid holding the sd_lock which is
36360Sstevel@tonic-gate 	 * very hot.
36370Sstevel@tonic-gate 	 */
36380Sstevel@tonic-gate 
36390Sstevel@tonic-gate 	stwrq = stp->sd_struiowrq;
36400Sstevel@tonic-gate 	if (stwrq)
36410Sstevel@tonic-gate 		typ = stwrq->q_struiot;
36420Sstevel@tonic-gate 
36430Sstevel@tonic-gate 	for (; (resid = uiop->uio_resid) > 0 && mp; mp = mp->b_cont) {
36440Sstevel@tonic-gate 		dbp = mp->b_datap;
36450Sstevel@tonic-gate 		ptr = (uchar_t *)(mp->b_rptr + dbp->db_cksumstuff);
36460Sstevel@tonic-gate 		uiocnt = dbp->db_cksumend - dbp->db_cksumstuff;
36470Sstevel@tonic-gate 		cnt = MIN(uiocnt, uiop->uio_resid);
36480Sstevel@tonic-gate 		if (!(dbp->db_struioflag & STRUIO_SPEC) ||
36490Sstevel@tonic-gate 		    (dbp->db_struioflag & STRUIO_DONE) || cnt == 0) {
36500Sstevel@tonic-gate 			/*
36510Sstevel@tonic-gate 			 * Either this mblk has already been processed
36520Sstevel@tonic-gate 			 * or there is no more room in this mblk (?).
36530Sstevel@tonic-gate 			 */
36540Sstevel@tonic-gate 			continue;
36550Sstevel@tonic-gate 		}
36560Sstevel@tonic-gate 		switch (typ) {
36570Sstevel@tonic-gate 		case STRUIOT_STANDARD:
36580Sstevel@tonic-gate 			if (noblock) {
36590Sstevel@tonic-gate 				if (on_trap(&otd, OT_DATA_ACCESS)) {
36600Sstevel@tonic-gate 					no_trap();
36610Sstevel@tonic-gate 					error = EWOULDBLOCK;
36620Sstevel@tonic-gate 					goto out;
36630Sstevel@tonic-gate 				}
36640Sstevel@tonic-gate 			}
36650Sstevel@tonic-gate 			if (error = uiomove(ptr, cnt, UIO_WRITE, uiop)) {
36660Sstevel@tonic-gate 				if (noblock)
36670Sstevel@tonic-gate 					no_trap();
36680Sstevel@tonic-gate 				goto out;
36690Sstevel@tonic-gate 			}
36700Sstevel@tonic-gate 			if (noblock)
36710Sstevel@tonic-gate 				no_trap();
36720Sstevel@tonic-gate 			break;
36730Sstevel@tonic-gate 
36740Sstevel@tonic-gate 		default:
36750Sstevel@tonic-gate 			error = EIO;
36760Sstevel@tonic-gate 			goto out;
36770Sstevel@tonic-gate 		}
36780Sstevel@tonic-gate 		dbp->db_struioflag |= STRUIO_DONE;
36790Sstevel@tonic-gate 		dbp->db_cksumstuff += cnt;
36800Sstevel@tonic-gate 	}
36810Sstevel@tonic-gate out:
36820Sstevel@tonic-gate 	if (error == EWOULDBLOCK && (resid -= uiop->uio_resid) > 0) {
36830Sstevel@tonic-gate 		/*
36840Sstevel@tonic-gate 		 * A fault has occured and some bytes were moved to the
36850Sstevel@tonic-gate 		 * current mblk, the uio_t has already been updated by
36860Sstevel@tonic-gate 		 * the appropriate uio routine, so also update the mblk
36870Sstevel@tonic-gate 		 * to reflect this in case this same mblk chain is used
36880Sstevel@tonic-gate 		 * again (after the fault has been handled).
36890Sstevel@tonic-gate 		 */
36900Sstevel@tonic-gate 		uiocnt = dbp->db_cksumend - dbp->db_cksumstuff;
36910Sstevel@tonic-gate 		if (uiocnt >= resid)
36920Sstevel@tonic-gate 			dbp->db_cksumstuff += resid;
36930Sstevel@tonic-gate 	}
36940Sstevel@tonic-gate 	return (error);
36950Sstevel@tonic-gate }
36960Sstevel@tonic-gate 
36970Sstevel@tonic-gate /*
36980Sstevel@tonic-gate  * Try to enter queue synchronously. Any attempt to enter a closing queue will
36990Sstevel@tonic-gate  * fails. The qp->q_rwcnt keeps track of the number of successful entries so
37000Sstevel@tonic-gate  * that removeq() will not try to close the queue while a thread is inside the
37010Sstevel@tonic-gate  * queue.
37020Sstevel@tonic-gate  */
37030Sstevel@tonic-gate static boolean_t
37040Sstevel@tonic-gate rwnext_enter(queue_t *qp)
37050Sstevel@tonic-gate {
37060Sstevel@tonic-gate 	mutex_enter(QLOCK(qp));
37070Sstevel@tonic-gate 	if (qp->q_flag & QWCLOSE) {
37080Sstevel@tonic-gate 		mutex_exit(QLOCK(qp));
37090Sstevel@tonic-gate 		return (B_FALSE);
37100Sstevel@tonic-gate 	}
37110Sstevel@tonic-gate 	qp->q_rwcnt++;
37120Sstevel@tonic-gate 	ASSERT(qp->q_rwcnt != 0);
37130Sstevel@tonic-gate 	mutex_exit(QLOCK(qp));
37140Sstevel@tonic-gate 	return (B_TRUE);
37150Sstevel@tonic-gate }
37160Sstevel@tonic-gate 
37170Sstevel@tonic-gate /*
37180Sstevel@tonic-gate  * Decrease the count of threads running in sync stream queue and wake up any
37190Sstevel@tonic-gate  * threads blocked in removeq().
37200Sstevel@tonic-gate  */
37210Sstevel@tonic-gate static void
37220Sstevel@tonic-gate rwnext_exit(queue_t *qp)
37230Sstevel@tonic-gate {
37240Sstevel@tonic-gate 	mutex_enter(QLOCK(qp));
37250Sstevel@tonic-gate 	qp->q_rwcnt--;
37260Sstevel@tonic-gate 	if (qp->q_flag & QWANTRMQSYNC) {
37270Sstevel@tonic-gate 		qp->q_flag &= ~QWANTRMQSYNC;
37280Sstevel@tonic-gate 		cv_broadcast(&qp->q_wait);
37290Sstevel@tonic-gate 	}
37300Sstevel@tonic-gate 	mutex_exit(QLOCK(qp));
37310Sstevel@tonic-gate }
37320Sstevel@tonic-gate 
37330Sstevel@tonic-gate /*
37340Sstevel@tonic-gate  * The purpose of rwnext() is to call the rw procedure of the next
37350Sstevel@tonic-gate  * (downstream) modules queue.
37360Sstevel@tonic-gate  *
37370Sstevel@tonic-gate  * treated as put entrypoint for perimeter syncronization.
37380Sstevel@tonic-gate  *
37390Sstevel@tonic-gate  * There's no need to grab sq_putlocks here (which only exist for CIPUT
37400Sstevel@tonic-gate  * sync queues). If it is CIPUT sync queue sq_count is incremented and it does
37410Sstevel@tonic-gate  * not matter if any regular put entrypoints have been already entered. We
37420Sstevel@tonic-gate  * can't increment one of the sq_putcounts (instead of sq_count) because
37430Sstevel@tonic-gate  * qwait_rw won't know which counter to decrement.
37440Sstevel@tonic-gate  *
37450Sstevel@tonic-gate  * It would be reasonable to add the lockless FASTPUT logic.
37460Sstevel@tonic-gate  */
37470Sstevel@tonic-gate int
37480Sstevel@tonic-gate rwnext(queue_t *qp, struiod_t *dp)
37490Sstevel@tonic-gate {
37500Sstevel@tonic-gate 	queue_t		*nqp;
37510Sstevel@tonic-gate 	syncq_t		*sq;
37520Sstevel@tonic-gate 	uint16_t	count;
37530Sstevel@tonic-gate 	uint16_t	flags;
37540Sstevel@tonic-gate 	struct qinit	*qi;
37550Sstevel@tonic-gate 	int		(*proc)();
37560Sstevel@tonic-gate 	struct stdata	*stp;
37570Sstevel@tonic-gate 	int		isread;
37580Sstevel@tonic-gate 	int		rval;
37590Sstevel@tonic-gate 
37600Sstevel@tonic-gate 	stp = STREAM(qp);
37610Sstevel@tonic-gate 	/*
37620Sstevel@tonic-gate 	 * Prevent q_next from changing by holding sd_lock until acquiring
37630Sstevel@tonic-gate 	 * SQLOCK. Note that a read-side rwnext from the streamhead will
37640Sstevel@tonic-gate 	 * already have sd_lock acquired. In either case sd_lock is always
37650Sstevel@tonic-gate 	 * released after acquiring SQLOCK.
37660Sstevel@tonic-gate 	 *
37670Sstevel@tonic-gate 	 * The streamhead read-side holding sd_lock when calling rwnext is
37680Sstevel@tonic-gate 	 * required to prevent a race condition were M_DATA mblks flowing
37690Sstevel@tonic-gate 	 * up the read-side of the stream could be bypassed by a rwnext()
37700Sstevel@tonic-gate 	 * down-call. In this case sd_lock acts as the streamhead perimeter.
37710Sstevel@tonic-gate 	 */
37720Sstevel@tonic-gate 	if ((nqp = _WR(qp)) == qp) {
37730Sstevel@tonic-gate 		isread = 0;
37740Sstevel@tonic-gate 		mutex_enter(&stp->sd_lock);
37750Sstevel@tonic-gate 		qp = nqp->q_next;
37760Sstevel@tonic-gate 	} else {
37770Sstevel@tonic-gate 		isread = 1;
37780Sstevel@tonic-gate 		if (nqp != stp->sd_wrq)
37790Sstevel@tonic-gate 			/* Not streamhead */
37800Sstevel@tonic-gate 			mutex_enter(&stp->sd_lock);
37810Sstevel@tonic-gate 		qp = _RD(nqp->q_next);
37820Sstevel@tonic-gate 	}
37830Sstevel@tonic-gate 	qi = qp->q_qinfo;
37840Sstevel@tonic-gate 	if (qp->q_struiot == STRUIOT_NONE || ! (proc = qi->qi_rwp)) {
37850Sstevel@tonic-gate 		/*
37860Sstevel@tonic-gate 		 * Not a synchronous module or no r/w procedure for this
37870Sstevel@tonic-gate 		 * queue, so just return EINVAL and let the caller handle it.
37880Sstevel@tonic-gate 		 */
37890Sstevel@tonic-gate 		mutex_exit(&stp->sd_lock);
37900Sstevel@tonic-gate 		return (EINVAL);
37910Sstevel@tonic-gate 	}
37920Sstevel@tonic-gate 
37930Sstevel@tonic-gate 	if (rwnext_enter(qp) == B_FALSE) {
37940Sstevel@tonic-gate 		mutex_exit(&stp->sd_lock);
37950Sstevel@tonic-gate 		return (EINVAL);
37960Sstevel@tonic-gate 	}
37970Sstevel@tonic-gate 
37980Sstevel@tonic-gate 	sq = qp->q_syncq;
37990Sstevel@tonic-gate 	mutex_enter(SQLOCK(sq));
38000Sstevel@tonic-gate 	mutex_exit(&stp->sd_lock);
38010Sstevel@tonic-gate 	count = sq->sq_count;
38020Sstevel@tonic-gate 	flags = sq->sq_flags;
38030Sstevel@tonic-gate 	ASSERT(sq->sq_ciputctrl == NULL || (flags & SQ_CIPUT));
38040Sstevel@tonic-gate 
38050Sstevel@tonic-gate 	while ((flags & SQ_GOAWAY) || (!(flags & SQ_CIPUT) && count != 0)) {
38060Sstevel@tonic-gate 		/*
38070Sstevel@tonic-gate 		 * if this queue is being closed, return.
38080Sstevel@tonic-gate 		 */
38090Sstevel@tonic-gate 		if (qp->q_flag & QWCLOSE) {
38100Sstevel@tonic-gate 			mutex_exit(SQLOCK(sq));
38110Sstevel@tonic-gate 			rwnext_exit(qp);
38120Sstevel@tonic-gate 			return (EINVAL);
38130Sstevel@tonic-gate 		}
38140Sstevel@tonic-gate 
38150Sstevel@tonic-gate 		/*
38160Sstevel@tonic-gate 		 * Wait until we can enter the inner perimeter.
38170Sstevel@tonic-gate 		 */
38180Sstevel@tonic-gate 		sq->sq_flags = flags | SQ_WANTWAKEUP;
38190Sstevel@tonic-gate 		cv_wait(&sq->sq_wait, SQLOCK(sq));
38200Sstevel@tonic-gate 		count = sq->sq_count;
38210Sstevel@tonic-gate 		flags = sq->sq_flags;
38220Sstevel@tonic-gate 	}
38230Sstevel@tonic-gate 
38240Sstevel@tonic-gate 	if (isread == 0 && stp->sd_struiowrq == NULL ||
38250Sstevel@tonic-gate 	    isread == 1 && stp->sd_struiordq == NULL) {
38260Sstevel@tonic-gate 		/*
38270Sstevel@tonic-gate 		 * Stream plumbing changed while waiting for inner perimeter
38280Sstevel@tonic-gate 		 * so just return EINVAL and let the caller handle it.
38290Sstevel@tonic-gate 		 */
38300Sstevel@tonic-gate 		mutex_exit(SQLOCK(sq));
38310Sstevel@tonic-gate 		rwnext_exit(qp);
38320Sstevel@tonic-gate 		return (EINVAL);
38330Sstevel@tonic-gate 	}
38340Sstevel@tonic-gate 	if (!(flags & SQ_CIPUT))
38350Sstevel@tonic-gate 		sq->sq_flags = flags | SQ_EXCL;
38360Sstevel@tonic-gate 	sq->sq_count = count + 1;
38370Sstevel@tonic-gate 	ASSERT(sq->sq_count != 0);		/* Wraparound */
38380Sstevel@tonic-gate 	/*
38390Sstevel@tonic-gate 	 * Note: The only message ordering guarantee that rwnext() makes is
38400Sstevel@tonic-gate 	 *	 for the write queue flow-control case. All others (r/w queue
38410Sstevel@tonic-gate 	 *	 with q_count > 0 (or q_first != 0)) are the resposibilty of
38420Sstevel@tonic-gate 	 *	 the queue's rw procedure. This could be genralized here buy
38430Sstevel@tonic-gate 	 *	 running the queue's service procedure, but that wouldn't be
38440Sstevel@tonic-gate 	 *	 the most efficent for all cases.
38450Sstevel@tonic-gate 	 */
38460Sstevel@tonic-gate 	mutex_exit(SQLOCK(sq));
38470Sstevel@tonic-gate 	if (! isread && (qp->q_flag & QFULL)) {
38480Sstevel@tonic-gate 		/*
38490Sstevel@tonic-gate 		 * Write queue may be flow controlled. If so,
38500Sstevel@tonic-gate 		 * mark the queue for wakeup when it's not.
38510Sstevel@tonic-gate 		 */
38520Sstevel@tonic-gate 		mutex_enter(QLOCK(qp));
38530Sstevel@tonic-gate 		if (qp->q_flag & QFULL) {
38540Sstevel@tonic-gate 			qp->q_flag |= QWANTWSYNC;
38550Sstevel@tonic-gate 			mutex_exit(QLOCK(qp));
38560Sstevel@tonic-gate 			rval = EWOULDBLOCK;
38570Sstevel@tonic-gate 			goto out;
38580Sstevel@tonic-gate 		}
38590Sstevel@tonic-gate 		mutex_exit(QLOCK(qp));
38600Sstevel@tonic-gate 	}
38610Sstevel@tonic-gate 
38620Sstevel@tonic-gate 	if (! isread && dp->d_mp)
38630Sstevel@tonic-gate 		STR_FTEVENT_MSG(dp->d_mp, nqp, FTEV_RWNEXT, dp->d_mp->b_rptr -
38640Sstevel@tonic-gate 		    dp->d_mp->b_datap->db_base);
38650Sstevel@tonic-gate 
38660Sstevel@tonic-gate 	rval = (*proc)(qp, dp);
38670Sstevel@tonic-gate 
38680Sstevel@tonic-gate 	if (isread && dp->d_mp)
38690Sstevel@tonic-gate 		STR_FTEVENT_MSG(dp->d_mp, _RD(nqp), FTEV_RWNEXT,
38700Sstevel@tonic-gate 		    dp->d_mp->b_rptr - dp->d_mp->b_datap->db_base);
38710Sstevel@tonic-gate out:
38720Sstevel@tonic-gate 	/*
38730Sstevel@tonic-gate 	 * The queue is protected from being freed by sq_count, so it is
38740Sstevel@tonic-gate 	 * safe to call rwnext_exit and reacquire SQLOCK(sq).
38750Sstevel@tonic-gate 	 */
38760Sstevel@tonic-gate 	rwnext_exit(qp);
38770Sstevel@tonic-gate 
38780Sstevel@tonic-gate 	mutex_enter(SQLOCK(sq));
38790Sstevel@tonic-gate 	flags = sq->sq_flags;
38800Sstevel@tonic-gate 	ASSERT(sq->sq_count != 0);
38810Sstevel@tonic-gate 	sq->sq_count--;
38820Sstevel@tonic-gate 	if (flags & SQ_TAIL) {
38830Sstevel@tonic-gate 		putnext_tail(sq, qp, flags);
38840Sstevel@tonic-gate 		/*
38850Sstevel@tonic-gate 		 * The only purpose of this ASSERT is to preserve calling stack
38860Sstevel@tonic-gate 		 * in DEBUG kernel.
38870Sstevel@tonic-gate 		 */
38880Sstevel@tonic-gate 		ASSERT(flags & SQ_TAIL);
38890Sstevel@tonic-gate 		return (rval);
38900Sstevel@tonic-gate 	}
38910Sstevel@tonic-gate 	ASSERT(flags & (SQ_EXCL|SQ_CIPUT));
38920Sstevel@tonic-gate 	/*
38930Sstevel@tonic-gate 	 * Safe to always drop SQ_EXCL:
38940Sstevel@tonic-gate 	 *	Not SQ_CIPUT means we set SQ_EXCL above
38950Sstevel@tonic-gate 	 *	For SQ_CIPUT SQ_EXCL will only be set if the put procedure
38960Sstevel@tonic-gate 	 *	did a qwriter(INNER) in which case nobody else
38970Sstevel@tonic-gate 	 *	is in the inner perimeter and we are exiting.
38980Sstevel@tonic-gate 	 *
38990Sstevel@tonic-gate 	 * I would like to make the following assertion:
39000Sstevel@tonic-gate 	 *
39010Sstevel@tonic-gate 	 * ASSERT((flags & (SQ_EXCL|SQ_CIPUT)) != (SQ_EXCL|SQ_CIPUT) ||
39020Sstevel@tonic-gate 	 * 	sq->sq_count == 0);
39030Sstevel@tonic-gate 	 *
39040Sstevel@tonic-gate 	 * which indicates that if we are both putshared and exclusive,
39050Sstevel@tonic-gate 	 * we became exclusive while executing the putproc, and the only
39060Sstevel@tonic-gate 	 * claim on the syncq was the one we dropped a few lines above.
39070Sstevel@tonic-gate 	 * But other threads that enter putnext while the syncq is exclusive
39080Sstevel@tonic-gate 	 * need to make a claim as they may need to drop SQLOCK in the
39090Sstevel@tonic-gate 	 * has_writers case to avoid deadlocks.  If these threads are
39100Sstevel@tonic-gate 	 * delayed or preempted, it is possible that the writer thread can
39110Sstevel@tonic-gate 	 * find out that there are other claims making the (sq_count == 0)
39120Sstevel@tonic-gate 	 * test invalid.
39130Sstevel@tonic-gate 	 */
39140Sstevel@tonic-gate 
39150Sstevel@tonic-gate 	sq->sq_flags = flags & ~SQ_EXCL;
39160Sstevel@tonic-gate 	if (sq->sq_flags & SQ_WANTWAKEUP) {
39170Sstevel@tonic-gate 		sq->sq_flags &= ~SQ_WANTWAKEUP;
39180Sstevel@tonic-gate 		cv_broadcast(&sq->sq_wait);
39190Sstevel@tonic-gate 	}
39200Sstevel@tonic-gate 	mutex_exit(SQLOCK(sq));
39210Sstevel@tonic-gate 	return (rval);
39220Sstevel@tonic-gate }
39230Sstevel@tonic-gate 
/*
 * The purpose of infonext() is to call the info procedure of the next
 * (downstream) module's queue.
 *
 * Treated as a put entrypoint for perimeter synchronization.
 *
 * There's no need to grab sq_putlocks here (which only exist for CIPUT
 * sync queues). If it is a CIPUT sync queue, the regular sq_count is
 * incremented and it does not matter if any regular put entrypoints have
 * been already entered.
 */
39350Sstevel@tonic-gate int
39360Sstevel@tonic-gate infonext(queue_t *qp, infod_t *idp)
39370Sstevel@tonic-gate {
39380Sstevel@tonic-gate 	queue_t		*nqp;
39390Sstevel@tonic-gate 	syncq_t		*sq;
39400Sstevel@tonic-gate 	uint16_t	count;
39410Sstevel@tonic-gate 	uint16_t 	flags;
39420Sstevel@tonic-gate 	struct qinit	*qi;
39430Sstevel@tonic-gate 	int		(*proc)();
39440Sstevel@tonic-gate 	struct stdata	*stp;
39450Sstevel@tonic-gate 	int		rval;
39460Sstevel@tonic-gate 
39470Sstevel@tonic-gate 	stp = STREAM(qp);
39480Sstevel@tonic-gate 	/*
39490Sstevel@tonic-gate 	 * Prevent q_next from changing by holding sd_lock until
39500Sstevel@tonic-gate 	 * acquiring SQLOCK.
39510Sstevel@tonic-gate 	 */
39520Sstevel@tonic-gate 	mutex_enter(&stp->sd_lock);
39530Sstevel@tonic-gate 	if ((nqp = _WR(qp)) == qp) {
39540Sstevel@tonic-gate 		qp = nqp->q_next;
39550Sstevel@tonic-gate 	} else {
39560Sstevel@tonic-gate 		qp = _RD(nqp->q_next);
39570Sstevel@tonic-gate 	}
39580Sstevel@tonic-gate 	qi = qp->q_qinfo;
39590Sstevel@tonic-gate 	if (qp->q_struiot == STRUIOT_NONE || ! (proc = qi->qi_infop)) {
39600Sstevel@tonic-gate 		mutex_exit(&stp->sd_lock);
39610Sstevel@tonic-gate 		return (EINVAL);
39620Sstevel@tonic-gate 	}
39630Sstevel@tonic-gate 	sq = qp->q_syncq;
39640Sstevel@tonic-gate 	mutex_enter(SQLOCK(sq));
39650Sstevel@tonic-gate 	mutex_exit(&stp->sd_lock);
39660Sstevel@tonic-gate 	count = sq->sq_count;
39670Sstevel@tonic-gate 	flags = sq->sq_flags;
39680Sstevel@tonic-gate 	ASSERT(sq->sq_ciputctrl == NULL || (flags & SQ_CIPUT));
39690Sstevel@tonic-gate 
39700Sstevel@tonic-gate 	while ((flags & SQ_GOAWAY) || (!(flags & SQ_CIPUT) && count != 0)) {
39710Sstevel@tonic-gate 		/*
39720Sstevel@tonic-gate 		 * Wait until we can enter the inner perimeter.
39730Sstevel@tonic-gate 		 */
39740Sstevel@tonic-gate 		sq->sq_flags = flags | SQ_WANTWAKEUP;
39750Sstevel@tonic-gate 		cv_wait(&sq->sq_wait, SQLOCK(sq));
39760Sstevel@tonic-gate 		count = sq->sq_count;
39770Sstevel@tonic-gate 		flags = sq->sq_flags;
39780Sstevel@tonic-gate 	}
39790Sstevel@tonic-gate 
39800Sstevel@tonic-gate 	if (! (flags & SQ_CIPUT))
39810Sstevel@tonic-gate 		sq->sq_flags = flags | SQ_EXCL;
39820Sstevel@tonic-gate 	sq->sq_count = count + 1;
39830Sstevel@tonic-gate 	ASSERT(sq->sq_count != 0);		/* Wraparound */
39840Sstevel@tonic-gate 	mutex_exit(SQLOCK(sq));
39850Sstevel@tonic-gate 
39860Sstevel@tonic-gate 	rval = (*proc)(qp, idp);
39870Sstevel@tonic-gate 
39880Sstevel@tonic-gate 	mutex_enter(SQLOCK(sq));
39890Sstevel@tonic-gate 	flags = sq->sq_flags;
39900Sstevel@tonic-gate 	ASSERT(sq->sq_count != 0);
39910Sstevel@tonic-gate 	sq->sq_count--;
39920Sstevel@tonic-gate 	if (flags & SQ_TAIL) {
39930Sstevel@tonic-gate 		putnext_tail(sq, qp, flags);
39940Sstevel@tonic-gate 		/*
39950Sstevel@tonic-gate 		 * The only purpose of this ASSERT is to preserve calling stack
39960Sstevel@tonic-gate 		 * in DEBUG kernel.
39970Sstevel@tonic-gate 		 */
39980Sstevel@tonic-gate 		ASSERT(flags & SQ_TAIL);
39990Sstevel@tonic-gate 		return (rval);
40000Sstevel@tonic-gate 	}
40010Sstevel@tonic-gate 	ASSERT(flags & (SQ_EXCL|SQ_CIPUT));
40020Sstevel@tonic-gate /*
40030Sstevel@tonic-gate  * XXXX
40040Sstevel@tonic-gate  * I am not certain the next comment is correct here.  I need to consider
40050Sstevel@tonic-gate  * why the infonext is called, and if dropping SQ_EXCL unless non-CIPUT
40060Sstevel@tonic-gate  * might cause other problems.  It just might be safer to drop it if
40070Sstevel@tonic-gate  * !SQ_CIPUT because that is when we set it.
40080Sstevel@tonic-gate  */
40090Sstevel@tonic-gate 	/*
40100Sstevel@tonic-gate 	 * Safe to always drop SQ_EXCL:
40110Sstevel@tonic-gate 	 *	Not SQ_CIPUT means we set SQ_EXCL above
40120Sstevel@tonic-gate 	 *	For SQ_CIPUT SQ_EXCL will only be set if the put procedure
40130Sstevel@tonic-gate 	 *	did a qwriter(INNER) in which case nobody else
40140Sstevel@tonic-gate 	 *	is in the inner perimeter and we are exiting.
40150Sstevel@tonic-gate 	 *
40160Sstevel@tonic-gate 	 * I would like to make the following assertion:
40170Sstevel@tonic-gate 	 *
40180Sstevel@tonic-gate 	 * ASSERT((flags & (SQ_EXCL|SQ_CIPUT)) != (SQ_EXCL|SQ_CIPUT) ||
40190Sstevel@tonic-gate 	 *	sq->sq_count == 0);
40200Sstevel@tonic-gate 	 *
40210Sstevel@tonic-gate 	 * which indicates that if we are both putshared and exclusive,
40220Sstevel@tonic-gate 	 * we became exclusive while executing the putproc, and the only
40230Sstevel@tonic-gate 	 * claim on the syncq was the one we dropped a few lines above.
40240Sstevel@tonic-gate 	 * But other threads that enter putnext while the syncq is exclusive
40250Sstevel@tonic-gate 	 * need to make a claim as they may need to drop SQLOCK in the
40260Sstevel@tonic-gate 	 * has_writers case to avoid deadlocks.  If these threads are
40270Sstevel@tonic-gate 	 * delayed or preempted, it is possible that the writer thread can
40280Sstevel@tonic-gate 	 * find out that there are other claims making the (sq_count == 0)
40290Sstevel@tonic-gate 	 * test invalid.
40300Sstevel@tonic-gate 	 */
40310Sstevel@tonic-gate 
40320Sstevel@tonic-gate 	sq->sq_flags = flags & ~SQ_EXCL;
40330Sstevel@tonic-gate 	mutex_exit(SQLOCK(sq));
40340Sstevel@tonic-gate 	return (rval);
40350Sstevel@tonic-gate }
40360Sstevel@tonic-gate 
40370Sstevel@tonic-gate /*
40380Sstevel@tonic-gate  * Return nonzero if the queue is responsible for struio(), else return 0.
40390Sstevel@tonic-gate  */
40400Sstevel@tonic-gate int
40410Sstevel@tonic-gate isuioq(queue_t *q)
40420Sstevel@tonic-gate {
40430Sstevel@tonic-gate 	if (q->q_flag & QREADR)
40440Sstevel@tonic-gate 		return (STREAM(q)->sd_struiordq == q);
40450Sstevel@tonic-gate 	else
40460Sstevel@tonic-gate 		return (STREAM(q)->sd_struiowrq == q);
40470Sstevel@tonic-gate }
40480Sstevel@tonic-gate 
/*
 * Tunable controlling creation of per-CPU putlocks (ciputctrl); when
 * nonzero, create_putlocks() is a no-op.  Per-CPU putlocks are enabled
 * by default only on sparc; all other platforms disable them by default.
 */
#if defined(__sparc)
int disable_putlocks = 0;
#else
int disable_putlocks = 1;
#endif
40540Sstevel@tonic-gate 
40550Sstevel@tonic-gate /*
40560Sstevel@tonic-gate  * called by create_putlock.
40570Sstevel@tonic-gate  */
/*
 * called by create_putlock.
 *
 * Set up per-CPU ciputctrl structures (sq_putlocks/sq_putcounts) for the
 * syncq of q, and — when the other half of the queue pair has its own
 * per-queue syncq (QPERQ) — for that syncq as well.  Racing callers are
 * resolved under SQLOCK: the loser frees its freshly allocated ciputctrl.
 */
static void
create_syncq_putlocks(queue_t *q)
{
	syncq_t	*sq = q->q_syncq;
	ciputctrl_t *cip;
	int i;

	ASSERT(sq != NULL);

	/* Caller (create_putlocks) has already screened these conditions. */
	ASSERT(disable_putlocks == 0);
	ASSERT(n_ciputctrl >= min_n_ciputctrl);
	ASSERT(ciputctrl_cache != NULL);

	/* Per-CPU putlocks only apply to concurrent-put (SQ_CIPUT) syncqs. */
	if (!(sq->sq_type & SQ_CIPUT))
		return;

	/* Two passes: first q's syncq, then (possibly) _OTHERQ(q)'s. */
	for (i = 0; i <= 1; i++) {
		if (sq->sq_ciputctrl == NULL) {
			/* Allocate before taking SQLOCK; KM_SLEEP may block. */
			cip = kmem_cache_alloc(ciputctrl_cache, KM_SLEEP);
			SUMCHECK_CIPUTCTRL_COUNTS(cip, n_ciputctrl - 1, 0);
			mutex_enter(SQLOCK(sq));
			if (sq->sq_ciputctrl != NULL) {
				/* Lost the race; discard our allocation. */
				mutex_exit(SQLOCK(sq));
				kmem_cache_free(ciputctrl_cache, cip);
			} else {
				ASSERT(sq->sq_nciputctrl == 0);
				sq->sq_nciputctrl = n_ciputctrl - 1;
				/*
				 * putnext checks sq_ciputctrl without holding
				 * SQLOCK. if it is not NULL putnext assumes
				 * sq_nciputctrl is initialized. membar below
				 * ensures that.
				 */
				membar_producer();
				sq->sq_ciputctrl = cip;
				mutex_exit(SQLOCK(sq));
			}
		}
		ASSERT(sq->sq_nciputctrl == n_ciputctrl - 1);
		if (i == 1)
			break;
		q = _OTHERQ(q);
		if (!(q->q_flag & QPERQ)) {
			/* Both halves share one syncq; nothing more to do. */
			ASSERT(sq == q->q_syncq);
			break;
		}
		ASSERT(q->q_syncq != NULL);
		ASSERT(sq != q->q_syncq);
		sq = q->q_syncq;
		ASSERT(sq->sq_type & SQ_CIPUT);
	}
}
41100Sstevel@tonic-gate 
41110Sstevel@tonic-gate /*
41120Sstevel@tonic-gate  * If stream argument is 0 only create per cpu sq_putlocks/sq_putcounts for
41130Sstevel@tonic-gate  * syncq of q. If stream argument is not 0 create per cpu stream_putlocks for
41140Sstevel@tonic-gate  * the stream of q and per cpu sq_putlocks/sq_putcounts for all syncq's
41150Sstevel@tonic-gate  * starting from q and down to the driver.
41160Sstevel@tonic-gate  *
41170Sstevel@tonic-gate  * This should be called after the affected queues are part of stream
41180Sstevel@tonic-gate  * geometry. It should be called from driver/module open routine after
41190Sstevel@tonic-gate  * qprocson() call. It is also called from nfs syscall where it is known that
41200Sstevel@tonic-gate  * stream is configured and won't change its geometry during create_putlock
41210Sstevel@tonic-gate  * call.
41220Sstevel@tonic-gate  *
41230Sstevel@tonic-gate  * caller normally uses 0 value for the stream argument to speed up MT putnext
41240Sstevel@tonic-gate  * into the perimeter of q for example because its perimeter is per module
41250Sstevel@tonic-gate  * (e.g. IP).
41260Sstevel@tonic-gate  *
41270Sstevel@tonic-gate  * caller normally uses non 0 value for the stream argument to hint the system
41280Sstevel@tonic-gate  * that the stream of q is a very contended global system stream
41290Sstevel@tonic-gate  * (e.g. NFS/UDP) and the part of the stream from q to the driver is
41300Sstevel@tonic-gate  * particularly MT hot.
41310Sstevel@tonic-gate  *
 * Caller ensures stream plumbing won't happen while we are here and therefore
41330Sstevel@tonic-gate  * q_next can be safely used.
41340Sstevel@tonic-gate  */
41350Sstevel@tonic-gate 
void
create_putlocks(queue_t *q, int stream)
{
	ciputctrl_t	*cip;
	struct stdata	*stp = STREAM(q);

	/* Always start from the write-side queue. */
	q = _WR(q);
	ASSERT(stp != NULL);

	/* Putlocks administratively disabled via the tunable. */
	if (disable_putlocks != 0)
		return;

	/* Too few CPUs for per-CPU putlocks to be worthwhile. */
	if (n_ciputctrl < min_n_ciputctrl)
		return;

	ASSERT(ciputctrl_cache != NULL);

	/*
	 * For a hot-stream caller (stream != 0), also set up the per-CPU
	 * stream_putlocks (sd_ciputctrl) on the stream head itself.
	 */
	if (stream != 0 && stp->sd_ciputctrl == NULL) {
		/* Allocate before taking sd_lock; KM_SLEEP may block. */
		cip = kmem_cache_alloc(ciputctrl_cache, KM_SLEEP);
		SUMCHECK_CIPUTCTRL_COUNTS(cip, n_ciputctrl - 1, 0);
		mutex_enter(&stp->sd_lock);
		if (stp->sd_ciputctrl != NULL) {
			/* Lost the race to another thread; free our copy. */
			mutex_exit(&stp->sd_lock);
			kmem_cache_free(ciputctrl_cache, cip);
		} else {
			ASSERT(stp->sd_nciputctrl == 0);
			stp->sd_nciputctrl = n_ciputctrl - 1;
			/*
			 * putnext checks sd_ciputctrl without holding
			 * sd_lock. if it is not NULL putnext assumes
			 * sd_nciputctrl is initialized. membar below
			 * ensures that.
			 */
			membar_producer();
			stp->sd_ciputctrl = cip;
			mutex_exit(&stp->sd_lock);
		}
	}

	ASSERT(stream == 0 || stp->sd_nciputctrl == n_ciputctrl - 1);

	/*
	 * stream == 0: only q's own syncq gets putlocks.  stream != 0:
	 * walk q_next down to the driver (caller guarantees the plumbing
	 * is stable, so q_next is safe to follow).
	 */
	while (_SAMESTR(q)) {
		create_syncq_putlocks(q);
		if (stream == 0)
			return;
		q = q->q_next;
	}
	ASSERT(q != NULL);
	/* q is now the driver's queue; give its syncq putlocks too. */
	create_syncq_putlocks(q);
}
41860Sstevel@tonic-gate 
41870Sstevel@tonic-gate /*
41880Sstevel@tonic-gate  * STREAMS Flow Trace - record STREAMS Flow Trace events as an mblk flows
41890Sstevel@tonic-gate  * through a stream.
41900Sstevel@tonic-gate  *
 * Data currently recorded per-event are a timestamp, module/driver name,
41928752SPeter.Memishian@Sun.COM  * downstream module/driver name, optional callstack, event type and a per
41938752SPeter.Memishian@Sun.COM  * type datum.  Much of the STREAMS framework is instrumented for automatic
41948752SPeter.Memishian@Sun.COM  * flow tracing (when enabled).  Events can be defined and used by STREAMS
41958752SPeter.Memishian@Sun.COM  * modules and drivers.
41960Sstevel@tonic-gate  *
41970Sstevel@tonic-gate  * Global objects:
41980Sstevel@tonic-gate  *
41990Sstevel@tonic-gate  *	str_ftevent() - Add a flow-trace event to a dblk.
42000Sstevel@tonic-gate  *	str_ftfree() - Free flow-trace data
42010Sstevel@tonic-gate  *
42020Sstevel@tonic-gate  * Local objects:
42030Sstevel@tonic-gate  *
42040Sstevel@tonic-gate  *	fthdr_cache - pointer to the kmem cache for trace header.
42050Sstevel@tonic-gate  *	ftblk_cache - pointer to the kmem cache for trace data blocks.
42060Sstevel@tonic-gate  */
42070Sstevel@tonic-gate 
/*
 * Flow-trace control switches.  Note that str_ftnever is also incremented
 * by str_ftevent() itself when an ftblk allocation fails, which turns
 * tracing off under memory pressure.
 */
int str_ftnever = 1;	/* Don't do STREAMS flow tracing */
int str_ftstack = 0;	/* Don't record event call stacks */
42100Sstevel@tonic-gate 
42110Sstevel@tonic-gate void
42120Sstevel@tonic-gate str_ftevent(fthdr_t *hp, void *p, ushort_t evnt, ushort_t data)
42130Sstevel@tonic-gate {
42140Sstevel@tonic-gate 	ftblk_t *bp = hp->tail;
42150Sstevel@tonic-gate 	ftblk_t *nbp;
42160Sstevel@tonic-gate 	ftevnt_t *ep;
42170Sstevel@tonic-gate 	int ix, nix;
42180Sstevel@tonic-gate 
42190Sstevel@tonic-gate 	ASSERT(hp != NULL);
42200Sstevel@tonic-gate 
42210Sstevel@tonic-gate 	for (;;) {
42220Sstevel@tonic-gate 		if ((ix = bp->ix) == FTBLK_EVNTS) {
42230Sstevel@tonic-gate 			/*
42240Sstevel@tonic-gate 			 * Tail doesn't have room, so need a new tail.
42250Sstevel@tonic-gate 			 *
42260Sstevel@tonic-gate 			 * To make this MT safe, first, allocate a new
42270Sstevel@tonic-gate 			 * ftblk, and initialize it.  To make life a
42280Sstevel@tonic-gate 			 * little easier, reserve the first slot (mostly
42290Sstevel@tonic-gate 			 * by making ix = 1).  When we are finished with
42300Sstevel@tonic-gate 			 * the initialization, CAS this pointer to the
42310Sstevel@tonic-gate 			 * tail.  If this succeeds, this is the new
42320Sstevel@tonic-gate 			 * "next" block.  Otherwise, another thread
42330Sstevel@tonic-gate 			 * got here first, so free the block and start
42340Sstevel@tonic-gate 			 * again.
42350Sstevel@tonic-gate 			 */
42368752SPeter.Memishian@Sun.COM 			nbp = kmem_cache_alloc(ftblk_cache, KM_NOSLEEP);
42378752SPeter.Memishian@Sun.COM 			if (nbp == NULL) {
42380Sstevel@tonic-gate 				/* no mem, so punt */
42390Sstevel@tonic-gate 				str_ftnever++;
42400Sstevel@tonic-gate 				/* free up all flow data? */
42410Sstevel@tonic-gate 				return;
42420Sstevel@tonic-gate 			}
42430Sstevel@tonic-gate 			nbp->nxt = NULL;
42440Sstevel@tonic-gate 			nbp->ix = 1;
42450Sstevel@tonic-gate 			/*
42460Sstevel@tonic-gate 			 * Just in case there is another thread about
42470Sstevel@tonic-gate 			 * to get the next index, we need to make sure
42480Sstevel@tonic-gate 			 * the value is there for it.
42490Sstevel@tonic-gate 			 */
42500Sstevel@tonic-gate 			membar_producer();
42510Sstevel@tonic-gate 			if (casptr(&hp->tail, bp, nbp) == bp) {
42520Sstevel@tonic-gate 				/* CAS was successful */
42530Sstevel@tonic-gate 				bp->nxt = nbp;
42540Sstevel@tonic-gate 				membar_producer();
42550Sstevel@tonic-gate 				bp = nbp;
42560Sstevel@tonic-gate 				ix = 0;
42570Sstevel@tonic-gate 				goto cas_good;
42580Sstevel@tonic-gate 			} else {
42590Sstevel@tonic-gate 				kmem_cache_free(ftblk_cache, nbp);
42600Sstevel@tonic-gate 				bp = hp->tail;
42610Sstevel@tonic-gate 				continue;
42620Sstevel@tonic-gate 			}
42630Sstevel@tonic-gate 		}
42640Sstevel@tonic-gate 		nix = ix + 1;
42650Sstevel@tonic-gate 		if (cas32((uint32_t *)&bp->ix, ix, nix) == ix) {
42660Sstevel@tonic-gate 		cas_good:
42670Sstevel@tonic-gate 			if (curthread != hp->thread) {
42680Sstevel@tonic-gate 				hp->thread = curthread;
42690Sstevel@tonic-gate 				evnt |= FTEV_CS;
42700Sstevel@tonic-gate 			}
42710Sstevel@tonic-gate 			if (CPU->cpu_seqid != hp->cpu_seqid) {
42720Sstevel@tonic-gate 				hp->cpu_seqid = CPU->cpu_seqid;
42730Sstevel@tonic-gate 				evnt |= FTEV_PS;
42740Sstevel@tonic-gate 			}
42750Sstevel@tonic-gate 			ep = &bp->ev[ix];
42760Sstevel@tonic-gate 			break;
42770Sstevel@tonic-gate 		}
42780Sstevel@tonic-gate 	}
42790Sstevel@tonic-gate 
42800Sstevel@tonic-gate 	if (evnt & FTEV_QMASK) {
42810Sstevel@tonic-gate 		queue_t *qp = p;
42820Sstevel@tonic-gate 
42830Sstevel@tonic-gate 		if (!(qp->q_flag & QREADR))
42840Sstevel@tonic-gate 			evnt |= FTEV_ISWR;
42858752SPeter.Memishian@Sun.COM 
42868752SPeter.Memishian@Sun.COM 		ep->mid = Q2NAME(qp);
42878752SPeter.Memishian@Sun.COM 
42888752SPeter.Memishian@Sun.COM 		/*
42898752SPeter.Memishian@Sun.COM 		 * We only record the next queue name for FTEV_PUTNEXT since
42908752SPeter.Memishian@Sun.COM 		 * that's the only time we *really* need it, and the putnext()
42918752SPeter.Memishian@Sun.COM 		 * code ensures that qp->q_next won't vanish.  (We could use
42928752SPeter.Memishian@Sun.COM 		 * claimstr()/releasestr() but at a performance cost.)
42938752SPeter.Memishian@Sun.COM 		 */
42948752SPeter.Memishian@Sun.COM 		if ((evnt & FTEV_MASK) == FTEV_PUTNEXT && qp->q_next != NULL)
42958752SPeter.Memishian@Sun.COM 			ep->midnext = Q2NAME(qp->q_next);
42968752SPeter.Memishian@Sun.COM 		else
42978752SPeter.Memishian@Sun.COM 			ep->midnext = NULL;
42980Sstevel@tonic-gate 	} else {
42998752SPeter.Memishian@Sun.COM 		ep->mid = p;
43008752SPeter.Memishian@Sun.COM 		ep->midnext = NULL;
43010Sstevel@tonic-gate 	}
43020Sstevel@tonic-gate 
43038752SPeter.Memishian@Sun.COM 	if (ep->stk != NULL)
43048752SPeter.Memishian@Sun.COM 		ep->stk->fs_depth = getpcstack(ep->stk->fs_stk, FTSTK_DEPTH);
43058752SPeter.Memishian@Sun.COM 
43060Sstevel@tonic-gate 	ep->ts = gethrtime();
43070Sstevel@tonic-gate 	ep->evnt = evnt;
43080Sstevel@tonic-gate 	ep->data = data;
43090Sstevel@tonic-gate 	hp->hash = (hp->hash << 9) + hp->hash;
43100Sstevel@tonic-gate 	hp->hash += (evnt << 16) | data;
43110Sstevel@tonic-gate 	hp->hash += (uintptr_t)ep->mid;
43120Sstevel@tonic-gate }
43130Sstevel@tonic-gate 
43140Sstevel@tonic-gate /*
43150Sstevel@tonic-gate  * Free flow-trace data.
43160Sstevel@tonic-gate  */
43170Sstevel@tonic-gate void
43180Sstevel@tonic-gate str_ftfree(dblk_t *dbp)
43190Sstevel@tonic-gate {
43200Sstevel@tonic-gate 	fthdr_t *hp = dbp->db_fthdr;
43210Sstevel@tonic-gate 	ftblk_t *bp = &hp->first;
43220Sstevel@tonic-gate 	ftblk_t *nbp;
43230Sstevel@tonic-gate 
43240Sstevel@tonic-gate 	if (bp != hp->tail || bp->ix != 0) {
43250Sstevel@tonic-gate 		/*
43260Sstevel@tonic-gate 		 * Clear out the hash, have the tail point to itself, and free
43270Sstevel@tonic-gate 		 * any continuation blocks.
43280Sstevel@tonic-gate 		 */
43290Sstevel@tonic-gate 		bp = hp->first.nxt;
43300Sstevel@tonic-gate 		hp->tail = &hp->first;
43310Sstevel@tonic-gate 		hp->hash = 0;
43320Sstevel@tonic-gate 		hp->first.nxt = NULL;
43330Sstevel@tonic-gate 		hp->first.ix = 0;
43340Sstevel@tonic-gate 		while (bp != NULL) {
43350Sstevel@tonic-gate 			nbp = bp->nxt;
43360Sstevel@tonic-gate 			kmem_cache_free(ftblk_cache, bp);
43370Sstevel@tonic-gate 			bp = nbp;
43380Sstevel@tonic-gate 		}
43390Sstevel@tonic-gate 	}
43400Sstevel@tonic-gate 	kmem_cache_free(fthdr_cache, hp);
43410Sstevel@tonic-gate 	dbp->db_fthdr = NULL;
43420Sstevel@tonic-gate }
4343