xref: /onnv-gate/usr/src/uts/common/os/msg.c (revision 6147)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
52565Sudpa  * Common Development and Distribution License (the "License").
62565Sudpa  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*
226071Sdv142724  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
230Sstevel@tonic-gate  * Use is subject to license terms.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
270Sstevel@tonic-gate /*	  All Rights Reserved  	*/
280Sstevel@tonic-gate 
290Sstevel@tonic-gate 
300Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
310Sstevel@tonic-gate 
320Sstevel@tonic-gate /*
330Sstevel@tonic-gate  * Inter-Process Communication Message Facility.
340Sstevel@tonic-gate  *
350Sstevel@tonic-gate  * See os/ipc.c for a description of common IPC functionality.
360Sstevel@tonic-gate  *
370Sstevel@tonic-gate  * Resource controls
380Sstevel@tonic-gate  * -----------------
390Sstevel@tonic-gate  *
402677Sml93401  * Control:      zone.max-msg-ids (rc_zone_msgmni)
412677Sml93401  * Description:  Maximum number of message queue ids allowed a zone.
422677Sml93401  *
432677Sml93401  *   When msgget() is used to allocate a message queue, one id is
442677Sml93401  *   allocated.  If the id allocation doesn't succeed, msgget() fails
452677Sml93401  *   and errno is set to ENOSPC.  Upon successful msgctl(, IPC_RMID)
462677Sml93401  *   the id is deallocated.
472677Sml93401  *
480Sstevel@tonic-gate  * Control:      project.max-msg-ids (rc_project_msgmni)
490Sstevel@tonic-gate  * Description:  Maximum number of message queue ids allowed a project.
500Sstevel@tonic-gate  *
510Sstevel@tonic-gate  *   When msgget() is used to allocate a message queue, one id is
520Sstevel@tonic-gate  *   allocated.  If the id allocation doesn't succeed, msgget() fails
530Sstevel@tonic-gate  *   and errno is set to ENOSPC.  Upon successful msgctl(, IPC_RMID)
540Sstevel@tonic-gate  *   the id is deallocated.
550Sstevel@tonic-gate  *
560Sstevel@tonic-gate  * Control:      process.max-msg-qbytes (rc_process_msgmnb)
570Sstevel@tonic-gate  * Description:  Maximum number of bytes of messages on a message queue.
580Sstevel@tonic-gate  *
590Sstevel@tonic-gate  *   When msgget() successfully allocates a message queue, the minimum
600Sstevel@tonic-gate  *   enforced value of this limit is used to initialize msg_qbytes.
610Sstevel@tonic-gate  *
620Sstevel@tonic-gate  * Control:      process.max-msg-messages (rc_process_msgtql)
630Sstevel@tonic-gate  * Description:  Maximum number of messages on a message queue.
640Sstevel@tonic-gate  *
650Sstevel@tonic-gate  *   When msgget() successfully allocates a message queue, the minimum
660Sstevel@tonic-gate  *   enforced value of this limit is used to initialize a per-queue
670Sstevel@tonic-gate  *   limit on the number of messages.
680Sstevel@tonic-gate  */
690Sstevel@tonic-gate 
700Sstevel@tonic-gate #include <sys/types.h>
710Sstevel@tonic-gate #include <sys/t_lock.h>
720Sstevel@tonic-gate #include <sys/param.h>
730Sstevel@tonic-gate #include <sys/cred.h>
740Sstevel@tonic-gate #include <sys/user.h>
750Sstevel@tonic-gate #include <sys/proc.h>
760Sstevel@tonic-gate #include <sys/time.h>
770Sstevel@tonic-gate #include <sys/ipc.h>
780Sstevel@tonic-gate #include <sys/ipc_impl.h>
790Sstevel@tonic-gate #include <sys/msg.h>
800Sstevel@tonic-gate #include <sys/msg_impl.h>
810Sstevel@tonic-gate #include <sys/list.h>
820Sstevel@tonic-gate #include <sys/systm.h>
830Sstevel@tonic-gate #include <sys/sysmacros.h>
840Sstevel@tonic-gate #include <sys/cpuvar.h>
850Sstevel@tonic-gate #include <sys/kmem.h>
860Sstevel@tonic-gate #include <sys/ddi.h>
870Sstevel@tonic-gate #include <sys/errno.h>
880Sstevel@tonic-gate #include <sys/cmn_err.h>
890Sstevel@tonic-gate #include <sys/debug.h>
900Sstevel@tonic-gate #include <sys/project.h>
910Sstevel@tonic-gate #include <sys/modctl.h>
920Sstevel@tonic-gate #include <sys/syscall.h>
930Sstevel@tonic-gate #include <sys/policy.h>
940Sstevel@tonic-gate #include <sys/zone.h>
950Sstevel@tonic-gate 
960Sstevel@tonic-gate #include <c2/audit.h>
970Sstevel@tonic-gate 
980Sstevel@tonic-gate /*
990Sstevel@tonic-gate  * The following tunables are obsolete.  Though for compatibility we
1000Sstevel@tonic-gate  * still read and interpret msginfo_msgmnb, msginfo_msgmni, and
1010Sstevel@tonic-gate  * msginfo_msgtql (see os/project.c and os/rctl_proc.c), the preferred
1020Sstevel@tonic-gate  * mechanism for administrating the IPC Message facility is through the
1030Sstevel@tonic-gate  * resource controls described at the top of this file.
1040Sstevel@tonic-gate  */
/*
 * Legacy tunables.  Nothing in the visible part of this file reads any
 * of them; per the note above, msginfo_msgmnb, msginfo_msgmni and
 * msginfo_msgtql are still interpreted by os/project.c and
 * os/rctl_proc.c.
 */
1050Sstevel@tonic-gate size_t	msginfo_msgmax = 2048;	/* (obsolete) */
1060Sstevel@tonic-gate size_t	msginfo_msgmnb = 4096;	/* (obsolete) */
1070Sstevel@tonic-gate int	msginfo_msgmni = 50;	/* (obsolete) */
1080Sstevel@tonic-gate int	msginfo_msgtql = 40;	/* (obsolete) */
1090Sstevel@tonic-gate int	msginfo_msgssz = 8;	/* (obsolete) */
1100Sstevel@tonic-gate int	msginfo_msgmap = 0;	/* (obsolete) */
1110Sstevel@tonic-gate ushort_t msginfo_msgseg = 1024;	/* (obsolete) */
1120Sstevel@tonic-gate 
/*
 * Resource-control handles, defined outside this file.  The two *_msgmni
 * handles are passed to ipcs_create() in _init(); rc_process_msgmnb and
 * rc_process_msgtql are used by msgget() to set msg_qbytes and msg_qmax.
 */
1132677Sml93401 extern rctl_hndl_t rc_zone_msgmni;
1140Sstevel@tonic-gate extern rctl_hndl_t rc_project_msgmni;
1150Sstevel@tonic-gate extern rctl_hndl_t rc_process_msgmnb;
1160Sstevel@tonic-gate extern rctl_hndl_t rc_process_msgtql;
/* IPC service handle for message queues; created in _init(). */
1170Sstevel@tonic-gate static ipc_service_t *msq_svc;
/* Zone key registered in _init() with msg_remove_zone() as a callback. */
1180Sstevel@tonic-gate static zone_key_t msg_zone_key;
1190Sstevel@tonic-gate 
/* Callbacks handed to ipcs_create() / zone_key_create() in _init(). */
1200Sstevel@tonic-gate static void msg_dtor(kipc_perm_t *);
1210Sstevel@tonic-gate static void msg_rmid(kipc_perm_t *);
1220Sstevel@tonic-gate static void msg_remove_zone(zoneid_t, void *);
1230Sstevel@tonic-gate 
1240Sstevel@tonic-gate /*
1250Sstevel@tonic-gate  * Module linkage information for the kernel.
1260Sstevel@tonic-gate  */
/*
 * System call entry point.  Declared static, so its definition lives in
 * this file (outside the part shown here).
 */
1270Sstevel@tonic-gate static ssize_t msgsys(int opcode, uintptr_t a0, uintptr_t a1, uintptr_t a2,
1280Sstevel@tonic-gate 	uintptr_t a4, uintptr_t a5);
1290Sstevel@tonic-gate 
/*
 * System call table entry for msgsys().
 * NOTE(review): the leading 6 presumably is the argument count, matching
 * the six parameters of msgsys() -- confirm against struct sysent.
 * The return-value flag depends on _LP64: SE_64RVAL when it is defined,
 * SE_32RVAL1 otherwise.
 */
1300Sstevel@tonic-gate static struct sysent ipcmsg_sysent = {
1310Sstevel@tonic-gate 	6,
1320Sstevel@tonic-gate #ifdef	_LP64
1330Sstevel@tonic-gate 	SE_ARGC | SE_NOUNLOAD | SE_64RVAL,
1340Sstevel@tonic-gate #else
1350Sstevel@tonic-gate 	SE_ARGC | SE_NOUNLOAD | SE_32RVAL1,
1360Sstevel@tonic-gate #endif
1370Sstevel@tonic-gate 	(int (*)())msgsys
1380Sstevel@tonic-gate };
1390Sstevel@tonic-gate 
/*
 * 32-bit counterpart of the entry above; only built when
 * _SYSCALL32_IMPL is defined.
 */
1400Sstevel@tonic-gate #ifdef	_SYSCALL32_IMPL
1410Sstevel@tonic-gate static ssize32_t msgsys32(int opcode, uint32_t a0, uint32_t a1, uint32_t a2,
1420Sstevel@tonic-gate 	uint32_t a4, uint32_t a5);
1430Sstevel@tonic-gate 
1440Sstevel@tonic-gate static struct sysent ipcmsg_sysent32 = {
1450Sstevel@tonic-gate 	6,
1460Sstevel@tonic-gate 	SE_ARGC | SE_NOUNLOAD | SE_32RVAL1,
1470Sstevel@tonic-gate 	(int (*)())msgsys32
1480Sstevel@tonic-gate };
1490Sstevel@tonic-gate #endif	/* _SYSCALL32_IMPL */
1500Sstevel@tonic-gate 
/* Module descriptors tying each sysent to its display name. */
1510Sstevel@tonic-gate static struct modlsys modlsys = {
1520Sstevel@tonic-gate 	&mod_syscallops, "System V message facility", &ipcmsg_sysent
1530Sstevel@tonic-gate };
1540Sstevel@tonic-gate 
1550Sstevel@tonic-gate #ifdef _SYSCALL32_IMPL
1560Sstevel@tonic-gate static struct modlsys modlsys32 = {
1570Sstevel@tonic-gate 	&mod_syscallops32, "32-bit System V message facility", &ipcmsg_sysent32
1580Sstevel@tonic-gate };
1590Sstevel@tonic-gate #endif
1600Sstevel@tonic-gate 
1614153Sdv142724 /*
1624153Sdv142724  *      Big Theory statement for message queue correctness
1634153Sdv142724  *
1644153Sdv142724  * The msgrcv and msgsnd functions no longer use cv_broadcast to wake up
1654153Sdv142724  * receivers who are waiting for an event.  Using the cv_broadcast method
1664153Sdv142724  * resulted in negative scaling when the number of waiting receivers is large
1674153Sdv142724  * (the thundering herd problem).  Instead, the receivers waiting to receive a
1684153Sdv142724  * message are now linked in a queue-like fashion and awakened one at a time in
1694153Sdv142724  * a controlled manner.
1704153Sdv142724  *
1714153Sdv142724  * Receivers can block on two different classes of waiting list:
1724153Sdv142724  *    1) "sendwait" list, which is the more complex list of the two.  The
1734153Sdv142724  *	  receiver will be awakened by a sender posting a new message.  There
1744153Sdv142724  *	  are two types of "sendwait" list used:
1754153Sdv142724  *		a) msg_wait_snd: handles all receivers who are looking for
1764153Sdv142724  *		   a message type >= 0, but were unable to locate a match.
1774153Sdv142724  *
1784153Sdv142724  *		   slot 0: reserved for receivers that have designated they
1794153Sdv142724  *			   will take any message type.
1804153Sdv142724  *		   rest:   consist of receivers requesting a specific type
1814153Sdv142724  *			   but the type was not present.  The entries are
1824153Sdv142724  *			   hashed into a bucket in an attempt to keep
1834153Sdv142724  *			   any list search relatively short.
1844153Sdv142724  * 		b) msg_wait_snd_ngt: handles all receivers that have designated
1854153Sdv142724  *		   a negative message type. Unlike msg_wait_snd, the hash bucket
1864153Sdv142724  *		   serves a range of negative message types (-1 to -5, -6 to -10
1874153Sdv142724  *		   and so forth), where the last bucket is reserved for all the
1884153Sdv142724  *		   negative message types that hash outside of MSG_MAX_QNUM - 1.
1894153Sdv142724  *		   This is done this way to simplify the operation of locating a
1904153Sdv142724  *		   negative message type.
1914153Sdv142724  *
1924153Sdv142724  *    2) "copyout" list, where the receiver is awakened by another
1934153Sdv142724  *	 receiver after a message is copied out.  This is a linked list
1944153Sdv142724  *	 of waiters that are awakened one at a time.  Although the solution is
1954153Sdv142724  *	 not optimal, the complexity that would be added in for waking
1964153Sdv142724  *	 up the right entry far exceeds any potential pay back (too many
1974153Sdv142724  *	 correctness and corner case issues).
1984153Sdv142724  *
1994153Sdv142724  * The lists are doubly linked.  In the case of the "sendwait"
2004153Sdv142724  * list, this allows the thread to remove itself from the list without having
2014153Sdv142724  * to traverse the list.  In the case of the "copyout" list it simply allows
2024153Sdv142724  * us to use common functions with the "sendwait" list.
2034153Sdv142724  *
2044153Sdv142724  * To make sure receivers are not hung out to dry, we must guarantee:
2054153Sdv142724  *    1. If any queued message matches any receiver, then at least one
2064153Sdv142724  *       matching receiver must be processing the request.
2074153Sdv142724  *    2. Blocking on the copyout queue is only temporary while messages
2084153Sdv142724  *	 are being copied out.  The process is guaranteed to wake up
2094153Sdv142724  *	 when it gets to the front of the queue (copyout is a FIFO).
2104153Sdv142724  *
2114153Sdv142724  * Rules for blocking and waking up:
2124153Sdv142724  *   1. A receiver entering msgrcv must examine all messages for a match
2134153Sdv142724  *      before blocking on a sendwait queue.
2144153Sdv142724  *   2. If the receiver blocks because the message it chose is already
2154153Sdv142724  *	being copied out, then when it wakes up it needs to start
2164153Sdv142724  *	checking the messages from the beginning.
2174153Sdv142724  *   3) When ever a process returns from msgrcv for any reason, if it
2184153Sdv142724  *	had attempted to copy a message or blocked waiting for a copy
2194153Sdv142724  *	to complete it needs to wakeup the next receiver blocked on
2204153Sdv142724  *	a copy out.
2214153Sdv142724  *   4) When a message is sent, the sender selects a process waiting
2224153Sdv142724  *	for that type of message.  This selection process rotates between
2234153Sdv142724  *	receivers types of 0, negative and positive to prevent starvation of
2244153Sdv142724  *	any one particular receiver type.
2254153Sdv142724  *   5) The following are the scenarios for processes that are awakened
2264153Sdv142724  *	by a msgsnd:
2274153Sdv142724  *		a) The process finds the message and is able to copy
2284153Sdv142724  *		   it out.  Once complete, the process returns.
2294153Sdv142724  *		b) The message that was sent that triggered the wakeup is no
2304153Sdv142724  *		   longer available (another process found the message first).
2314153Sdv142724  *		   We issue a wakeup on copy queue and then go back to
2324153Sdv142724  *		   sleep waiting for another matching message to be sent.
2334153Sdv142724  *		c) The message that was supposed to be processed was
2344153Sdv142724  *		   already serviced by another process.  However a different
2354153Sdv142724  *		   message is present which we can service.  The message
2364153Sdv142724  *		   is copied and the process returns.
2374153Sdv142724  *		d) The message is found, but some sort of error occurs that
2384153Sdv142724  *		   prevents the message from being copied.  The receiver
2394153Sdv142724  *		   wakes up the next sender that can service this message
2404153Sdv142724  *		   type and returns an error to the caller.
2414153Sdv142724  *		e) The message is found, but it is marked as being copied
2424153Sdv142724  *		   out.  The receiver then goes to sleep on the copyout
2434153Sdv142724  *		   queue where it will be awakened again sometime in the future.
2444153Sdv142724  *
2454153Sdv142724  *
2464153Sdv142724  *   6) Whenever a message is found that matches the message type designated,
2474153Sdv142724  * 	but is being copied out we have to block on the copyout queue.
2484153Sdv142724  *	After the process doing the copy finishes the copy out, it must wake up
2494153Sdv142724  *	(either directly or indirectly) all receivers who blocked on its copyout,
2504153Sdv142724  *	so they are guaranteed a chance to examine the remaining messages.
2514153Sdv142724  *	This is implemented via a chain of wakeups: Y wakes X, who wakes Z,
2524153Sdv142724  *	and so on.  The chain cannot be broken.  This leads to the following
2534153Sdv142724  *	cases:
2544153Sdv142724  *		a) When a receiver has finished copying the message (or
2554153Sdv142724  *		   encountered an error), the first entry on the copyout
2564153Sdv142724  *		   queue is woken up.
2574153Sdv142724  *		b) When the receiver is woken up, it attempts to locate
2584153Sdv142724  *		   a message type match.
2594153Sdv142724  *		c) If a message type is found and
2604153Sdv142724  *			-- MSG_RCVCOPY flag is not set, the message is
2614153Sdv142724  *			   marked for copying out.  Regardless of the copyout
2624153Sdv142724  *			   success the next entry on the copyout queue is
2634153Sdv142724  *			   awakened and the operation is completed.
2644153Sdv142724  *			-- MSG_RCVCOPY is set, we simply go back to sleep again
2654153Sdv142724  *			   on the copyout queue.
2664153Sdv142724  *		d) If the message type is not found then we wakeup the next
2674153Sdv142724  *		   process on the copyout queue.
2684153Sdv142724  */
2694153Sdv142724 
/*
 * Forward declarations for the receive/wakeup helpers.  All are static;
 * their definitions are in this file but outside the part shown here.
 */
2706071Sdv142724 static uint_t msg_type_hash(long);
2714153Sdv142724 static int msgq_check_err(kmsqid_t *qp, int cvres);
2724153Sdv142724 static int msg_rcvq_sleep(list_t *, msgq_wakeup_t *, kmutex_t **,
2734153Sdv142724     kmsqid_t *);
2744153Sdv142724 static int msg_copyout(kmsqid_t *, long, kmutex_t **, size_t *, size_t,
2754153Sdv142724     struct msg *, struct ipcmsgbuf *, int);
2764153Sdv142724 static void msg_rcvq_wakeup_all(list_t *);
2774153Sdv142724 static void msg_wakeup_rdr(kmsqid_t *, msg_select_t **, long);
2784153Sdv142724 static msgq_wakeup_t *msg_fnd_any_snd(kmsqid_t *, int, long);
2794153Sdv142724 static msgq_wakeup_t *msg_fnd_any_rdr(kmsqid_t *, int, long);
2804153Sdv142724 static msgq_wakeup_t *msg_fnd_neg_snd(kmsqid_t *, int, long);
2814153Sdv142724 static msgq_wakeup_t *msg_fnd_spc_snd(kmsqid_t *, int, long);
2824153Sdv142724 static struct msg *msgrcv_lookup(kmsqid_t *, long);
2834153Sdv142724 
/*
 * Circular selection table: each entry pairs a finder function with a
 * pointer to the next entry, and the last entry points back to the
 * first (any -> specific -> negative -> any).  msgget() starts every new
 * queue at entry 0.
 */
2844153Sdv142724 msg_select_t msg_fnd_sndr[] = {
2854153Sdv142724 	{ msg_fnd_any_snd, &msg_fnd_sndr[1] },
2864153Sdv142724 	{ msg_fnd_spc_snd, &msg_fnd_sndr[2] },
2874153Sdv142724 	{ msg_fnd_neg_snd, &msg_fnd_sndr[0] }
2884153Sdv142724 };
2894153Sdv142724 
/*
 * Single-entry table of the same shape; its only entry points to
 * itself.
 */
2904153Sdv142724 msg_select_t msg_fnd_rdr[1] = {
2914153Sdv142724 	{ msg_fnd_any_rdr, &msg_fnd_rdr[0] },
2924153Sdv142724 };
2934153Sdv142724 
/*
 * Linkage handed to mod_install() and mod_info(); includes the 32-bit
 * descriptor only when _SYSCALL32_IMPL is defined.
 */
2940Sstevel@tonic-gate static struct modlinkage modlinkage = {
2950Sstevel@tonic-gate 	MODREV_1,
2960Sstevel@tonic-gate 	&modlsys,
2970Sstevel@tonic-gate #ifdef _SYSCALL32_IMPL
2980Sstevel@tonic-gate 	&modlsys32,
2990Sstevel@tonic-gate #endif
3000Sstevel@tonic-gate 	NULL
3010Sstevel@tonic-gate };
3020Sstevel@tonic-gate 
3030Sstevel@tonic-gate 
3040Sstevel@tonic-gate int
3050Sstevel@tonic-gate _init(void)
3060Sstevel@tonic-gate {
3070Sstevel@tonic-gate 	int result;
3080Sstevel@tonic-gate 
3092677Sml93401 	msq_svc = ipcs_create("msqids", rc_project_msgmni, rc_zone_msgmni,
3102677Sml93401 	    sizeof (kmsqid_t), msg_dtor, msg_rmid, AT_IPC_MSG,
3112677Sml93401 	    offsetof(ipc_rqty_t, ipcq_msgmni));
3120Sstevel@tonic-gate 	zone_key_create(&msg_zone_key, NULL, msg_remove_zone, NULL);
3130Sstevel@tonic-gate 
3140Sstevel@tonic-gate 	if ((result = mod_install(&modlinkage)) == 0)
3150Sstevel@tonic-gate 		return (0);
3160Sstevel@tonic-gate 
3170Sstevel@tonic-gate 	(void) zone_key_delete(msg_zone_key);
3180Sstevel@tonic-gate 	ipcs_destroy(msq_svc);
3190Sstevel@tonic-gate 
3200Sstevel@tonic-gate 	return (result);
3210Sstevel@tonic-gate }
3220Sstevel@tonic-gate 
/*
 * _fini - module unload entry point.
 *
 * Unconditionally returns EBUSY.
 */
int
_fini(void)
{
	return (EBUSY);
}
3280Sstevel@tonic-gate 
3290Sstevel@tonic-gate int
3300Sstevel@tonic-gate _info(struct modinfo *modinfop)
3310Sstevel@tonic-gate {
3320Sstevel@tonic-gate 	return (mod_info(&modlinkage, modinfop));
3330Sstevel@tonic-gate }
3340Sstevel@tonic-gate 
3350Sstevel@tonic-gate static void
3360Sstevel@tonic-gate msg_dtor(kipc_perm_t *perm)
3370Sstevel@tonic-gate {
3380Sstevel@tonic-gate 	kmsqid_t *qp = (kmsqid_t *)perm;
3392565Sudpa 	int		ii;
3400Sstevel@tonic-gate 
3414153Sdv142724 	for (ii = 0; ii <= MSG_MAX_QNUM; ii++) {
3424153Sdv142724 		ASSERT(list_is_empty(&qp->msg_wait_snd[ii]));
3434153Sdv142724 		ASSERT(list_is_empty(&qp->msg_wait_snd_ngt[ii]));
3444153Sdv142724 		list_destroy(&qp->msg_wait_snd[ii]);
3454153Sdv142724 		list_destroy(&qp->msg_wait_snd_ngt[ii]);
3464153Sdv142724 	}
3474153Sdv142724 	ASSERT(list_is_empty(&qp->msg_cpy_block));
3484153Sdv142724 	list_destroy(&qp->msg_cpy_block);
3490Sstevel@tonic-gate 	ASSERT(qp->msg_snd_cnt == 0);
3500Sstevel@tonic-gate 	ASSERT(qp->msg_cbytes == 0);
3510Sstevel@tonic-gate 	list_destroy(&qp->msg_list);
3520Sstevel@tonic-gate }
3530Sstevel@tonic-gate 
3540Sstevel@tonic-gate 
/*
 * msg_hold - take a reference on a message by incrementing msg_copycnt.
 * The matching release is msg_rele().  The expansion is fully
 * parenthesized so it stays a single expression wherever it is used.
 */
#define	msg_hold(mp)	((mp)->msg_copycnt++)
3560Sstevel@tonic-gate 
3570Sstevel@tonic-gate /*
3580Sstevel@tonic-gate  * msg_rele - decrement the reference count on the message.  When count
3590Sstevel@tonic-gate  * reaches zero, free message header and contents.
3600Sstevel@tonic-gate  */
3610Sstevel@tonic-gate static void
3620Sstevel@tonic-gate msg_rele(struct msg *mp)
3630Sstevel@tonic-gate {
3640Sstevel@tonic-gate 	ASSERT(mp->msg_copycnt > 0);
3650Sstevel@tonic-gate 	if (mp->msg_copycnt-- == 1) {
3660Sstevel@tonic-gate 		if (mp->msg_addr)
3670Sstevel@tonic-gate 			kmem_free(mp->msg_addr, mp->msg_size);
3680Sstevel@tonic-gate 		kmem_free(mp, sizeof (struct msg));
3690Sstevel@tonic-gate 	}
3700Sstevel@tonic-gate }
3710Sstevel@tonic-gate 
3720Sstevel@tonic-gate /*
3730Sstevel@tonic-gate  * msgunlink - Unlink msg from queue, decrement byte count and wake up anyone
3740Sstevel@tonic-gate  * waiting for free bytes on queue.
3750Sstevel@tonic-gate  *
3760Sstevel@tonic-gate  * Called with queue locked.
3770Sstevel@tonic-gate  */
3780Sstevel@tonic-gate static void
3790Sstevel@tonic-gate msgunlink(kmsqid_t *qp, struct msg *mp)
3800Sstevel@tonic-gate {
3810Sstevel@tonic-gate 	list_remove(&qp->msg_list, mp);
3820Sstevel@tonic-gate 	qp->msg_qnum--;
3830Sstevel@tonic-gate 	qp->msg_cbytes -= mp->msg_size;
3840Sstevel@tonic-gate 	msg_rele(mp);
3850Sstevel@tonic-gate 
3860Sstevel@tonic-gate 	/* Wake up waiting writers */
3870Sstevel@tonic-gate 	if (qp->msg_snd_cnt)
3880Sstevel@tonic-gate 		cv_broadcast(&qp->msg_snd_cv);
3890Sstevel@tonic-gate }
3900Sstevel@tonic-gate 
3910Sstevel@tonic-gate static void
3920Sstevel@tonic-gate msg_rmid(kipc_perm_t *perm)
3930Sstevel@tonic-gate {
3940Sstevel@tonic-gate 	kmsqid_t *qp = (kmsqid_t *)perm;
3950Sstevel@tonic-gate 	struct msg *mp;
3962565Sudpa 	int		ii;
3970Sstevel@tonic-gate 
3980Sstevel@tonic-gate 
3990Sstevel@tonic-gate 	while ((mp = list_head(&qp->msg_list)) != NULL)
4000Sstevel@tonic-gate 		msgunlink(qp, mp);
4010Sstevel@tonic-gate 	ASSERT(qp->msg_cbytes == 0);
4020Sstevel@tonic-gate 
4034153Sdv142724 	/*
4044153Sdv142724 	 * Wake up everyone who is in a wait state of some sort
4054153Sdv142724 	 * for this message queue.
4064153Sdv142724 	 */
4074153Sdv142724 	for (ii = 0; ii <= MSG_MAX_QNUM; ii++) {
4084153Sdv142724 		msg_rcvq_wakeup_all(&qp->msg_wait_snd[ii]);
4094153Sdv142724 		msg_rcvq_wakeup_all(&qp->msg_wait_snd_ngt[ii]);
4102565Sudpa 	}
4114153Sdv142724 	msg_rcvq_wakeup_all(&qp->msg_cpy_block);
4120Sstevel@tonic-gate 	if (qp->msg_snd_cnt)
4130Sstevel@tonic-gate 		cv_broadcast(&qp->msg_snd_cv);
4140Sstevel@tonic-gate }
4150Sstevel@tonic-gate 
4160Sstevel@tonic-gate /*
4170Sstevel@tonic-gate  * msgctl system call.
4180Sstevel@tonic-gate  *
4190Sstevel@tonic-gate  * gets q lock (via ipc_lookup), releases before return.
4200Sstevel@tonic-gate  * may call users of msg_lock
4210Sstevel@tonic-gate  */
4220Sstevel@tonic-gate static int
4230Sstevel@tonic-gate msgctl(int msgid, int cmd, void *arg)
4240Sstevel@tonic-gate {
4250Sstevel@tonic-gate 	STRUCT_DECL(msqid_ds, ds);		/* SVR4 queue work area */
4260Sstevel@tonic-gate 	kmsqid_t		*qp;		/* ptr to associated q */
4274153Sdv142724 	int			error;
4280Sstevel@tonic-gate 	struct	cred		*cr;
4290Sstevel@tonic-gate 	model_t	mdl = get_udatamodel();
4300Sstevel@tonic-gate 	struct msqid_ds64	ds64;
4310Sstevel@tonic-gate 	kmutex_t		*lock;
4320Sstevel@tonic-gate 	proc_t			*pp = curproc;
4330Sstevel@tonic-gate 
4340Sstevel@tonic-gate 	STRUCT_INIT(ds, mdl);
4350Sstevel@tonic-gate 	cr = CRED();
4360Sstevel@tonic-gate 
4370Sstevel@tonic-gate 	/*
4380Sstevel@tonic-gate 	 * Perform pre- or non-lookup actions (e.g. copyins, RMID).
4390Sstevel@tonic-gate 	 */
4400Sstevel@tonic-gate 	switch (cmd) {
4410Sstevel@tonic-gate 	case IPC_SET:
4420Sstevel@tonic-gate 		if (copyin(arg, STRUCT_BUF(ds), STRUCT_SIZE(ds)))
4430Sstevel@tonic-gate 			return (set_errno(EFAULT));
4440Sstevel@tonic-gate 		break;
4450Sstevel@tonic-gate 
4460Sstevel@tonic-gate 	case IPC_SET64:
4470Sstevel@tonic-gate 		if (copyin(arg, &ds64, sizeof (struct msqid_ds64)))
4480Sstevel@tonic-gate 			return (set_errno(EFAULT));
4490Sstevel@tonic-gate 		break;
4500Sstevel@tonic-gate 
4510Sstevel@tonic-gate 	case IPC_RMID:
4520Sstevel@tonic-gate 		if (error = ipc_rmid(msq_svc, msgid, cr))
4530Sstevel@tonic-gate 			return (set_errno(error));
4540Sstevel@tonic-gate 		return (0);
4550Sstevel@tonic-gate 	}
4560Sstevel@tonic-gate 
4570Sstevel@tonic-gate 	/*
4580Sstevel@tonic-gate 	 * get msqid_ds for this msgid
4590Sstevel@tonic-gate 	 */
4600Sstevel@tonic-gate 	if ((lock = ipc_lookup(msq_svc, msgid, (kipc_perm_t **)&qp)) == NULL)
4610Sstevel@tonic-gate 		return (set_errno(EINVAL));
4620Sstevel@tonic-gate 
4630Sstevel@tonic-gate 	switch (cmd) {
4640Sstevel@tonic-gate 	case IPC_SET:
4650Sstevel@tonic-gate 		if (STRUCT_FGET(ds, msg_qbytes) > qp->msg_qbytes &&
4660Sstevel@tonic-gate 		    secpolicy_ipc_config(cr) != 0) {
4670Sstevel@tonic-gate 			mutex_exit(lock);
4680Sstevel@tonic-gate 			return (set_errno(EPERM));
4690Sstevel@tonic-gate 		}
4700Sstevel@tonic-gate 		if (error = ipcperm_set(msq_svc, cr, &qp->msg_perm,
4710Sstevel@tonic-gate 		    &STRUCT_BUF(ds)->msg_perm, mdl)) {
4720Sstevel@tonic-gate 			mutex_exit(lock);
4730Sstevel@tonic-gate 			return (set_errno(error));
4740Sstevel@tonic-gate 		}
4750Sstevel@tonic-gate 		qp->msg_qbytes = STRUCT_FGET(ds, msg_qbytes);
4760Sstevel@tonic-gate 		qp->msg_ctime = gethrestime_sec();
4770Sstevel@tonic-gate 		break;
4780Sstevel@tonic-gate 
4790Sstevel@tonic-gate 	case IPC_STAT:
4800Sstevel@tonic-gate 		if (error = ipcperm_access(&qp->msg_perm, MSG_R, cr)) {
4810Sstevel@tonic-gate 			mutex_exit(lock);
4820Sstevel@tonic-gate 			return (set_errno(error));
4830Sstevel@tonic-gate 		}
4840Sstevel@tonic-gate 
4854153Sdv142724 		if (qp->msg_rcv_cnt)
4864153Sdv142724 			qp->msg_perm.ipc_mode |= MSG_RWAIT;
4870Sstevel@tonic-gate 		if (qp->msg_snd_cnt)
4880Sstevel@tonic-gate 			qp->msg_perm.ipc_mode |= MSG_WWAIT;
4890Sstevel@tonic-gate 		ipcperm_stat(&STRUCT_BUF(ds)->msg_perm, &qp->msg_perm, mdl);
4900Sstevel@tonic-gate 		qp->msg_perm.ipc_mode &= ~(MSG_RWAIT|MSG_WWAIT);
4910Sstevel@tonic-gate 		STRUCT_FSETP(ds, msg_first, NULL); 	/* kernel addr */
4920Sstevel@tonic-gate 		STRUCT_FSETP(ds, msg_last, NULL);
4930Sstevel@tonic-gate 		STRUCT_FSET(ds, msg_cbytes, qp->msg_cbytes);
4940Sstevel@tonic-gate 		STRUCT_FSET(ds, msg_qnum, qp->msg_qnum);
4950Sstevel@tonic-gate 		STRUCT_FSET(ds, msg_qbytes, qp->msg_qbytes);
4960Sstevel@tonic-gate 		STRUCT_FSET(ds, msg_lspid, qp->msg_lspid);
4970Sstevel@tonic-gate 		STRUCT_FSET(ds, msg_lrpid, qp->msg_lrpid);
4980Sstevel@tonic-gate 		STRUCT_FSET(ds, msg_stime, qp->msg_stime);
4990Sstevel@tonic-gate 		STRUCT_FSET(ds, msg_rtime, qp->msg_rtime);
5000Sstevel@tonic-gate 		STRUCT_FSET(ds, msg_ctime, qp->msg_ctime);
5010Sstevel@tonic-gate 		break;
5020Sstevel@tonic-gate 
5030Sstevel@tonic-gate 	case IPC_SET64:
5040Sstevel@tonic-gate 		mutex_enter(&pp->p_lock);
5050Sstevel@tonic-gate 		if ((ds64.msgx_qbytes > qp->msg_qbytes) &&
5060Sstevel@tonic-gate 		    secpolicy_ipc_config(cr) != 0 &&
5070Sstevel@tonic-gate 		    rctl_test(rc_process_msgmnb, pp->p_rctls, pp,
5080Sstevel@tonic-gate 		    ds64.msgx_qbytes, RCA_SAFE) & RCT_DENY) {
5090Sstevel@tonic-gate 			mutex_exit(&pp->p_lock);
5100Sstevel@tonic-gate 			mutex_exit(lock);
5110Sstevel@tonic-gate 			return (set_errno(EPERM));
5120Sstevel@tonic-gate 		}
5130Sstevel@tonic-gate 		mutex_exit(&pp->p_lock);
5140Sstevel@tonic-gate 		if (error = ipcperm_set64(msq_svc, cr, &qp->msg_perm,
5150Sstevel@tonic-gate 		    &ds64.msgx_perm)) {
5160Sstevel@tonic-gate 			mutex_exit(lock);
5170Sstevel@tonic-gate 			return (set_errno(error));
5180Sstevel@tonic-gate 		}
5190Sstevel@tonic-gate 		qp->msg_qbytes = ds64.msgx_qbytes;
5200Sstevel@tonic-gate 		qp->msg_ctime = gethrestime_sec();
5210Sstevel@tonic-gate 		break;
5220Sstevel@tonic-gate 
5230Sstevel@tonic-gate 	case IPC_STAT64:
5244153Sdv142724 		if (qp->msg_rcv_cnt)
5254153Sdv142724 			qp->msg_perm.ipc_mode |= MSG_RWAIT;
5260Sstevel@tonic-gate 		if (qp->msg_snd_cnt)
5270Sstevel@tonic-gate 			qp->msg_perm.ipc_mode |= MSG_WWAIT;
5280Sstevel@tonic-gate 		ipcperm_stat64(&ds64.msgx_perm, &qp->msg_perm);
5290Sstevel@tonic-gate 		qp->msg_perm.ipc_mode &= ~(MSG_RWAIT|MSG_WWAIT);
5300Sstevel@tonic-gate 		ds64.msgx_cbytes = qp->msg_cbytes;
5310Sstevel@tonic-gate 		ds64.msgx_qnum = qp->msg_qnum;
5320Sstevel@tonic-gate 		ds64.msgx_qbytes = qp->msg_qbytes;
5330Sstevel@tonic-gate 		ds64.msgx_lspid = qp->msg_lspid;
5340Sstevel@tonic-gate 		ds64.msgx_lrpid = qp->msg_lrpid;
5350Sstevel@tonic-gate 		ds64.msgx_stime = qp->msg_stime;
5360Sstevel@tonic-gate 		ds64.msgx_rtime = qp->msg_rtime;
5370Sstevel@tonic-gate 		ds64.msgx_ctime = qp->msg_ctime;
5380Sstevel@tonic-gate 		break;
5390Sstevel@tonic-gate 
5400Sstevel@tonic-gate 	default:
5410Sstevel@tonic-gate 		mutex_exit(lock);
5420Sstevel@tonic-gate 		return (set_errno(EINVAL));
5430Sstevel@tonic-gate 	}
5440Sstevel@tonic-gate 
5450Sstevel@tonic-gate 	mutex_exit(lock);
5460Sstevel@tonic-gate 
5470Sstevel@tonic-gate 	/*
5480Sstevel@tonic-gate 	 * Do copyout last (after releasing mutex).
5490Sstevel@tonic-gate 	 */
5500Sstevel@tonic-gate 	switch (cmd) {
5510Sstevel@tonic-gate 	case IPC_STAT:
5520Sstevel@tonic-gate 		if (copyout(STRUCT_BUF(ds), arg, STRUCT_SIZE(ds)))
5530Sstevel@tonic-gate 			return (set_errno(EFAULT));
5540Sstevel@tonic-gate 		break;
5550Sstevel@tonic-gate 
5560Sstevel@tonic-gate 	case IPC_STAT64:
5570Sstevel@tonic-gate 		if (copyout(&ds64, arg, sizeof (struct msqid_ds64)))
5580Sstevel@tonic-gate 			return (set_errno(EFAULT));
5590Sstevel@tonic-gate 		break;
5600Sstevel@tonic-gate 	}
5610Sstevel@tonic-gate 
5620Sstevel@tonic-gate 	return (0);
5630Sstevel@tonic-gate }
5640Sstevel@tonic-gate 
5650Sstevel@tonic-gate /*
5660Sstevel@tonic-gate  * Remove all message queues associated with a given zone.  Called by
5670Sstevel@tonic-gate  * zone_shutdown when the zone is halted.
5680Sstevel@tonic-gate  */
5690Sstevel@tonic-gate /*ARGSUSED1*/
5700Sstevel@tonic-gate static void
5710Sstevel@tonic-gate msg_remove_zone(zoneid_t zoneid, void *arg)
5720Sstevel@tonic-gate {
5730Sstevel@tonic-gate 	ipc_remove_zone(msq_svc, zoneid);
5740Sstevel@tonic-gate }
5750Sstevel@tonic-gate 
5760Sstevel@tonic-gate /*
5770Sstevel@tonic-gate  * msgget system call.
5780Sstevel@tonic-gate  */
5790Sstevel@tonic-gate static int
5800Sstevel@tonic-gate msgget(key_t key, int msgflg)
5810Sstevel@tonic-gate {
5820Sstevel@tonic-gate 	kmsqid_t	*qp;
5830Sstevel@tonic-gate 	kmutex_t	*lock;
5840Sstevel@tonic-gate 	int		id, error;
5852565Sudpa 	int		ii;
5860Sstevel@tonic-gate 	proc_t		*pp = curproc;
5870Sstevel@tonic-gate 
5880Sstevel@tonic-gate top:
5890Sstevel@tonic-gate 	if (error = ipc_get(msq_svc, key, msgflg, (kipc_perm_t **)&qp, &lock))
5900Sstevel@tonic-gate 		return (set_errno(error));
5910Sstevel@tonic-gate 
5920Sstevel@tonic-gate 	if (IPC_FREE(&qp->msg_perm)) {
5930Sstevel@tonic-gate 		mutex_exit(lock);
5940Sstevel@tonic-gate 		mutex_exit(&pp->p_lock);
5950Sstevel@tonic-gate 
5960Sstevel@tonic-gate 		list_create(&qp->msg_list, sizeof (struct msg),
5970Sstevel@tonic-gate 		    offsetof(struct msg, msg_node));
5980Sstevel@tonic-gate 		qp->msg_qnum = 0;
5990Sstevel@tonic-gate 		qp->msg_lspid = qp->msg_lrpid = 0;
6000Sstevel@tonic-gate 		qp->msg_stime = qp->msg_rtime = 0;
6010Sstevel@tonic-gate 		qp->msg_ctime = gethrestime_sec();
6024153Sdv142724 		qp->msg_ngt_cnt = 0;
6034153Sdv142724 		qp->msg_neg_copy = 0;
6044153Sdv142724 		for (ii = 0; ii <= MSG_MAX_QNUM; ii++) {
6054153Sdv142724 			list_create(&qp->msg_wait_snd[ii],
6064153Sdv142724 			    sizeof (msgq_wakeup_t),
6074153Sdv142724 			    offsetof(msgq_wakeup_t, msgw_list));
6084153Sdv142724 			list_create(&qp->msg_wait_snd_ngt[ii],
6094153Sdv142724 			    sizeof (msgq_wakeup_t),
6104153Sdv142724 			    offsetof(msgq_wakeup_t, msgw_list));
6114153Sdv142724 		}
6124153Sdv142724 		/*
6134153Sdv142724 		 * The proper initialization of msg_lowest_type is to the
6144153Sdv142724 		 * highest possible value.  By doing this we guarantee that
6154153Sdv142724 		 * when the first send happens, the lowest type will be set
6164153Sdv142724 		 * properly.
6174153Sdv142724 		 */
6186071Sdv142724 		qp->msg_lowest_type = LONG_MAX;
6194153Sdv142724 		list_create(&qp->msg_cpy_block,
6204153Sdv142724 		    sizeof (msgq_wakeup_t),
6214153Sdv142724 		    offsetof(msgq_wakeup_t, msgw_list));
6224153Sdv142724 		qp->msg_fnd_sndr = &msg_fnd_sndr[0];
6234153Sdv142724 		qp->msg_fnd_rdr = &msg_fnd_rdr[0];
6244153Sdv142724 		qp->msg_rcv_cnt = 0;
6252565Sudpa 		qp->msg_snd_cnt = 0;
6260Sstevel@tonic-gate 
6270Sstevel@tonic-gate 		if (error = ipc_commit_begin(msq_svc, key, msgflg,
6280Sstevel@tonic-gate 		    (kipc_perm_t *)qp)) {
6290Sstevel@tonic-gate 			if (error == EAGAIN)
6300Sstevel@tonic-gate 				goto top;
6310Sstevel@tonic-gate 			return (set_errno(error));
6320Sstevel@tonic-gate 		}
6330Sstevel@tonic-gate 		qp->msg_qbytes = rctl_enforced_value(rc_process_msgmnb,
6340Sstevel@tonic-gate 		    pp->p_rctls, pp);
6350Sstevel@tonic-gate 		qp->msg_qmax = rctl_enforced_value(rc_process_msgtql,
6360Sstevel@tonic-gate 		    pp->p_rctls, pp);
6370Sstevel@tonic-gate 		lock = ipc_commit_end(msq_svc, &qp->msg_perm);
6380Sstevel@tonic-gate 	}
6390Sstevel@tonic-gate 	if (audit_active)
6400Sstevel@tonic-gate 		audit_ipcget(AT_IPC_MSG, (void *)qp);
6410Sstevel@tonic-gate 	id = qp->msg_perm.ipc_id;
6420Sstevel@tonic-gate 	mutex_exit(lock);
6430Sstevel@tonic-gate 	return (id);
6440Sstevel@tonic-gate }
6450Sstevel@tonic-gate 
6460Sstevel@tonic-gate static ssize_t
6470Sstevel@tonic-gate msgrcv(int msqid, struct ipcmsgbuf *msgp, size_t msgsz, long msgtyp, int msgflg)
6480Sstevel@tonic-gate {
6490Sstevel@tonic-gate 	struct msg	*smp;	/* ptr to best msg on q */
6500Sstevel@tonic-gate 	kmsqid_t	*qp;	/* ptr to associated q */
6510Sstevel@tonic-gate 	kmutex_t	*lock;
6520Sstevel@tonic-gate 	size_t		xtsz;	/* transfer byte count */
6534153Sdv142724 	int		error = 0;
6540Sstevel@tonic-gate 	int		cvres;
6556071Sdv142724 	uint_t		msg_hash;
6564153Sdv142724 	msgq_wakeup_t	msg_entry;
6570Sstevel@tonic-gate 
6580Sstevel@tonic-gate 	CPU_STATS_ADDQ(CPU, sys, msg, 1);	/* bump msg send/rcv count */
6590Sstevel@tonic-gate 
6604153Sdv142724 	msg_hash = msg_type_hash(msgtyp);
6614153Sdv142724 	if ((lock = ipc_lookup(msq_svc, msqid, (kipc_perm_t **)&qp)) == NULL) {
6620Sstevel@tonic-gate 		return ((ssize_t)set_errno(EINVAL));
6634153Sdv142724 	}
6640Sstevel@tonic-gate 	ipc_hold(msq_svc, (kipc_perm_t *)qp);
6650Sstevel@tonic-gate 
6664153Sdv142724 	if (error = ipcperm_access(&qp->msg_perm, MSG_R, CRED())) {
6670Sstevel@tonic-gate 		goto msgrcv_out;
6684153Sdv142724 	}
6694153Sdv142724 
6704153Sdv142724 	/*
6714153Sdv142724 	 * Various information (including the condvar_t) required for the
6724153Sdv142724 	 * process to sleep is provided by it's stack.
6734153Sdv142724 	 */
6744153Sdv142724 	msg_entry.msgw_thrd = curthread;
6754153Sdv142724 	msg_entry.msgw_snd_wake = 0;
6764153Sdv142724 	msg_entry.msgw_type = msgtyp;
6774153Sdv142724 findmsg:
6784153Sdv142724 	smp = msgrcv_lookup(qp, msgtyp);
6794153Sdv142724 
6804153Sdv142724 	if (smp) {
6814153Sdv142724 		/*
6824153Sdv142724 		 * We found a possible message to copy out.
6834153Sdv142724 		 */
6844153Sdv142724 		if ((smp->msg_flags & MSG_RCVCOPY) == 0) {
6856071Sdv142724 			long t = msg_entry.msgw_snd_wake;
686*6147Sqiao 			long copy_type = smp->msg_type;
687*6147Sqiao 
6884153Sdv142724 			/*
6894153Sdv142724 			 * It is available, attempt to copy it.
6904153Sdv142724 			 */
6914153Sdv142724 			error = msg_copyout(qp, msgtyp, &lock, &xtsz, msgsz,
6924153Sdv142724 			    smp, msgp, msgflg);
6936071Sdv142724 
6946071Sdv142724 			/*
6956071Sdv142724 			 * It is possible to consume a different message
6966071Sdv142724 			 * type then what originally awakened for (negative
6976071Sdv142724 			 * types).  If this happens a check must be done to
6986071Sdv142724 			 * to determine if another receiver is available
6996071Sdv142724 			 * for the waking message type,  Failure to do this
7006071Sdv142724 			 * can result in a message on the queue that can be
7016071Sdv142724 			 * serviced by a sleeping receiver.
7026071Sdv142724 			 */
703*6147Sqiao 			if (!error && t && (copy_type != t))
7046071Sdv142724 				msg_wakeup_rdr(qp, &qp->msg_fnd_sndr, t);
7056071Sdv142724 
7064153Sdv142724 			/*
7074153Sdv142724 			 * Don't forget to wakeup a sleeper that blocked because
7084153Sdv142724 			 * we were copying things out.
7094153Sdv142724 			 */
7104153Sdv142724 			msg_wakeup_rdr(qp, &qp->msg_fnd_rdr, 0);
7114153Sdv142724 			goto msgrcv_out;
7124153Sdv142724 		}
7134153Sdv142724 		/*
7144153Sdv142724 		 * The selected message is being copied out, so block.  We do
7154153Sdv142724 		 * not need to wake the next person up on the msg_cpy_block list
7164153Sdv142724 		 * due to the fact some one is copying out and they will get
7174153Sdv142724 		 * things moving again once the copy is completed.
7184153Sdv142724 		 */
7194153Sdv142724 		cvres = msg_rcvq_sleep(&qp->msg_cpy_block,
7204153Sdv142724 		    &msg_entry, &lock, qp);
7214153Sdv142724 		error = msgq_check_err(qp, cvres);
7224153Sdv142724 		if (error) {
7234153Sdv142724 			goto msgrcv_out;
7244153Sdv142724 		}
7254153Sdv142724 		goto findmsg;
7264153Sdv142724 	}
7274153Sdv142724 	/*
7284153Sdv142724 	 * There isn't a message to copy out that matches the designated
7294153Sdv142724 	 * criteria.
7304153Sdv142724 	 */
7314153Sdv142724 	if (msgflg & IPC_NOWAIT) {
7324153Sdv142724 		error = ENOMSG;
7334153Sdv142724 		goto msgrcv_out;
7344153Sdv142724 	}
7354153Sdv142724 	msg_wakeup_rdr(qp,  &qp->msg_fnd_rdr, 0);
7364153Sdv142724 
7374153Sdv142724 	/*
7384153Sdv142724 	 * Wait for new message.  We keep the negative and positive types
7394153Sdv142724 	 * separate for performance reasons.
7404153Sdv142724 	 */
7414153Sdv142724 	msg_entry.msgw_snd_wake = 0;
7424153Sdv142724 	if (msgtyp >= 0) {
7434153Sdv142724 		cvres = msg_rcvq_sleep(&qp->msg_wait_snd[msg_hash],
7444153Sdv142724 		    &msg_entry, &lock, qp);
7454153Sdv142724 	} else {
7464153Sdv142724 		qp->msg_ngt_cnt++;
7474153Sdv142724 		cvres = msg_rcvq_sleep(&qp->msg_wait_snd_ngt[msg_hash],
7484153Sdv142724 		    &msg_entry, &lock, qp);
7494153Sdv142724 		qp->msg_ngt_cnt--;
7504153Sdv142724 	}
7514153Sdv142724 
7524153Sdv142724 	if (!(error = msgq_check_err(qp, cvres))) {
7534153Sdv142724 		goto findmsg;
7544153Sdv142724 	}
7554153Sdv142724 
7564153Sdv142724 msgrcv_out:
7574153Sdv142724 	if (error) {
7584153Sdv142724 		msg_wakeup_rdr(qp,  &qp->msg_fnd_rdr, 0);
7594153Sdv142724 		if (msg_entry.msgw_snd_wake) {
7604153Sdv142724 			msg_wakeup_rdr(qp, &qp->msg_fnd_sndr,
7614153Sdv142724 			    msg_entry.msgw_snd_wake);
7624153Sdv142724 		}
7634153Sdv142724 		ipc_rele(msq_svc, (kipc_perm_t *)qp);
7644153Sdv142724 		return ((ssize_t)set_errno(error));
7654153Sdv142724 	}
7664153Sdv142724 	ipc_rele(msq_svc, (kipc_perm_t *)qp);
7674153Sdv142724 	return ((ssize_t)xtsz);
7684153Sdv142724 }
7690Sstevel@tonic-gate 
7704153Sdv142724 static int
7714153Sdv142724 msgq_check_err(kmsqid_t *qp, int cvres)
7724153Sdv142724 {
7734153Sdv142724 	if (IPC_FREE(&qp->msg_perm)) {
7744153Sdv142724 		return (EIDRM);
7754153Sdv142724 	}
7764153Sdv142724 
7774153Sdv142724 	if (cvres == 0) {
7784153Sdv142724 		return (EINTR);
7794153Sdv142724 	}
7804153Sdv142724 
7814153Sdv142724 	return (0);
7824153Sdv142724 }
7834153Sdv142724 
7844153Sdv142724 static int
7854153Sdv142724 msg_copyout(kmsqid_t *qp, long msgtyp, kmutex_t **lock, size_t *xtsz_ret,
7864153Sdv142724     size_t msgsz, struct msg *smp, struct ipcmsgbuf *msgp, int msgflg)
7874153Sdv142724 {
7884153Sdv142724 	size_t		xtsz;
7894153Sdv142724 	STRUCT_HANDLE(ipcmsgbuf, umsgp);
7904153Sdv142724 	model_t		mdl = get_udatamodel();
7914153Sdv142724 	int		copyerror = 0;
7924153Sdv142724 
7934153Sdv142724 	STRUCT_SET_HANDLE(umsgp, mdl, msgp);
7944153Sdv142724 	if (msgsz < smp->msg_size) {
7954153Sdv142724 		if ((msgflg & MSG_NOERROR) == 0) {
7964153Sdv142724 			return (E2BIG);
7974153Sdv142724 		} else {
7984153Sdv142724 			xtsz = msgsz;
7994153Sdv142724 		}
8004153Sdv142724 	} else {
8014153Sdv142724 		xtsz = smp->msg_size;
8024153Sdv142724 	}
8034153Sdv142724 	*xtsz_ret = xtsz;
8044153Sdv142724 
8054153Sdv142724 	/*
8064153Sdv142724 	 * To prevent a DOS attack we mark the message as being
8074153Sdv142724 	 * copied out and release mutex.  When the copy is completed
8084153Sdv142724 	 * we need to acquire the mutex and make the appropriate updates.
8094153Sdv142724 	 */
8104153Sdv142724 	ASSERT((smp->msg_flags & MSG_RCVCOPY) == 0);
8114153Sdv142724 	smp->msg_flags |= MSG_RCVCOPY;
8124153Sdv142724 	msg_hold(smp);
8134153Sdv142724 	if (msgtyp < 0) {
8144153Sdv142724 		ASSERT(qp->msg_neg_copy == 0);
8154153Sdv142724 		qp->msg_neg_copy = 1;
8164153Sdv142724 	}
8174153Sdv142724 	mutex_exit(*lock);
8184153Sdv142724 
8194153Sdv142724 	if (mdl == DATAMODEL_NATIVE) {
8204153Sdv142724 		copyerror = copyout(&smp->msg_type, msgp,
8214153Sdv142724 		    sizeof (smp->msg_type));
8224153Sdv142724 	} else {
8234153Sdv142724 		/*
8244153Sdv142724 		 * 32-bit callers need an imploded msg type.
8254153Sdv142724 		 */
8264153Sdv142724 		int32_t	msg_type32 = smp->msg_type;
8274153Sdv142724 
8284153Sdv142724 		copyerror = copyout(&msg_type32, msgp,
8294153Sdv142724 		    sizeof (msg_type32));
8304153Sdv142724 	}
8314153Sdv142724 
8324153Sdv142724 	if (copyerror == 0 && xtsz) {
8334153Sdv142724 		copyerror = copyout(smp->msg_addr,
8344153Sdv142724 		    STRUCT_FADDR(umsgp, mtext), xtsz);
8354153Sdv142724 	}
8364153Sdv142724 
8374153Sdv142724 	/*
8384153Sdv142724 	 * Reclaim the mutex and make sure the message queue still exists.
8394153Sdv142724 	 */
8404153Sdv142724 
8414153Sdv142724 	*lock = ipc_lock(msq_svc, qp->msg_perm.ipc_id);
8424153Sdv142724 	if (msgtyp < 0) {
8434153Sdv142724 		qp->msg_neg_copy = 0;
8444153Sdv142724 	}
8454153Sdv142724 	ASSERT(smp->msg_flags & MSG_RCVCOPY);
8464153Sdv142724 	smp->msg_flags &= ~MSG_RCVCOPY;
8474153Sdv142724 	msg_rele(smp);
8484153Sdv142724 	if (IPC_FREE(&qp->msg_perm)) {
8494153Sdv142724 		return (EIDRM);
8504153Sdv142724 	}
8514153Sdv142724 	if (copyerror) {
8524153Sdv142724 		return (EFAULT);
8534153Sdv142724 	}
8544153Sdv142724 	qp->msg_lrpid = ttoproc(curthread)->p_pid;
8554153Sdv142724 	qp->msg_rtime = gethrestime_sec();
8564153Sdv142724 	msgunlink(qp, smp);
8574153Sdv142724 	return (0);
8584153Sdv142724 }
8594153Sdv142724 
8604153Sdv142724 static struct msg *
8614153Sdv142724 msgrcv_lookup(kmsqid_t *qp, long msgtyp)
8624153Sdv142724 {
8634153Sdv142724 	struct msg 		*smp = NULL;
8646071Sdv142724 	long			qp_low;
8654153Sdv142724 	struct msg		*mp;	/* ptr to msg on q */
8666071Sdv142724 	long			low_msgtype;
8674153Sdv142724 	static struct msg	neg_copy_smp;
8684153Sdv142724 
8690Sstevel@tonic-gate 	mp = list_head(&qp->msg_list);
8700Sstevel@tonic-gate 	if (msgtyp == 0) {
8710Sstevel@tonic-gate 		smp = mp;
8720Sstevel@tonic-gate 	} else {
8734153Sdv142724 		qp_low = qp->msg_lowest_type;
8744153Sdv142724 		if (msgtyp > 0) {
8754153Sdv142724 			/*
8764153Sdv142724 			 * If our lowest possible message type is larger than
8774153Sdv142724 			 * the message type desired, then we know there is
8784153Sdv142724 			 * no entry present.
8794153Sdv142724 			 */
8804153Sdv142724 			if (qp_low > msgtyp) {
8814153Sdv142724 				return (NULL);
8824153Sdv142724 			}
8834153Sdv142724 
8844153Sdv142724 			for (; mp; mp = list_next(&qp->msg_list, mp)) {
8854153Sdv142724 				if (msgtyp == mp->msg_type) {
8864153Sdv142724 					smp = mp;
8874153Sdv142724 					break;
8884153Sdv142724 				}
8890Sstevel@tonic-gate 			}
8904153Sdv142724 		} else {
8914153Sdv142724 			/*
8924153Sdv142724 			 * We have kept track of the lowest possible message
8934153Sdv142724 			 * type on the send queue.  This allows us to terminate
8944153Sdv142724 			 * the search early if we find a message type of that
8954153Sdv142724 			 * type.  Note, the lowest type may not be the actual
8964153Sdv142724 			 * lowest value in the system, it is only guaranteed
8974153Sdv142724 			 * that there isn't a value lower than that.
8984153Sdv142724 			 */
8994153Sdv142724 			low_msgtype = -msgtyp;
9006071Sdv142724 			if (low_msgtype < qp_low) {
9014153Sdv142724 				return (NULL);
9024153Sdv142724 			}
9034153Sdv142724 			if (qp->msg_neg_copy) {
9044153Sdv142724 				neg_copy_smp.msg_flags = MSG_RCVCOPY;
9054153Sdv142724 				return (&neg_copy_smp);
9064153Sdv142724 			}
9074153Sdv142724 			for (; mp; mp = list_next(&qp->msg_list, mp)) {
9086071Sdv142724 				if (mp->msg_type <= low_msgtype &&
9096071Sdv142724 				    !(smp && smp->msg_type <= mp->msg_type)) {
9104153Sdv142724 					smp = mp;
9114153Sdv142724 					low_msgtype = mp->msg_type;
9124153Sdv142724 					if (low_msgtype == qp_low) {
9134153Sdv142724 						break;
9144153Sdv142724 					}
9154153Sdv142724 				}
9164153Sdv142724 			}
9174153Sdv142724 			if (smp) {
9184153Sdv142724 				/*
9194153Sdv142724 				 * Update the lowest message type.
9204153Sdv142724 				 */
9214153Sdv142724 				qp->msg_lowest_type = smp->msg_type;
9220Sstevel@tonic-gate 			}
9230Sstevel@tonic-gate 		}
9240Sstevel@tonic-gate 	}
9254153Sdv142724 	return (smp);
9260Sstevel@tonic-gate }
9270Sstevel@tonic-gate 
9280Sstevel@tonic-gate /*
9290Sstevel@tonic-gate  * msgids system call.
9300Sstevel@tonic-gate  */
9310Sstevel@tonic-gate static int
9320Sstevel@tonic-gate msgids(int *buf, uint_t nids, uint_t *pnids)
9330Sstevel@tonic-gate {
9340Sstevel@tonic-gate 	int error;
9350Sstevel@tonic-gate 
9360Sstevel@tonic-gate 	if (error = ipc_ids(msq_svc, buf, nids, pnids))
9370Sstevel@tonic-gate 		return (set_errno(error));
9380Sstevel@tonic-gate 
9390Sstevel@tonic-gate 	return (0);
9400Sstevel@tonic-gate }
9410Sstevel@tonic-gate 
9420Sstevel@tonic-gate #define	RND(x)		roundup((x), sizeof (size_t))
9430Sstevel@tonic-gate #define	RND32(x)	roundup((x), sizeof (size32_t))
9440Sstevel@tonic-gate 
9450Sstevel@tonic-gate /*
9460Sstevel@tonic-gate  * msgsnap system call.
9470Sstevel@tonic-gate  */
9480Sstevel@tonic-gate static int
9490Sstevel@tonic-gate msgsnap(int msqid, caddr_t buf, size_t bufsz, long msgtyp)
9500Sstevel@tonic-gate {
9510Sstevel@tonic-gate 	struct msg	*mp;	/* ptr to msg on q */
9520Sstevel@tonic-gate 	kmsqid_t	*qp;	/* ptr to associated q */
9530Sstevel@tonic-gate 	kmutex_t	*lock;
9540Sstevel@tonic-gate 	size_t		size;
9550Sstevel@tonic-gate 	size_t		nmsg;
9560Sstevel@tonic-gate 	struct msg	**snaplist;
9570Sstevel@tonic-gate 	int		error, i;
9580Sstevel@tonic-gate 	model_t		mdl = get_udatamodel();
9590Sstevel@tonic-gate 	STRUCT_DECL(msgsnap_head, head);
9600Sstevel@tonic-gate 	STRUCT_DECL(msgsnap_mhead, mhead);
9610Sstevel@tonic-gate 
9620Sstevel@tonic-gate 	STRUCT_INIT(head, mdl);
9630Sstevel@tonic-gate 	STRUCT_INIT(mhead, mdl);
9640Sstevel@tonic-gate 
9650Sstevel@tonic-gate 	if (bufsz < STRUCT_SIZE(head))
9660Sstevel@tonic-gate 		return (set_errno(EINVAL));
9670Sstevel@tonic-gate 
9680Sstevel@tonic-gate 	if ((lock = ipc_lookup(msq_svc, msqid, (kipc_perm_t **)&qp)) == NULL)
9690Sstevel@tonic-gate 		return (set_errno(EINVAL));
9700Sstevel@tonic-gate 
9710Sstevel@tonic-gate 	if (error = ipcperm_access(&qp->msg_perm, MSG_R, CRED())) {
9720Sstevel@tonic-gate 		mutex_exit(lock);
9730Sstevel@tonic-gate 		return (set_errno(error));
9740Sstevel@tonic-gate 	}
9750Sstevel@tonic-gate 	ipc_hold(msq_svc, (kipc_perm_t *)qp);
9760Sstevel@tonic-gate 
9770Sstevel@tonic-gate 	/*
9780Sstevel@tonic-gate 	 * First compute the required buffer size and
9790Sstevel@tonic-gate 	 * the number of messages on the queue.
9800Sstevel@tonic-gate 	 */
9810Sstevel@tonic-gate 	size = nmsg = 0;
9820Sstevel@tonic-gate 	for (mp = list_head(&qp->msg_list); mp;
9830Sstevel@tonic-gate 	    mp = list_next(&qp->msg_list, mp)) {
9840Sstevel@tonic-gate 		if (msgtyp == 0 ||
9850Sstevel@tonic-gate 		    (msgtyp > 0 && msgtyp == mp->msg_type) ||
9860Sstevel@tonic-gate 		    (msgtyp < 0 && mp->msg_type <= -msgtyp)) {
9870Sstevel@tonic-gate 			nmsg++;
9880Sstevel@tonic-gate 			if (mdl == DATAMODEL_NATIVE)
9890Sstevel@tonic-gate 				size += RND(mp->msg_size);
9900Sstevel@tonic-gate 			else
9910Sstevel@tonic-gate 				size += RND32(mp->msg_size);
9920Sstevel@tonic-gate 		}
9930Sstevel@tonic-gate 	}
9940Sstevel@tonic-gate 
9950Sstevel@tonic-gate 	size += STRUCT_SIZE(head) + nmsg * STRUCT_SIZE(mhead);
9960Sstevel@tonic-gate 	if (size > bufsz)
9970Sstevel@tonic-gate 		nmsg = 0;
9980Sstevel@tonic-gate 
9990Sstevel@tonic-gate 	if (nmsg > 0) {
10000Sstevel@tonic-gate 		/*
10010Sstevel@tonic-gate 		 * Mark the messages as being copied.
10020Sstevel@tonic-gate 		 */
10030Sstevel@tonic-gate 		snaplist = (struct msg **)kmem_alloc(nmsg *
10040Sstevel@tonic-gate 		    sizeof (struct msg *), KM_SLEEP);
10050Sstevel@tonic-gate 		i = 0;
10060Sstevel@tonic-gate 		for (mp = list_head(&qp->msg_list); mp;
10070Sstevel@tonic-gate 		    mp = list_next(&qp->msg_list, mp)) {
10080Sstevel@tonic-gate 			if (msgtyp == 0 ||
10090Sstevel@tonic-gate 			    (msgtyp > 0 && msgtyp == mp->msg_type) ||
10100Sstevel@tonic-gate 			    (msgtyp < 0 && mp->msg_type <= -msgtyp)) {
10110Sstevel@tonic-gate 				msg_hold(mp);
10120Sstevel@tonic-gate 				snaplist[i] = mp;
10130Sstevel@tonic-gate 				i++;
10140Sstevel@tonic-gate 			}
10150Sstevel@tonic-gate 		}
10160Sstevel@tonic-gate 	}
10170Sstevel@tonic-gate 	mutex_exit(lock);
10180Sstevel@tonic-gate 
10190Sstevel@tonic-gate 	/*
10200Sstevel@tonic-gate 	 * Copy out the buffer header.
10210Sstevel@tonic-gate 	 */
10220Sstevel@tonic-gate 	STRUCT_FSET(head, msgsnap_size, size);
10230Sstevel@tonic-gate 	STRUCT_FSET(head, msgsnap_nmsg, nmsg);
10240Sstevel@tonic-gate 	if (copyout(STRUCT_BUF(head), buf, STRUCT_SIZE(head)))
10250Sstevel@tonic-gate 		error = EFAULT;
10260Sstevel@tonic-gate 
10270Sstevel@tonic-gate 	buf += STRUCT_SIZE(head);
10280Sstevel@tonic-gate 
10290Sstevel@tonic-gate 	/*
10300Sstevel@tonic-gate 	 * Now copy out the messages one by one.
10310Sstevel@tonic-gate 	 */
10320Sstevel@tonic-gate 	for (i = 0; i < nmsg; i++) {
10330Sstevel@tonic-gate 		mp = snaplist[i];
10340Sstevel@tonic-gate 		if (error == 0) {
10350Sstevel@tonic-gate 			STRUCT_FSET(mhead, msgsnap_mlen, mp->msg_size);
10360Sstevel@tonic-gate 			STRUCT_FSET(mhead, msgsnap_mtype, mp->msg_type);
10370Sstevel@tonic-gate 			if (copyout(STRUCT_BUF(mhead), buf, STRUCT_SIZE(mhead)))
10380Sstevel@tonic-gate 				error = EFAULT;
10390Sstevel@tonic-gate 			buf += STRUCT_SIZE(mhead);
10400Sstevel@tonic-gate 
10410Sstevel@tonic-gate 			if (error == 0 &&
10420Sstevel@tonic-gate 			    mp->msg_size != 0 &&
10430Sstevel@tonic-gate 			    copyout(mp->msg_addr, buf, mp->msg_size))
10440Sstevel@tonic-gate 				error = EFAULT;
10450Sstevel@tonic-gate 			if (mdl == DATAMODEL_NATIVE)
10460Sstevel@tonic-gate 				buf += RND(mp->msg_size);
10470Sstevel@tonic-gate 			else
10480Sstevel@tonic-gate 				buf += RND32(mp->msg_size);
10490Sstevel@tonic-gate 		}
10500Sstevel@tonic-gate 		lock = ipc_lock(msq_svc, qp->msg_perm.ipc_id);
10510Sstevel@tonic-gate 		msg_rele(mp);
10520Sstevel@tonic-gate 		/* Check for msg q deleted or reallocated */
10530Sstevel@tonic-gate 		if (IPC_FREE(&qp->msg_perm))
10540Sstevel@tonic-gate 			error = EIDRM;
10550Sstevel@tonic-gate 		mutex_exit(lock);
10560Sstevel@tonic-gate 	}
10570Sstevel@tonic-gate 
10580Sstevel@tonic-gate 	(void) ipc_lock(msq_svc, qp->msg_perm.ipc_id);
10590Sstevel@tonic-gate 	ipc_rele(msq_svc, (kipc_perm_t *)qp);
10600Sstevel@tonic-gate 
10610Sstevel@tonic-gate 	if (nmsg > 0)
10620Sstevel@tonic-gate 		kmem_free(snaplist, nmsg * sizeof (struct msg *));
10630Sstevel@tonic-gate 
10640Sstevel@tonic-gate 	if (error)
10650Sstevel@tonic-gate 		return (set_errno(error));
10660Sstevel@tonic-gate 	return (0);
10670Sstevel@tonic-gate }
10680Sstevel@tonic-gate 
10692983Sdv142724 #define	MSG_PREALLOC_LIMIT 8192
10702983Sdv142724 
10710Sstevel@tonic-gate /*
10720Sstevel@tonic-gate  * msgsnd system call.
10730Sstevel@tonic-gate  */
10740Sstevel@tonic-gate static int
10750Sstevel@tonic-gate msgsnd(int msqid, struct ipcmsgbuf *msgp, size_t msgsz, int msgflg)
10760Sstevel@tonic-gate {
10770Sstevel@tonic-gate 	kmsqid_t	*qp;
10782983Sdv142724 	kmutex_t	*lock = NULL;
10790Sstevel@tonic-gate 	struct msg	*mp = NULL;
10800Sstevel@tonic-gate 	long		type;
10810Sstevel@tonic-gate 	int		error = 0;
10820Sstevel@tonic-gate 	model_t		mdl = get_udatamodel();
10830Sstevel@tonic-gate 	STRUCT_HANDLE(ipcmsgbuf, umsgp);
10840Sstevel@tonic-gate 
10850Sstevel@tonic-gate 	CPU_STATS_ADDQ(CPU, sys, msg, 1);	/* bump msg send/rcv count */
10860Sstevel@tonic-gate 	STRUCT_SET_HANDLE(umsgp, mdl, msgp);
10870Sstevel@tonic-gate 
10880Sstevel@tonic-gate 	if (mdl == DATAMODEL_NATIVE) {
10890Sstevel@tonic-gate 		if (copyin(msgp, &type, sizeof (type)))
10900Sstevel@tonic-gate 			return (set_errno(EFAULT));
10910Sstevel@tonic-gate 	} else {
10920Sstevel@tonic-gate 		int32_t	type32;
10930Sstevel@tonic-gate 		if (copyin(msgp, &type32, sizeof (type32)))
10940Sstevel@tonic-gate 			return (set_errno(EFAULT));
10950Sstevel@tonic-gate 		type = type32;
10960Sstevel@tonic-gate 	}
10970Sstevel@tonic-gate 
10980Sstevel@tonic-gate 	if (type < 1)
10990Sstevel@tonic-gate 		return (set_errno(EINVAL));
11000Sstevel@tonic-gate 
11012983Sdv142724 	/*
11022983Sdv142724 	 * We want the value here large enough that most of the
11032983Sdv142724 	 * the message operations will use the "lockless" path,
11042983Sdv142724 	 * but small enough that a user can not reserve large
11052983Sdv142724 	 * chunks of kernel memory unless they have a valid
11062983Sdv142724 	 * reason to.
11072983Sdv142724 	 */
11082983Sdv142724 	if (msgsz <= MSG_PREALLOC_LIMIT) {
11092983Sdv142724 		/*
11102983Sdv142724 		 * We are small enough that we can afford to do the
11112983Sdv142724 		 * allocation now.  This saves dropping the lock
11122983Sdv142724 		 * and then reacquiring the lock.
11132983Sdv142724 		 */
11142983Sdv142724 		mp = kmem_zalloc(sizeof (struct msg), KM_SLEEP);
11152983Sdv142724 		mp->msg_copycnt = 1;
11162983Sdv142724 		mp->msg_size = msgsz;
11172983Sdv142724 		if (msgsz) {
11182983Sdv142724 			mp->msg_addr = kmem_alloc(msgsz, KM_SLEEP);
11192983Sdv142724 			if (copyin(STRUCT_FADDR(umsgp, mtext),
11202983Sdv142724 			    mp->msg_addr, msgsz) == -1) {
11212983Sdv142724 				error = EFAULT;
11222983Sdv142724 				goto msgsnd_out;
11232983Sdv142724 			}
11242983Sdv142724 		}
11252983Sdv142724 	}
11262983Sdv142724 
11272983Sdv142724 	if ((lock = ipc_lookup(msq_svc, msqid, (kipc_perm_t **)&qp)) == NULL) {
11282983Sdv142724 		error = EINVAL;
11292983Sdv142724 		goto msgsnd_out;
11302983Sdv142724 	}
11312983Sdv142724 
11320Sstevel@tonic-gate 	ipc_hold(msq_svc, (kipc_perm_t *)qp);
11330Sstevel@tonic-gate 
11340Sstevel@tonic-gate 	if (msgsz > qp->msg_qbytes) {
11350Sstevel@tonic-gate 		error = EINVAL;
11360Sstevel@tonic-gate 		goto msgsnd_out;
11370Sstevel@tonic-gate 	}
11380Sstevel@tonic-gate 
11390Sstevel@tonic-gate 	if (error = ipcperm_access(&qp->msg_perm, MSG_W, CRED()))
11400Sstevel@tonic-gate 		goto msgsnd_out;
11410Sstevel@tonic-gate 
11420Sstevel@tonic-gate top:
11430Sstevel@tonic-gate 	/*
11440Sstevel@tonic-gate 	 * Allocate space on q, message header, & buffer space.
11450Sstevel@tonic-gate 	 */
11460Sstevel@tonic-gate 	ASSERT(qp->msg_qnum <= qp->msg_qmax);
11470Sstevel@tonic-gate 	while ((msgsz > qp->msg_qbytes - qp->msg_cbytes) ||
11480Sstevel@tonic-gate 	    (qp->msg_qnum == qp->msg_qmax)) {
11490Sstevel@tonic-gate 		int cvres;
11500Sstevel@tonic-gate 
11510Sstevel@tonic-gate 		if (msgflg & IPC_NOWAIT) {
11520Sstevel@tonic-gate 			error = EAGAIN;
11530Sstevel@tonic-gate 			goto msgsnd_out;
11540Sstevel@tonic-gate 		}
11550Sstevel@tonic-gate 
11560Sstevel@tonic-gate 		qp->msg_snd_cnt++;
11570Sstevel@tonic-gate 		cvres = cv_wait_sig(&qp->msg_snd_cv, lock);
11580Sstevel@tonic-gate 		lock = ipc_relock(msq_svc, qp->msg_perm.ipc_id, lock);
11590Sstevel@tonic-gate 		qp->msg_snd_cnt--;
11600Sstevel@tonic-gate 
11614153Sdv142724 		if (error = msgq_check_err(qp, cvres)) {
11620Sstevel@tonic-gate 			goto msgsnd_out;
11630Sstevel@tonic-gate 		}
11640Sstevel@tonic-gate 	}
11650Sstevel@tonic-gate 
11660Sstevel@tonic-gate 	if (mp == NULL) {
11670Sstevel@tonic-gate 		int failure;
11680Sstevel@tonic-gate 
11690Sstevel@tonic-gate 		mutex_exit(lock);
11702983Sdv142724 		ASSERT(msgsz > 0);
11710Sstevel@tonic-gate 		mp = kmem_zalloc(sizeof (struct msg), KM_SLEEP);
11722983Sdv142724 		mp->msg_addr = kmem_alloc(msgsz, KM_SLEEP);
11730Sstevel@tonic-gate 		mp->msg_size = msgsz;
11740Sstevel@tonic-gate 		mp->msg_copycnt = 1;
11750Sstevel@tonic-gate 
11762983Sdv142724 		failure = (copyin(STRUCT_FADDR(umsgp, mtext),
11770Sstevel@tonic-gate 		    mp->msg_addr, msgsz) == -1);
11780Sstevel@tonic-gate 		lock = ipc_lock(msq_svc, qp->msg_perm.ipc_id);
11790Sstevel@tonic-gate 		if (IPC_FREE(&qp->msg_perm)) {
11800Sstevel@tonic-gate 			error = EIDRM;
11810Sstevel@tonic-gate 			goto msgsnd_out;
11820Sstevel@tonic-gate 		}
11830Sstevel@tonic-gate 		if (failure) {
11840Sstevel@tonic-gate 			error = EFAULT;
11850Sstevel@tonic-gate 			goto msgsnd_out;
11860Sstevel@tonic-gate 		}
11870Sstevel@tonic-gate 		goto top;
11880Sstevel@tonic-gate 	}
11890Sstevel@tonic-gate 
11900Sstevel@tonic-gate 	/*
11910Sstevel@tonic-gate 	 * Everything is available, put msg on q.
11920Sstevel@tonic-gate 	 */
11930Sstevel@tonic-gate 	qp->msg_qnum++;
11940Sstevel@tonic-gate 	qp->msg_cbytes += msgsz;
11950Sstevel@tonic-gate 	qp->msg_lspid = curproc->p_pid;
11960Sstevel@tonic-gate 	qp->msg_stime = gethrestime_sec();
11970Sstevel@tonic-gate 	mp->msg_type = type;
11984153Sdv142724 	if (qp->msg_lowest_type > type)
11994153Sdv142724 		qp->msg_lowest_type = type;
12000Sstevel@tonic-gate 	list_insert_tail(&qp->msg_list, mp);
12012565Sudpa 	/*
12024153Sdv142724 	 * Get the proper receiver going.
12032565Sudpa 	 */
12044153Sdv142724 	msg_wakeup_rdr(qp, &qp->msg_fnd_sndr, type);
12050Sstevel@tonic-gate 
12060Sstevel@tonic-gate msgsnd_out:
12072983Sdv142724 	if (lock)
12082983Sdv142724 		ipc_rele(msq_svc, (kipc_perm_t *)qp);	/* drops lock */
12090Sstevel@tonic-gate 
12100Sstevel@tonic-gate 	if (error) {
12110Sstevel@tonic-gate 		if (mp)
12120Sstevel@tonic-gate 			msg_rele(mp);
12130Sstevel@tonic-gate 		return (set_errno(error));
12140Sstevel@tonic-gate 	}
12150Sstevel@tonic-gate 
12160Sstevel@tonic-gate 	return (0);
12170Sstevel@tonic-gate }
12180Sstevel@tonic-gate 
12194153Sdv142724 static void
12204153Sdv142724 msg_wakeup_rdr(kmsqid_t *qp, msg_select_t **flist, long type)
12214153Sdv142724 {
12224153Sdv142724 	msg_select_t	*walker = *flist;
12234153Sdv142724 	msgq_wakeup_t	*wakeup;
12246071Sdv142724 	uint_t		msg_hash;
12254153Sdv142724 
12264153Sdv142724 	msg_hash = msg_type_hash(type);
12274153Sdv142724 
12284153Sdv142724 	do {
12294153Sdv142724 		wakeup = walker->selection(qp, msg_hash, type);
12304153Sdv142724 		walker = walker->next_selection;
12314153Sdv142724 	} while (!wakeup && walker != *flist);
12324153Sdv142724 
12334153Sdv142724 	*flist = (*flist)->next_selection;
12344153Sdv142724 	if (wakeup) {
12354153Sdv142724 		if (type) {
12364153Sdv142724 			wakeup->msgw_snd_wake = type;
12374153Sdv142724 		}
12384153Sdv142724 		cv_signal(&wakeup->msgw_wake_cv);
12394153Sdv142724 	}
12404153Sdv142724 }
12414153Sdv142724 
12426071Sdv142724 static uint_t
12434153Sdv142724 msg_type_hash(long msg_type)
12444153Sdv142724 {
12454153Sdv142724 	if (msg_type < 0) {
12466071Sdv142724 		long	hash = -msg_type / MSG_NEG_INTERVAL;
12474153Sdv142724 		/*
12484153Sdv142724 		 * Negative message types are hashed over an
12494153Sdv142724 		 * interval.  Any message type that hashes
12504153Sdv142724 		 * beyond MSG_MAX_QNUM is automatically placed
12514153Sdv142724 		 * in the last bucket.
12524153Sdv142724 		 */
12536071Sdv142724 		if (hash > MSG_MAX_QNUM)
12544153Sdv142724 			hash = MSG_MAX_QNUM;
12554153Sdv142724 		return (hash);
12564153Sdv142724 	}
12574153Sdv142724 
12584153Sdv142724 	/*
12594153Sdv142724 	 * 0 or positive message type.  The first bucket is reserved for
12604153Sdv142724 	 * message receivers of type 0, the other buckets we hash into.
12614153Sdv142724 	 */
12626071Sdv142724 	if (msg_type)
12636071Sdv142724 		return (1 + (msg_type % MSG_MAX_QNUM));
12644153Sdv142724 	return (0);
12654153Sdv142724 }
12664153Sdv142724 
12674153Sdv142724 /*
12684153Sdv142724  * Routines to see if we have a receiver of type 0 either blocked waiting
12694153Sdv142724  * for a message.  Simply return the first guy on the list.
12704153Sdv142724  */
12714153Sdv142724 
12724153Sdv142724 static msgq_wakeup_t *
12736071Sdv142724 /* ARGSUSED */
12744153Sdv142724 msg_fnd_any_snd(kmsqid_t *qp, int msg_hash, long type)
12754153Sdv142724 {
12766071Sdv142724 	msgq_wakeup_t	*walker;
12776071Sdv142724 
12786071Sdv142724 	walker = list_head(&qp->msg_wait_snd[0]);
12796071Sdv142724 
12806071Sdv142724 	if (walker)
12816071Sdv142724 		list_remove(&qp->msg_wait_snd[0], walker);
12826071Sdv142724 	return (walker);
12834153Sdv142724 }
12844153Sdv142724 
12854153Sdv142724 static msgq_wakeup_t *
12866071Sdv142724 /* ARGSUSED */
12874153Sdv142724 msg_fnd_any_rdr(kmsqid_t *qp, int msg_hash, long type)
12884153Sdv142724 {
12896071Sdv142724 	msgq_wakeup_t	*walker;
12906071Sdv142724 
12916071Sdv142724 	walker = list_head(&qp->msg_cpy_block);
12926071Sdv142724 	if (walker)
12936071Sdv142724 		list_remove(&qp->msg_cpy_block, walker);
12946071Sdv142724 	return (walker);
12954153Sdv142724 }
12964153Sdv142724 
12974153Sdv142724 static msgq_wakeup_t *
12984153Sdv142724 msg_fnd_spc_snd(kmsqid_t *qp, int msg_hash, long type)
12994153Sdv142724 {
13004153Sdv142724 	msgq_wakeup_t	*walker;
13014153Sdv142724 
13024153Sdv142724 	walker = list_head(&qp->msg_wait_snd[msg_hash]);
13034153Sdv142724 
13046071Sdv142724 	while (walker && walker->msgw_type != type)
13056071Sdv142724 		walker = list_next(&qp->msg_wait_snd[msg_hash], walker);
13066071Sdv142724 	if (walker)
13076071Sdv142724 		list_remove(&qp->msg_wait_snd[msg_hash], walker);
13084153Sdv142724 	return (walker);
13094153Sdv142724 }
13104153Sdv142724 
13116071Sdv142724 /* ARGSUSED */
13124153Sdv142724 static msgq_wakeup_t *
13134153Sdv142724 msg_fnd_neg_snd(kmsqid_t *qp, int msg_hash, long type)
13144153Sdv142724 {
13154153Sdv142724 	msgq_wakeup_t	*qptr;
13164153Sdv142724 	int		count;
13174153Sdv142724 	int		check_index;
13184153Sdv142724 	int		neg_index;
13194153Sdv142724 	int		nbuckets;
13204153Sdv142724 
13214153Sdv142724 	if (!qp->msg_ngt_cnt) {
13224153Sdv142724 		return (NULL);
13234153Sdv142724 	}
13244153Sdv142724 	neg_index = msg_type_hash(-type);
13254153Sdv142724 
13264153Sdv142724 	/*
13274153Sdv142724 	 * Check for a match among the negative type queues.  Any buckets
13284153Sdv142724 	 * at neg_index or larger can match the type.  Use the last send
13294153Sdv142724 	 * time to randomize the starting bucket to prevent starvation.
13304153Sdv142724 	 * Search all buckets from neg_index to MSG_MAX_QNUM, starting
13314153Sdv142724 	 * from the random starting point, and wrapping around after
13324153Sdv142724 	 * MSG_MAX_QNUM.
13334153Sdv142724 	 */
13344153Sdv142724 
13354153Sdv142724 	nbuckets = MSG_MAX_QNUM - neg_index + 1;
13364153Sdv142724 	check_index = neg_index + (qp->msg_stime % nbuckets);
13374153Sdv142724 
13384153Sdv142724 	for (count = nbuckets; count > 0; count--) {
13394153Sdv142724 		qptr = list_head(&qp->msg_wait_snd_ngt[check_index]);
13404153Sdv142724 		while (qptr) {
13414153Sdv142724 			/*
13424153Sdv142724 			 * The lowest hash bucket may actually contain
13434153Sdv142724 			 * message types that are not valid for this
13444153Sdv142724 			 * request.  This can happen due to the fact that
13454153Sdv142724 			 * the message buckets actually contain a consecutive
13464153Sdv142724 			 * range of types.
13474153Sdv142724 			 */
13484153Sdv142724 			if (-qptr->msgw_type >= type) {
13496071Sdv142724 				list_remove(&qp->msg_wait_snd_ngt[check_index],
13506071Sdv142724 				    qptr);
13514153Sdv142724 				return (qptr);
13524153Sdv142724 			}
13536071Sdv142724 			qptr = list_next(&qp->msg_wait_snd_ngt[check_index],
13546071Sdv142724 			    qptr);
13554153Sdv142724 		}
13564153Sdv142724 		if (++check_index > MSG_MAX_QNUM) {
13574153Sdv142724 			check_index = neg_index;
13584153Sdv142724 		}
13594153Sdv142724 	}
13604153Sdv142724 	return (NULL);
13614153Sdv142724 }
13624153Sdv142724 
13634153Sdv142724 static int
13644153Sdv142724 msg_rcvq_sleep(list_t *queue, msgq_wakeup_t *entry, kmutex_t **lock,
13654153Sdv142724     kmsqid_t *qp)
13664153Sdv142724 {
13674153Sdv142724 	int		cvres;
13684153Sdv142724 
13694153Sdv142724 	cv_init(&entry->msgw_wake_cv, NULL, 0, NULL);
13704153Sdv142724 
13714153Sdv142724 	list_insert_tail(queue, entry);
13724153Sdv142724 
13734153Sdv142724 	qp->msg_rcv_cnt++;
13744153Sdv142724 	cvres = cv_wait_sig(&entry->msgw_wake_cv, *lock);
13754153Sdv142724 	*lock = ipc_relock(msq_svc, qp->msg_perm.ipc_id, *lock);
13764153Sdv142724 	qp->msg_rcv_cnt--;
13776071Sdv142724 
13786071Sdv142724 	if (list_link_active(&entry->msgw_list)) {
13796071Sdv142724 		/*
13806071Sdv142724 		 * We woke up unexpectedly, remove ourself.
13816071Sdv142724 		 */
13826071Sdv142724 		list_remove(queue, entry);
13836071Sdv142724 	}
13844153Sdv142724 
13854153Sdv142724 	return (cvres);
13864153Sdv142724 }
13874153Sdv142724 
13884153Sdv142724 static void
13894153Sdv142724 msg_rcvq_wakeup_all(list_t *q_ptr)
13904153Sdv142724 {
13914153Sdv142724 	msgq_wakeup_t	*q_walk;
13924153Sdv142724 
13936071Sdv142724 	while (q_walk = list_head(q_ptr)) {
13946071Sdv142724 		list_remove(q_ptr, q_walk);
13954153Sdv142724 		cv_signal(&q_walk->msgw_wake_cv);
13964153Sdv142724 	}
13974153Sdv142724 }
13984153Sdv142724 
13990Sstevel@tonic-gate /*
14000Sstevel@tonic-gate  * msgsys - System entry point for msgctl, msgget, msgrcv, and msgsnd
14010Sstevel@tonic-gate  * system calls.
14020Sstevel@tonic-gate  */
14030Sstevel@tonic-gate static ssize_t
14040Sstevel@tonic-gate msgsys(int opcode, uintptr_t a1, uintptr_t a2, uintptr_t a3,
14050Sstevel@tonic-gate 	uintptr_t a4, uintptr_t a5)
14060Sstevel@tonic-gate {
14070Sstevel@tonic-gate 	ssize_t error;
14080Sstevel@tonic-gate 
14090Sstevel@tonic-gate 	switch (opcode) {
14100Sstevel@tonic-gate 	case MSGGET:
14110Sstevel@tonic-gate 		error = msgget((key_t)a1, (int)a2);
14120Sstevel@tonic-gate 		break;
14130Sstevel@tonic-gate 	case MSGCTL:
14140Sstevel@tonic-gate 		error = msgctl((int)a1, (int)a2, (void *)a3);
14150Sstevel@tonic-gate 		break;
14160Sstevel@tonic-gate 	case MSGRCV:
14170Sstevel@tonic-gate 		error = msgrcv((int)a1, (struct ipcmsgbuf *)a2,
14180Sstevel@tonic-gate 		    (size_t)a3, (long)a4, (int)a5);
14190Sstevel@tonic-gate 		break;
14200Sstevel@tonic-gate 	case MSGSND:
14210Sstevel@tonic-gate 		error = msgsnd((int)a1, (struct ipcmsgbuf *)a2,
14220Sstevel@tonic-gate 		    (size_t)a3, (int)a4);
14230Sstevel@tonic-gate 		break;
14240Sstevel@tonic-gate 	case MSGIDS:
14250Sstevel@tonic-gate 		error = msgids((int *)a1, (uint_t)a2, (uint_t *)a3);
14260Sstevel@tonic-gate 		break;
14270Sstevel@tonic-gate 	case MSGSNAP:
14280Sstevel@tonic-gate 		error = msgsnap((int)a1, (caddr_t)a2, (size_t)a3, (long)a4);
14290Sstevel@tonic-gate 		break;
14300Sstevel@tonic-gate 	default:
14310Sstevel@tonic-gate 		error = set_errno(EINVAL);
14320Sstevel@tonic-gate 		break;
14330Sstevel@tonic-gate 	}
14340Sstevel@tonic-gate 
14350Sstevel@tonic-gate 	return (error);
14360Sstevel@tonic-gate }
14370Sstevel@tonic-gate 
#ifdef	_SYSCALL32_IMPL
/*
 * msgsys32 - System entry point for the msgget, msgctl, msgrcv, msgsnd,
 * msgids, and msgsnap system calls for 32-bit callers on LP64 kernel.
 *
 * Each 32-bit argument is widened to the type the selected handler
 * expects: pointer arguments go through uintptr_t, and the arguments
 * cast through int32_t (the msgrcv/msgsnap long argument and the
 * msgsnd size) are sign-extended.  An unrecognized opcode yields
 * set_errno(EINVAL).
 */
static ssize32_t
msgsys32(int opcode, uint32_t a1, uint32_t a2, uint32_t a3,
	uint32_t a4, uint32_t a5)
{
	switch (opcode) {
	case MSGGET:
		return (msgget((key_t)a1, (int)a2));

	case MSGCTL:
		return (msgctl((int)a1, (int)a2, (void *)(uintptr_t)a3));

	case MSGRCV:
		return (msgrcv((int)a1, (struct ipcmsgbuf *)(uintptr_t)a2,
		    (size_t)a3, (long)(int32_t)a4, (int)a5));

	case MSGSND:
		return (msgsnd((int)a1, (struct ipcmsgbuf *)(uintptr_t)a2,
		    (size_t)(int32_t)a3, (int)a4));

	case MSGIDS:
		return (msgids((int *)(uintptr_t)a1, (uint_t)a2,
		    (uint_t *)(uintptr_t)a3));

	case MSGSNAP:
		return (msgsnap((int)a1, (caddr_t)(uintptr_t)a2, (size_t)a3,
		    (long)(int32_t)a4));

	default:
		return (set_errno(EINVAL));
	}
}
#endif	/* _SYSCALL32_IMPL */
1480