xref: /onnv-gate/usr/src/uts/common/os/zone.c (revision 12725:334fd88ae67c)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
51676Sjpk  * Common Development and Distribution License (the "License").
61676Sjpk  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
21390Sraf 
220Sstevel@tonic-gate /*
2312273SCasper.Dik@Sun.COM  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate /*
270Sstevel@tonic-gate  * Zones
280Sstevel@tonic-gate  *
290Sstevel@tonic-gate  *   A zone is a named collection of processes, namespace constraints,
300Sstevel@tonic-gate  *   and other system resources which comprise a secure and manageable
310Sstevel@tonic-gate  *   application containment facility.
320Sstevel@tonic-gate  *
330Sstevel@tonic-gate  *   Zones (represented by the reference counted zone_t) are tracked in
340Sstevel@tonic-gate  *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
350Sstevel@tonic-gate  *   (zoneid_t) are used to track zone association.  Zone IDs are
360Sstevel@tonic-gate  *   dynamically generated when the zone is created; if a persistent
370Sstevel@tonic-gate  *   identifier is needed (core files, accounting logs, audit trail,
380Sstevel@tonic-gate  *   etc.), the zone name should be used.
390Sstevel@tonic-gate  *
400Sstevel@tonic-gate  *
410Sstevel@tonic-gate  *   Global Zone:
420Sstevel@tonic-gate  *
430Sstevel@tonic-gate  *   The global zone (zoneid 0) is automatically associated with all
440Sstevel@tonic-gate  *   system resources that have not been bound to a user-created zone.
450Sstevel@tonic-gate  *   This means that even systems where zones are not in active use
460Sstevel@tonic-gate  *   have a global zone, and all processes, mounts, etc. are
470Sstevel@tonic-gate  *   associated with that zone.  The global zone is generally
480Sstevel@tonic-gate  *   unconstrained in terms of privileges and access, though the usual
490Sstevel@tonic-gate  *   credential and privilege based restrictions apply.
500Sstevel@tonic-gate  *
510Sstevel@tonic-gate  *
520Sstevel@tonic-gate  *   Zone States:
530Sstevel@tonic-gate  *
 *   The states a zone may be in, and the transitions between them, are as
 *   follows:
560Sstevel@tonic-gate  *
570Sstevel@tonic-gate  *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
580Sstevel@tonic-gate  *   initialized zone is added to the list of active zones on the system but
590Sstevel@tonic-gate  *   isn't accessible.
600Sstevel@tonic-gate  *
615880Snordmark  *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
625880Snordmark  *   not yet completed. Not possible to enter the zone, but attributes can
635880Snordmark  *   be retrieved.
645880Snordmark  *
650Sstevel@tonic-gate  *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
660Sstevel@tonic-gate  *   ready.  The zone is made visible after the ZSD constructor callbacks are
670Sstevel@tonic-gate  *   executed.  A zone remains in this state until it transitions into
680Sstevel@tonic-gate  *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
690Sstevel@tonic-gate  *
 *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
 *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
 *   state.
730Sstevel@tonic-gate  *
740Sstevel@tonic-gate  *   ZONE_IS_RUNNING: The zone is open for business: zsched has
750Sstevel@tonic-gate  *   successfully started init.   A zone remains in this state until
760Sstevel@tonic-gate  *   zone_shutdown() is called.
770Sstevel@tonic-gate  *
780Sstevel@tonic-gate  *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
790Sstevel@tonic-gate  *   killing all processes running in the zone. The zone remains
800Sstevel@tonic-gate  *   in this state until there are no more user processes running in the zone.
810Sstevel@tonic-gate  *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
820Sstevel@tonic-gate  *   Since zone_shutdown() is restartable, it may be called successfully
830Sstevel@tonic-gate  *   multiple times for the same zone_t.  Setting of the zone's state to
840Sstevel@tonic-gate  *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
850Sstevel@tonic-gate  *   the zone's status without worrying about it being a moving target.
860Sstevel@tonic-gate  *
870Sstevel@tonic-gate  *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
880Sstevel@tonic-gate  *   are no more user processes in the zone.  The zone remains in this
890Sstevel@tonic-gate  *   state until there are no more kernel threads associated with the
900Sstevel@tonic-gate  *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
910Sstevel@tonic-gate  *   fail.
920Sstevel@tonic-gate  *
930Sstevel@tonic-gate  *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
940Sstevel@tonic-gate  *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
950Sstevel@tonic-gate  *   join the zone or create kernel threads therein.
960Sstevel@tonic-gate  *
970Sstevel@tonic-gate  *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
980Sstevel@tonic-gate  *   remains in this state until zsched exits.  Calls to zone_find_by_*()
990Sstevel@tonic-gate  *   return NULL from now on.
1000Sstevel@tonic-gate  *
1010Sstevel@tonic-gate  *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
1020Sstevel@tonic-gate  *   processes or threads doing work on behalf of the zone.  The zone is
1030Sstevel@tonic-gate  *   removed from the list of active zones.  zone_destroy() returns, and
1040Sstevel@tonic-gate  *   the zone can be recreated.
1050Sstevel@tonic-gate  *
1060Sstevel@tonic-gate  *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
1070Sstevel@tonic-gate  *   callbacks are executed, and all memory associated with the zone is
1080Sstevel@tonic-gate  *   freed.
1090Sstevel@tonic-gate  *
1100Sstevel@tonic-gate  *   Threads can wait for the zone to enter a requested state by using
1110Sstevel@tonic-gate  *   zone_status_wait() or zone_status_timedwait() with the desired
1120Sstevel@tonic-gate  *   state passed in as an argument.  Zone state transitions are
1130Sstevel@tonic-gate  *   uni-directional; it is not possible to move back to an earlier state.
1140Sstevel@tonic-gate  *
1150Sstevel@tonic-gate  *
1160Sstevel@tonic-gate  *   Zone-Specific Data:
1170Sstevel@tonic-gate  *
 *   Subsystems needing to maintain zone-specific data can store that
 *   data using the ZSD mechanism.  This provides a zone-specific data
 *   store, similar to thread-specific data (see pthread_getspecific(3C)
 *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 *   to register callbacks to be invoked when a zone is created, shut
 *   down, or destroyed.  This can be used to initialize zone-specific
 *   data for new zones and to clean up when zones go away.
1250Sstevel@tonic-gate  *
1260Sstevel@tonic-gate  *
1270Sstevel@tonic-gate  *   Data Structures:
1280Sstevel@tonic-gate  *
1290Sstevel@tonic-gate  *   The per-zone structure (zone_t) is reference counted, and freed
1300Sstevel@tonic-gate  *   when all references are released.  zone_hold and zone_rele can be
1310Sstevel@tonic-gate  *   used to adjust the reference count.  In addition, reference counts
1320Sstevel@tonic-gate  *   associated with the cred_t structure are tracked separately using
1330Sstevel@tonic-gate  *   zone_cred_hold and zone_cred_rele.
1340Sstevel@tonic-gate  *
1350Sstevel@tonic-gate  *   Pointers to active zone_t's are stored in two hash tables; one
1360Sstevel@tonic-gate  *   for searching by id, the other for searching by name.  Lookups
1370Sstevel@tonic-gate  *   can be performed on either basis, using zone_find_by_id and
1380Sstevel@tonic-gate  *   zone_find_by_name.  Both return zone_t pointers with the zone
1390Sstevel@tonic-gate  *   held, so zone_rele should be called when the pointer is no longer
1400Sstevel@tonic-gate  *   needed.  Zones can also be searched by path; zone_find_by_path
1410Sstevel@tonic-gate  *   returns the zone with which a path name is associated (global
1420Sstevel@tonic-gate  *   zone if the path is not within some other zone's file system
1430Sstevel@tonic-gate  *   hierarchy).  This currently requires iterating through each zone,
1440Sstevel@tonic-gate  *   so it is slower than an id or name search via a hash table.
1450Sstevel@tonic-gate  *
1460Sstevel@tonic-gate  *
1470Sstevel@tonic-gate  *   Locking:
1480Sstevel@tonic-gate  *
1490Sstevel@tonic-gate  *   zonehash_lock: This is a top-level global lock used to protect the
1500Sstevel@tonic-gate  *       zone hash tables and lists.  Zones cannot be created or destroyed
1510Sstevel@tonic-gate  *       while this lock is held.
1520Sstevel@tonic-gate  *   zone_status_lock: This is a global lock protecting zone state.
1530Sstevel@tonic-gate  *       Zones cannot change state while this lock is held.  It also
1540Sstevel@tonic-gate  *       protects the list of kernel threads associated with a zone.
1550Sstevel@tonic-gate  *   zone_lock: This is a per-zone lock used to protect several fields of
1560Sstevel@tonic-gate  *       the zone_t (see <sys/zone.h> for details).  In addition, holding
1570Sstevel@tonic-gate  *       this lock means that the zone cannot go away.
1583247Sgjelinek  *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
1593247Sgjelinek  *	 related to the zone.max-lwps rctl.
1603247Sgjelinek  *   zone_mem_lock: This is a per-zone lock used to protect the fields
1613247Sgjelinek  *	 related to the zone.max-locked-memory and zone.max-swap rctls.
16212633Sjohn.levon@sun.com  *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
16312633Sjohn.levon@sun.com  *       currently just max_lofi
1640Sstevel@tonic-gate  *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
1650Sstevel@tonic-gate  *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
1660Sstevel@tonic-gate  *       list (a list of zones in the ZONE_IS_DEAD state).
1670Sstevel@tonic-gate  *
1680Sstevel@tonic-gate  *   Ordering requirements:
1690Sstevel@tonic-gate  *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
1700Sstevel@tonic-gate  *       	zone_lock --> zsd_key_lock --> pidlock --> p_lock
1710Sstevel@tonic-gate  *
1723247Sgjelinek  *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
1733247Sgjelinek  *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
174*12725SMenno.Lageman@Sun.COM  *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
1753247Sgjelinek  *
1760Sstevel@tonic-gate  *   Blocking memory allocations are permitted while holding any of the
1770Sstevel@tonic-gate  *   zone locks.
1780Sstevel@tonic-gate  *
1790Sstevel@tonic-gate  *
1800Sstevel@tonic-gate  *   System Call Interface:
1810Sstevel@tonic-gate  *
1820Sstevel@tonic-gate  *   The zone subsystem can be managed and queried from user level with
1830Sstevel@tonic-gate  *   the following system calls (all subcodes of the primary "zone"
1840Sstevel@tonic-gate  *   system call):
1850Sstevel@tonic-gate  *   - zone_create: creates a zone with selected attributes (name,
186789Sahrens  *     root path, privileges, resource controls, ZFS datasets)
1870Sstevel@tonic-gate  *   - zone_enter: allows the current process to enter a zone
1880Sstevel@tonic-gate  *   - zone_getattr: reports attributes of a zone
1892267Sdp  *   - zone_setattr: set attributes of a zone
1902267Sdp  *   - zone_boot: set 'init' running for the zone
1910Sstevel@tonic-gate  *   - zone_list: lists all zones active in the system
1920Sstevel@tonic-gate  *   - zone_lookup: looks up zone id based on name
1930Sstevel@tonic-gate  *   - zone_shutdown: initiates shutdown process (see states above)
1940Sstevel@tonic-gate  *   - zone_destroy: completes shutdown process (see states above)
1950Sstevel@tonic-gate  *
1960Sstevel@tonic-gate  */
1970Sstevel@tonic-gate 
1980Sstevel@tonic-gate #include <sys/priv_impl.h>
1990Sstevel@tonic-gate #include <sys/cred.h>
2000Sstevel@tonic-gate #include <c2/audit.h>
2010Sstevel@tonic-gate #include <sys/debug.h>
2020Sstevel@tonic-gate #include <sys/file.h>
2030Sstevel@tonic-gate #include <sys/kmem.h>
2043247Sgjelinek #include <sys/kstat.h>
2050Sstevel@tonic-gate #include <sys/mutex.h>
2061676Sjpk #include <sys/note.h>
2070Sstevel@tonic-gate #include <sys/pathname.h>
2080Sstevel@tonic-gate #include <sys/proc.h>
2090Sstevel@tonic-gate #include <sys/project.h>
2101166Sdstaff #include <sys/sysevent.h>
2110Sstevel@tonic-gate #include <sys/task.h>
2120Sstevel@tonic-gate #include <sys/systm.h>
2130Sstevel@tonic-gate #include <sys/types.h>
2140Sstevel@tonic-gate #include <sys/utsname.h>
2150Sstevel@tonic-gate #include <sys/vnode.h>
2160Sstevel@tonic-gate #include <sys/vfs.h>
2170Sstevel@tonic-gate #include <sys/systeminfo.h>
2180Sstevel@tonic-gate #include <sys/policy.h>
2190Sstevel@tonic-gate #include <sys/cred_impl.h>
2200Sstevel@tonic-gate #include <sys/contract_impl.h>
2210Sstevel@tonic-gate #include <sys/contract/process_impl.h>
2220Sstevel@tonic-gate #include <sys/class.h>
2230Sstevel@tonic-gate #include <sys/pool.h>
2240Sstevel@tonic-gate #include <sys/pool_pset.h>
2250Sstevel@tonic-gate #include <sys/pset.h>
2260Sstevel@tonic-gate #include <sys/sysmacros.h>
2270Sstevel@tonic-gate #include <sys/callb.h>
2280Sstevel@tonic-gate #include <sys/vmparam.h>
2290Sstevel@tonic-gate #include <sys/corectl.h>
2302677Sml93401 #include <sys/ipc_impl.h>
23112273SCasper.Dik@Sun.COM #include <sys/klpd.h>
2320Sstevel@tonic-gate 
2330Sstevel@tonic-gate #include <sys/door.h>
2340Sstevel@tonic-gate #include <sys/cpuvar.h>
2355880Snordmark #include <sys/sdt.h>
2360Sstevel@tonic-gate 
2370Sstevel@tonic-gate #include <sys/uadmin.h>
2380Sstevel@tonic-gate #include <sys/session.h>
2390Sstevel@tonic-gate #include <sys/cmn_err.h>
2400Sstevel@tonic-gate #include <sys/modhash.h>
2412267Sdp #include <sys/sunddi.h>
2420Sstevel@tonic-gate #include <sys/nvpair.h>
2430Sstevel@tonic-gate #include <sys/rctl.h>
2440Sstevel@tonic-gate #include <sys/fss.h>
2452712Snn35248 #include <sys/brand.h>
2460Sstevel@tonic-gate #include <sys/zone.h>
2473448Sdh155122 #include <net/if.h>
2483792Sakolb #include <sys/cpucaps.h>
2493247Sgjelinek #include <vm/seg.h>
25010616SSebastien.Roy@Sun.COM #include <sys/mac.h>
25110616SSebastien.Roy@Sun.COM 
25210616SSebastien.Roy@Sun.COM /* List of data link IDs which are accessible from the zone */
25310616SSebastien.Roy@Sun.COM typedef struct zone_dl {
25410616SSebastien.Roy@Sun.COM 	datalink_id_t	zdl_id;
25510616SSebastien.Roy@Sun.COM 	list_node_t	zdl_linkage;
25610616SSebastien.Roy@Sun.COM } zone_dl_t;
2573247Sgjelinek 
2580Sstevel@tonic-gate /*
2590Sstevel@tonic-gate  * cv used to signal that all references to the zone have been released.  This
2600Sstevel@tonic-gate  * needs to be global since there may be multiple waiters, and the first to
2610Sstevel@tonic-gate  * wake up will free the zone_t, hence we cannot use zone->zone_cv.
2620Sstevel@tonic-gate  */
2630Sstevel@tonic-gate static kcondvar_t zone_destroy_cv;
2640Sstevel@tonic-gate /*
2650Sstevel@tonic-gate  * Lock used to serialize access to zone_cv.  This could have been per-zone,
2660Sstevel@tonic-gate  * but then we'd need another lock for zone_destroy_cv, and why bother?
2670Sstevel@tonic-gate  */
2680Sstevel@tonic-gate static kmutex_t zone_status_lock;
2690Sstevel@tonic-gate 
2700Sstevel@tonic-gate /*
2710Sstevel@tonic-gate  * ZSD-related global variables.
2720Sstevel@tonic-gate  */
2730Sstevel@tonic-gate static kmutex_t zsd_key_lock;	/* protects the following two */
2740Sstevel@tonic-gate /*
2750Sstevel@tonic-gate  * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
2760Sstevel@tonic-gate  */
2770Sstevel@tonic-gate static zone_key_t zsd_keyval = 0;
2780Sstevel@tonic-gate /*
2790Sstevel@tonic-gate  * Global list of registered keys.  We use this when a new zone is created.
2800Sstevel@tonic-gate  */
2810Sstevel@tonic-gate static list_t zsd_registered_keys;
2820Sstevel@tonic-gate 
2830Sstevel@tonic-gate int zone_hash_size = 256;
2841676Sjpk static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
2850Sstevel@tonic-gate static kmutex_t zonehash_lock;
2860Sstevel@tonic-gate static uint_t zonecount;
2870Sstevel@tonic-gate static id_space_t *zoneid_space;
2880Sstevel@tonic-gate 
2890Sstevel@tonic-gate /*
2900Sstevel@tonic-gate  * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
2910Sstevel@tonic-gate  * kernel proper runs, and which manages all other zones.
2920Sstevel@tonic-gate  *
2930Sstevel@tonic-gate  * Although not declared as static, the variable "zone0" should not be used
2940Sstevel@tonic-gate  * except for by code that needs to reference the global zone early on in boot,
2950Sstevel@tonic-gate  * before it is fully initialized.  All other consumers should use
2960Sstevel@tonic-gate  * 'global_zone'.
2970Sstevel@tonic-gate  */
2980Sstevel@tonic-gate zone_t zone0;
2990Sstevel@tonic-gate zone_t *global_zone = NULL;	/* Set when the global zone is initialized */
3000Sstevel@tonic-gate 
3010Sstevel@tonic-gate /*
3020Sstevel@tonic-gate  * List of active zones, protected by zonehash_lock.
3030Sstevel@tonic-gate  */
3040Sstevel@tonic-gate static list_t zone_active;
3050Sstevel@tonic-gate 
3060Sstevel@tonic-gate /*
3070Sstevel@tonic-gate  * List of destroyed zones that still have outstanding cred references.
3080Sstevel@tonic-gate  * Used for debugging.  Uses a separate lock to avoid lock ordering
3090Sstevel@tonic-gate  * problems in zone_free.
3100Sstevel@tonic-gate  */
3110Sstevel@tonic-gate static list_t zone_deathrow;
3120Sstevel@tonic-gate static kmutex_t zone_deathrow_lock;
3130Sstevel@tonic-gate 
3140Sstevel@tonic-gate /* number of zones is limited by virtual interface limit in IP */
3150Sstevel@tonic-gate uint_t maxzones = 8192;
3160Sstevel@tonic-gate 
3171166Sdstaff /* Event channel to sent zone state change notifications */
3181166Sdstaff evchan_t *zone_event_chan;
3191166Sdstaff 
3201166Sdstaff /*
3211166Sdstaff  * This table holds the mapping from kernel zone states to
3221166Sdstaff  * states visible in the state notification API.
3231166Sdstaff  * The idea is that we only expose "obvious" states and
3241166Sdstaff  * do not expose states which are just implementation details.
3251166Sdstaff  */
3261166Sdstaff const char  *zone_status_table[] = {
3271166Sdstaff 	ZONE_EVENT_UNINITIALIZED,	/* uninitialized */
3285880Snordmark 	ZONE_EVENT_INITIALIZED,		/* initialized */
3291166Sdstaff 	ZONE_EVENT_READY,		/* ready */
3301166Sdstaff 	ZONE_EVENT_READY,		/* booting */
3311166Sdstaff 	ZONE_EVENT_RUNNING,		/* running */
3321166Sdstaff 	ZONE_EVENT_SHUTTING_DOWN,	/* shutting_down */
3331166Sdstaff 	ZONE_EVENT_SHUTTING_DOWN,	/* empty */
3341166Sdstaff 	ZONE_EVENT_SHUTTING_DOWN,	/* down */
3351166Sdstaff 	ZONE_EVENT_SHUTTING_DOWN,	/* dying */
3361166Sdstaff 	ZONE_EVENT_UNINITIALIZED,	/* dead */
3371166Sdstaff };
3381166Sdstaff 
3390Sstevel@tonic-gate /*
3400Sstevel@tonic-gate  * This isn't static so lint doesn't complain.
3410Sstevel@tonic-gate  */
3420Sstevel@tonic-gate rctl_hndl_t rc_zone_cpu_shares;
3432768Ssl108498 rctl_hndl_t rc_zone_locked_mem;
3443247Sgjelinek rctl_hndl_t rc_zone_max_swap;
34512633Sjohn.levon@sun.com rctl_hndl_t rc_zone_max_lofi;
3463792Sakolb rctl_hndl_t rc_zone_cpu_cap;
3470Sstevel@tonic-gate rctl_hndl_t rc_zone_nlwps;
348*12725SMenno.Lageman@Sun.COM rctl_hndl_t rc_zone_nprocs;
3492677Sml93401 rctl_hndl_t rc_zone_shmmax;
3502677Sml93401 rctl_hndl_t rc_zone_shmmni;
3512677Sml93401 rctl_hndl_t rc_zone_semmni;
3522677Sml93401 rctl_hndl_t rc_zone_msgmni;
3530Sstevel@tonic-gate /*
3540Sstevel@tonic-gate  * Synchronization primitives used to synchronize between mounts and zone
3550Sstevel@tonic-gate  * creation/destruction.
3560Sstevel@tonic-gate  */
3570Sstevel@tonic-gate static int mounts_in_progress;
3580Sstevel@tonic-gate static kcondvar_t mount_cv;
3590Sstevel@tonic-gate static kmutex_t mount_lock;
3600Sstevel@tonic-gate 
/* Path of the default init program for a zone. */
const char *const zone_default_initname = "/sbin/init";

/* Fixed "/zone/" path prefix. */
static char *const zone_prefix = "/zone/";
3630Sstevel@tonic-gate static int zone_shutdown(zoneid_t zoneid);
36410616SSebastien.Roy@Sun.COM static int zone_add_datalink(zoneid_t, datalink_id_t);
36510616SSebastien.Roy@Sun.COM static int zone_remove_datalink(zoneid_t, datalink_id_t);
36610616SSebastien.Roy@Sun.COM static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
3670Sstevel@tonic-gate 
3685880Snordmark typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
3695880Snordmark 
3705880Snordmark static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
3715880Snordmark static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
3725880Snordmark static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
3735880Snordmark static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
3745880Snordmark     zone_key_t);
3755880Snordmark static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
3765880Snordmark static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
3775880Snordmark     kmutex_t *);
3785880Snordmark static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
3795880Snordmark     kmutex_t *);
3805880Snordmark 
/*
 * Version of the zone syscall interface.  Bump this number whenever the
 * zone syscall interfaces are altered: libc has to keep supporting
 * previous API versions for the sake of patching, and it calls into the
 * kernel to determine this number.
 *
 * History:
 *   1 - the version originally shipped with Solaris 10.
 *   2 - zone_create takes its arguments in a structure so that more of
 *       them can be supported, and reports errors better on failure.
 *   3 - zone_create supports the import of ZFS datasets to zones.
 *   4 - zone_create supports Trusted Extensions.
 *   5 - zone_boot is altered; its old bootargs parameter is now set
 *       through the zone_setattr API instead.
 *   6 - zone_create gains the flag argument.
 */
static const int ZONE_SYSCALL_API_VERSION = 6;
399813Sdp 
400813Sdp /*
 * Certain filesystems (such as NFS and autofs) need to know which zone
 * the mount is being placed in.  Because of this, we need to be able to
 * ensure that a zone isn't in the process of being created such that
 * nfs_mount() thinks it is in the global zone, while by the time it
 * gets added to the list of mounted zones, it ends up on zoneA's mount
 * list.
4070Sstevel@tonic-gate  *
4080Sstevel@tonic-gate  * The following functions: block_mounts()/resume_mounts() and
4090Sstevel@tonic-gate  * mount_in_progress()/mount_completed() are used by zones and the VFS
4100Sstevel@tonic-gate  * layer (respectively) to synchronize zone creation and new mounts.
4110Sstevel@tonic-gate  *
4120Sstevel@tonic-gate  * The semantics are like a reader-reader lock such that there may
4130Sstevel@tonic-gate  * either be multiple mounts (or zone creations, if that weren't
4140Sstevel@tonic-gate  * serialized by zonehash_lock) in progress at the same time, but not
4150Sstevel@tonic-gate  * both.
4160Sstevel@tonic-gate  *
4170Sstevel@tonic-gate  * We use cv's so the user can ctrl-C out of the operation if it's
4180Sstevel@tonic-gate  * taking too long.
4190Sstevel@tonic-gate  *
4200Sstevel@tonic-gate  * The semantics are such that there is unfair bias towards the
4210Sstevel@tonic-gate  * "current" operation.  This means that zone creations may starve if
4220Sstevel@tonic-gate  * there is a rapid succession of new mounts coming in to the system, or
4230Sstevel@tonic-gate  * there is a remote possibility that zones will be created at such a
4240Sstevel@tonic-gate  * rate that new mounts will not be able to proceed.
4250Sstevel@tonic-gate  */
4260Sstevel@tonic-gate /*
4270Sstevel@tonic-gate  * Prevent new mounts from progressing to the point of calling
4280Sstevel@tonic-gate  * VFS_MOUNT().  If there are already mounts in this "region", wait for
4290Sstevel@tonic-gate  * them to complete.
4300Sstevel@tonic-gate  */
4310Sstevel@tonic-gate static int
4320Sstevel@tonic-gate block_mounts(void)
4330Sstevel@tonic-gate {
4340Sstevel@tonic-gate 	int retval = 0;
4350Sstevel@tonic-gate 
4360Sstevel@tonic-gate 	/*
4370Sstevel@tonic-gate 	 * Since it may block for a long time, block_mounts() shouldn't be
4380Sstevel@tonic-gate 	 * called with zonehash_lock held.
4390Sstevel@tonic-gate 	 */
4400Sstevel@tonic-gate 	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4410Sstevel@tonic-gate 	mutex_enter(&mount_lock);
4420Sstevel@tonic-gate 	while (mounts_in_progress > 0) {
4430Sstevel@tonic-gate 		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
4440Sstevel@tonic-gate 			goto signaled;
4450Sstevel@tonic-gate 	}
4460Sstevel@tonic-gate 	/*
4470Sstevel@tonic-gate 	 * A negative value of mounts_in_progress indicates that mounts
4480Sstevel@tonic-gate 	 * have been blocked by (-mounts_in_progress) different callers.
4490Sstevel@tonic-gate 	 */
4500Sstevel@tonic-gate 	mounts_in_progress--;
4510Sstevel@tonic-gate 	retval = 1;
4520Sstevel@tonic-gate signaled:
4530Sstevel@tonic-gate 	mutex_exit(&mount_lock);
4540Sstevel@tonic-gate 	return (retval);
4550Sstevel@tonic-gate }
4560Sstevel@tonic-gate 
4570Sstevel@tonic-gate /*
4580Sstevel@tonic-gate  * The VFS layer may progress with new mounts as far as we're concerned.
4590Sstevel@tonic-gate  * Allow them to progress if we were the last obstacle.
4600Sstevel@tonic-gate  */
4610Sstevel@tonic-gate static void
4620Sstevel@tonic-gate resume_mounts(void)
4630Sstevel@tonic-gate {
4640Sstevel@tonic-gate 	mutex_enter(&mount_lock);
4650Sstevel@tonic-gate 	if (++mounts_in_progress == 0)
4660Sstevel@tonic-gate 		cv_broadcast(&mount_cv);
4670Sstevel@tonic-gate 	mutex_exit(&mount_lock);
4680Sstevel@tonic-gate }
4690Sstevel@tonic-gate 
4700Sstevel@tonic-gate /*
4710Sstevel@tonic-gate  * The VFS layer is busy with a mount; zones should wait until all
4720Sstevel@tonic-gate  * mounts are completed to progress.
4730Sstevel@tonic-gate  */
4740Sstevel@tonic-gate void
4750Sstevel@tonic-gate mount_in_progress(void)
4760Sstevel@tonic-gate {
4770Sstevel@tonic-gate 	mutex_enter(&mount_lock);
4780Sstevel@tonic-gate 	while (mounts_in_progress < 0)
4790Sstevel@tonic-gate 		cv_wait(&mount_cv, &mount_lock);
4800Sstevel@tonic-gate 	mounts_in_progress++;
4810Sstevel@tonic-gate 	mutex_exit(&mount_lock);
4820Sstevel@tonic-gate }
4830Sstevel@tonic-gate 
4840Sstevel@tonic-gate /*
4850Sstevel@tonic-gate  * VFS is done with one mount; wake up any waiting block_mounts()
4860Sstevel@tonic-gate  * callers if this is the last mount.
4870Sstevel@tonic-gate  */
4880Sstevel@tonic-gate void
4890Sstevel@tonic-gate mount_completed(void)
4900Sstevel@tonic-gate {
4910Sstevel@tonic-gate 	mutex_enter(&mount_lock);
4920Sstevel@tonic-gate 	if (--mounts_in_progress == 0)
4930Sstevel@tonic-gate 		cv_broadcast(&mount_cv);
4940Sstevel@tonic-gate 	mutex_exit(&mount_lock);
4950Sstevel@tonic-gate }
4960Sstevel@tonic-gate 
4970Sstevel@tonic-gate /*
4980Sstevel@tonic-gate  * ZSD routines.
4990Sstevel@tonic-gate  *
5000Sstevel@tonic-gate  * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
5010Sstevel@tonic-gate  * defined by the pthread_key_create() and related interfaces.
5020Sstevel@tonic-gate  *
5030Sstevel@tonic-gate  * Kernel subsystems may register one or more data items and/or
5040Sstevel@tonic-gate  * callbacks to be executed when a zone is created, shutdown, or
5050Sstevel@tonic-gate  * destroyed.
5060Sstevel@tonic-gate  *
5070Sstevel@tonic-gate  * Unlike the thread counterpart, destructor callbacks will be executed
5080Sstevel@tonic-gate  * even if the data pointer is NULL and/or there are no constructor
5090Sstevel@tonic-gate  * callbacks, so it is the responsibility of such callbacks to check for
5100Sstevel@tonic-gate  * NULL data values if necessary.
5110Sstevel@tonic-gate  *
5120Sstevel@tonic-gate  * The locking strategy and overall picture is as follows:
5130Sstevel@tonic-gate  *
 * When someone calls zone_key_create(), a template ZSD entry is added to the
 * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 * holding that lock all the existing zones are marked as
 * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 * zone_zsd list (protected by zone_lock). The global list is updated first
 * (under zsd_key_lock) to make sure that newly created zones use the
 * most recent list of keys. Then under zonehash_lock we walk the zones
 * and mark them.  Similar locking is used in zone_key_delete().
5220Sstevel@tonic-gate  *
5235880Snordmark  * The actual create, shutdown, and destroy callbacks are done without
5245880Snordmark  * holding any lock. And zsd_flags are used to ensure that the operations
5255880Snordmark  * completed so that when zone_key_create (and zone_create) is done, as well as
5265880Snordmark  * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
5275880Snordmark  * are completed.
5280Sstevel@tonic-gate  *
5290Sstevel@tonic-gate  * When new zones are created constructor callbacks for all registered ZSD
5305880Snordmark  * entries will be called. That also uses the above two phases of marking
5315880Snordmark  * what needs to be done, and then running the callbacks without holding
5325880Snordmark  * any locks.
5330Sstevel@tonic-gate  *
5340Sstevel@tonic-gate  * The framework does not provide any locking around zone_getspecific() and
5350Sstevel@tonic-gate  * zone_setspecific() apart from that needed for internal consistency, so
5360Sstevel@tonic-gate  * callers interested in atomic "test-and-set" semantics will need to provide
5370Sstevel@tonic-gate  * their own locking.
5380Sstevel@tonic-gate  */
5390Sstevel@tonic-gate 
5400Sstevel@tonic-gate /*
5410Sstevel@tonic-gate  * Helper function to find the zsd_entry associated with the key in the
5420Sstevel@tonic-gate  * given list.
5430Sstevel@tonic-gate  */
5440Sstevel@tonic-gate static struct zsd_entry *
5450Sstevel@tonic-gate zsd_find(list_t *l, zone_key_t key)
5460Sstevel@tonic-gate {
5470Sstevel@tonic-gate 	struct zsd_entry *zsd;
5480Sstevel@tonic-gate 
5490Sstevel@tonic-gate 	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
5500Sstevel@tonic-gate 		if (zsd->zsd_key == key) {
5515880Snordmark 			return (zsd);
5525880Snordmark 		}
5535880Snordmark 	}
5545880Snordmark 	return (NULL);
5555880Snordmark }
5565880Snordmark 
5575880Snordmark /*
5585880Snordmark  * Helper function to find the zsd_entry associated with the key in the
5595880Snordmark  * given list. Move it to the front of the list.
5605880Snordmark  */
5615880Snordmark static struct zsd_entry *
5625880Snordmark zsd_find_mru(list_t *l, zone_key_t key)
5635880Snordmark {
5645880Snordmark 	struct zsd_entry *zsd;
5655880Snordmark 
5665880Snordmark 	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
5675880Snordmark 		if (zsd->zsd_key == key) {
5680Sstevel@tonic-gate 			/*
5690Sstevel@tonic-gate 			 * Move to head of list to keep list in MRU order.
5700Sstevel@tonic-gate 			 */
5710Sstevel@tonic-gate 			if (zsd != list_head(l)) {
5720Sstevel@tonic-gate 				list_remove(l, zsd);
5730Sstevel@tonic-gate 				list_insert_head(l, zsd);
5740Sstevel@tonic-gate 			}
5750Sstevel@tonic-gate 			return (zsd);
5760Sstevel@tonic-gate 		}
5770Sstevel@tonic-gate 	}
5780Sstevel@tonic-gate 	return (NULL);
5790Sstevel@tonic-gate }
5800Sstevel@tonic-gate 
5815880Snordmark void
5825880Snordmark zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
5835880Snordmark     void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
5845880Snordmark {
5855880Snordmark 	struct zsd_entry *zsdp;
5865880Snordmark 	struct zsd_entry *t;
5875880Snordmark 	struct zone *zone;
5885880Snordmark 	zone_key_t  key;
5895880Snordmark 
5905880Snordmark 	zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
5915880Snordmark 	zsdp->zsd_data = NULL;
5925880Snordmark 	zsdp->zsd_create = create;
5935880Snordmark 	zsdp->zsd_shutdown = shutdown;
5945880Snordmark 	zsdp->zsd_destroy = destroy;
5955880Snordmark 
5965880Snordmark 	/*
5975880Snordmark 	 * Insert in global list of callbacks. Makes future zone creations
5985880Snordmark 	 * see it.
5995880Snordmark 	 */
6005880Snordmark 	mutex_enter(&zsd_key_lock);
60110865SPramod.Batni@Sun.COM 	key = zsdp->zsd_key = ++zsd_keyval;
6025880Snordmark 	ASSERT(zsd_keyval != 0);
6035880Snordmark 	list_insert_tail(&zsd_registered_keys, zsdp);
6045880Snordmark 	mutex_exit(&zsd_key_lock);
6055880Snordmark 
6065880Snordmark 	/*
6075880Snordmark 	 * Insert for all existing zones and mark them as needing
6085880Snordmark 	 * a create callback.
6095880Snordmark 	 */
6105880Snordmark 	mutex_enter(&zonehash_lock);	/* stop the world */
6115880Snordmark 	for (zone = list_head(&zone_active); zone != NULL;
6125880Snordmark 	    zone = list_next(&zone_active, zone)) {
6135880Snordmark 		zone_status_t status;
6145880Snordmark 
6155880Snordmark 		mutex_enter(&zone->zone_lock);
6165880Snordmark 
6175880Snordmark 		/* Skip zones that are on the way down or not yet up */
6185880Snordmark 		status = zone_status_get(zone);
6195880Snordmark 		if (status >= ZONE_IS_DOWN ||
6205880Snordmark 		    status == ZONE_IS_UNINITIALIZED) {
6215880Snordmark 			mutex_exit(&zone->zone_lock);
6225880Snordmark 			continue;
6235880Snordmark 		}
6245880Snordmark 
6255880Snordmark 		t = zsd_find_mru(&zone->zone_zsd, key);
6265880Snordmark 		if (t != NULL) {
6275880Snordmark 			/*
6285880Snordmark 			 * A zsd_configure already inserted it after
6295880Snordmark 			 * we dropped zsd_key_lock above.
6305880Snordmark 			 */
6315880Snordmark 			mutex_exit(&zone->zone_lock);
6325880Snordmark 			continue;
6335880Snordmark 		}
6345880Snordmark 		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
6355880Snordmark 		t->zsd_key = key;
6365880Snordmark 		t->zsd_create = create;
6375880Snordmark 		t->zsd_shutdown = shutdown;
6385880Snordmark 		t->zsd_destroy = destroy;
6395880Snordmark 		if (create != NULL) {
6405880Snordmark 			t->zsd_flags = ZSD_CREATE_NEEDED;
6415880Snordmark 			DTRACE_PROBE2(zsd__create__needed,
6425880Snordmark 			    zone_t *, zone, zone_key_t, key);
6435880Snordmark 		}
6445880Snordmark 		list_insert_tail(&zone->zone_zsd, t);
6455880Snordmark 		mutex_exit(&zone->zone_lock);
6465880Snordmark 	}
6475880Snordmark 	mutex_exit(&zonehash_lock);
6485880Snordmark 
6495880Snordmark 	if (create != NULL) {
6505880Snordmark 		/* Now call the create callback for this key */
6515880Snordmark 		zsd_apply_all_zones(zsd_apply_create, key);
6525880Snordmark 	}
65310865SPramod.Batni@Sun.COM 	/*
65410910SRobert.Harris@Sun.COM 	 * It is safe for consumers to use the key now, make it
65510910SRobert.Harris@Sun.COM 	 * globally visible. Specifically zone_getspecific() will
65610910SRobert.Harris@Sun.COM 	 * always successfully return the zone specific data associated
65710910SRobert.Harris@Sun.COM 	 * with the key.
65810910SRobert.Harris@Sun.COM 	 */
65910865SPramod.Batni@Sun.COM 	*keyp = key;
66010865SPramod.Batni@Sun.COM 
6615880Snordmark }
6625880Snordmark 
6630Sstevel@tonic-gate /*
6640Sstevel@tonic-gate  * Function called when a module is being unloaded, or otherwise wishes
6650Sstevel@tonic-gate  * to unregister its ZSD key and callbacks.
6665880Snordmark  *
6675880Snordmark  * Remove from the global list and determine the functions that need to
6685880Snordmark  * be called under a global lock. Then call the functions without
6695880Snordmark  * holding any locks. Finally free up the zone_zsd entries. (The apply
6705880Snordmark  * functions need to access the zone_zsd entries to find zsd_data etc.)
6710Sstevel@tonic-gate  */
6720Sstevel@tonic-gate int
6730Sstevel@tonic-gate zone_key_delete(zone_key_t key)
6740Sstevel@tonic-gate {
6750Sstevel@tonic-gate 	struct zsd_entry *zsdp = NULL;
6760Sstevel@tonic-gate 	zone_t *zone;
6770Sstevel@tonic-gate 
6780Sstevel@tonic-gate 	mutex_enter(&zsd_key_lock);
6795880Snordmark 	zsdp = zsd_find_mru(&zsd_registered_keys, key);
6805880Snordmark 	if (zsdp == NULL) {
6815880Snordmark 		mutex_exit(&zsd_key_lock);
6825880Snordmark 		return (-1);
6835880Snordmark 	}
6840Sstevel@tonic-gate 	list_remove(&zsd_registered_keys, zsdp);
6850Sstevel@tonic-gate 	mutex_exit(&zsd_key_lock);
6860Sstevel@tonic-gate 
6875880Snordmark 	mutex_enter(&zonehash_lock);
6880Sstevel@tonic-gate 	for (zone = list_head(&zone_active); zone != NULL;
6890Sstevel@tonic-gate 	    zone = list_next(&zone_active, zone)) {
6900Sstevel@tonic-gate 		struct zsd_entry *del;
6915880Snordmark 
6925880Snordmark 		mutex_enter(&zone->zone_lock);
6935880Snordmark 		del = zsd_find_mru(&zone->zone_zsd, key);
6945880Snordmark 		if (del == NULL) {
6955880Snordmark 			/*
6965880Snordmark 			 * Somebody else got here first e.g the zone going
6975880Snordmark 			 * away.
6985880Snordmark 			 */
6995880Snordmark 			mutex_exit(&zone->zone_lock);
7005880Snordmark 			continue;
7015880Snordmark 		}
7025880Snordmark 		ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
7035880Snordmark 		ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
7045880Snordmark 		if (del->zsd_shutdown != NULL &&
7055880Snordmark 		    (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
7065880Snordmark 			del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
7075880Snordmark 			DTRACE_PROBE2(zsd__shutdown__needed,
7085880Snordmark 			    zone_t *, zone, zone_key_t, key);
7095880Snordmark 		}
7105880Snordmark 		if (del->zsd_destroy != NULL &&
7115880Snordmark 		    (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
7125880Snordmark 			del->zsd_flags |= ZSD_DESTROY_NEEDED;
7135880Snordmark 			DTRACE_PROBE2(zsd__destroy__needed,
7145880Snordmark 			    zone_t *, zone, zone_key_t, key);
7150Sstevel@tonic-gate 		}
7160Sstevel@tonic-gate 		mutex_exit(&zone->zone_lock);
7170Sstevel@tonic-gate 	}
7180Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
7190Sstevel@tonic-gate 	kmem_free(zsdp, sizeof (*zsdp));
7205880Snordmark 
7215880Snordmark 	/* Now call the shutdown and destroy callback for this key */
7225880Snordmark 	zsd_apply_all_zones(zsd_apply_shutdown, key);
7235880Snordmark 	zsd_apply_all_zones(zsd_apply_destroy, key);
7245880Snordmark 
7255880Snordmark 	/* Now we can free up the zsdp structures in each zone */
7265880Snordmark 	mutex_enter(&zonehash_lock);
7270Sstevel@tonic-gate 	for (zone = list_head(&zone_active); zone != NULL;
7285880Snordmark 	    zone = list_next(&zone_active, zone)) {
7295880Snordmark 		struct zsd_entry *del;
7305880Snordmark 
7315880Snordmark 		mutex_enter(&zone->zone_lock);
7325880Snordmark 		del = zsd_find(&zone->zone_zsd, key);
7335880Snordmark 		if (del != NULL) {
7345880Snordmark 			list_remove(&zone->zone_zsd, del);
7355880Snordmark 			ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
7365880Snordmark 			kmem_free(del, sizeof (*del));
7375880Snordmark 		}
7380Sstevel@tonic-gate 		mutex_exit(&zone->zone_lock);
7395880Snordmark 	}
7400Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
7415880Snordmark 
7425880Snordmark 	return (0);
7430Sstevel@tonic-gate }
7440Sstevel@tonic-gate 
7450Sstevel@tonic-gate /*
7460Sstevel@tonic-gate  * ZSD counterpart of pthread_setspecific().
7475880Snordmark  *
7485880Snordmark  * Since all zsd callbacks, including those with no create function,
7495880Snordmark  * have an entry in zone_zsd, if the key is registered it is part of
7505880Snordmark  * the zone_zsd list.
7515880Snordmark  * Return an error if the key wasn't registerd.
7520Sstevel@tonic-gate  */
7530Sstevel@tonic-gate int
7540Sstevel@tonic-gate zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
7550Sstevel@tonic-gate {
7560Sstevel@tonic-gate 	struct zsd_entry *t;
7570Sstevel@tonic-gate 
7580Sstevel@tonic-gate 	mutex_enter(&zone->zone_lock);
7595880Snordmark 	t = zsd_find_mru(&zone->zone_zsd, key);
7600Sstevel@tonic-gate 	if (t != NULL) {
7610Sstevel@tonic-gate 		/*
7620Sstevel@tonic-gate 		 * Replace old value with new
7630Sstevel@tonic-gate 		 */
7640Sstevel@tonic-gate 		t->zsd_data = (void *)data;
7650Sstevel@tonic-gate 		mutex_exit(&zone->zone_lock);
7660Sstevel@tonic-gate 		return (0);
7670Sstevel@tonic-gate 	}
7680Sstevel@tonic-gate 	mutex_exit(&zone->zone_lock);
7695880Snordmark 	return (-1);
7700Sstevel@tonic-gate }
7710Sstevel@tonic-gate 
7720Sstevel@tonic-gate /*
7730Sstevel@tonic-gate  * ZSD counterpart of pthread_getspecific().
7740Sstevel@tonic-gate  */
7750Sstevel@tonic-gate void *
7760Sstevel@tonic-gate zone_getspecific(zone_key_t key, zone_t *zone)
7770Sstevel@tonic-gate {
7780Sstevel@tonic-gate 	struct zsd_entry *t;
7790Sstevel@tonic-gate 	void *data;
7800Sstevel@tonic-gate 
7810Sstevel@tonic-gate 	mutex_enter(&zone->zone_lock);
7825880Snordmark 	t = zsd_find_mru(&zone->zone_zsd, key);
7830Sstevel@tonic-gate 	data = (t == NULL ? NULL : t->zsd_data);
7840Sstevel@tonic-gate 	mutex_exit(&zone->zone_lock);
7850Sstevel@tonic-gate 	return (data);
7860Sstevel@tonic-gate }
7870Sstevel@tonic-gate 
7880Sstevel@tonic-gate /*
7890Sstevel@tonic-gate  * Function used to initialize a zone's list of ZSD callbacks and data
7900Sstevel@tonic-gate  * when the zone is being created.  The callbacks are initialized from
7915880Snordmark  * the template list (zsd_registered_keys). The constructor callback is
7925880Snordmark  * executed later (once the zone exists and with locks dropped).
7930Sstevel@tonic-gate  */
7940Sstevel@tonic-gate static void
7950Sstevel@tonic-gate zone_zsd_configure(zone_t *zone)
7960Sstevel@tonic-gate {
7970Sstevel@tonic-gate 	struct zsd_entry *zsdp;
7980Sstevel@tonic-gate 	struct zsd_entry *t;
7990Sstevel@tonic-gate 
8000Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&zonehash_lock));
8010Sstevel@tonic-gate 	ASSERT(list_head(&zone->zone_zsd) == NULL);
8025880Snordmark 	mutex_enter(&zone->zone_lock);
8030Sstevel@tonic-gate 	mutex_enter(&zsd_key_lock);
8040Sstevel@tonic-gate 	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
8050Sstevel@tonic-gate 	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
8065880Snordmark 		/*
8075880Snordmark 		 * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
8085880Snordmark 		 * should not have added anything to it.
8095880Snordmark 		 */
8105880Snordmark 		ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
8115880Snordmark 
8125880Snordmark 		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
8135880Snordmark 		t->zsd_key = zsdp->zsd_key;
8145880Snordmark 		t->zsd_create = zsdp->zsd_create;
8155880Snordmark 		t->zsd_shutdown = zsdp->zsd_shutdown;
8165880Snordmark 		t->zsd_destroy = zsdp->zsd_destroy;
8170Sstevel@tonic-gate 		if (zsdp->zsd_create != NULL) {
8185880Snordmark 			t->zsd_flags = ZSD_CREATE_NEEDED;
8195880Snordmark 			DTRACE_PROBE2(zsd__create__needed,
8205880Snordmark 			    zone_t *, zone, zone_key_t, zsdp->zsd_key);
8210Sstevel@tonic-gate 		}
8225880Snordmark 		list_insert_tail(&zone->zone_zsd, t);
8230Sstevel@tonic-gate 	}
8240Sstevel@tonic-gate 	mutex_exit(&zsd_key_lock);
8255880Snordmark 	mutex_exit(&zone->zone_lock);
8260Sstevel@tonic-gate }
8270Sstevel@tonic-gate 
/*
 * The kinds of ZSD callback.
 */
enum zsd_callback_type {
	ZSD_CREATE,	/* create (constructor) callback */
	ZSD_SHUTDOWN,	/* shutdown callback */
	ZSD_DESTROY	/* destroy (destructor) callback */
};
8290Sstevel@tonic-gate 
8300Sstevel@tonic-gate /*
8310Sstevel@tonic-gate  * Helper function to execute shutdown or destructor callbacks.
8320Sstevel@tonic-gate  */
8330Sstevel@tonic-gate static void
8340Sstevel@tonic-gate zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
8350Sstevel@tonic-gate {
8360Sstevel@tonic-gate 	struct zsd_entry *t;
8370Sstevel@tonic-gate 
8380Sstevel@tonic-gate 	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
8390Sstevel@tonic-gate 	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
8400Sstevel@tonic-gate 	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
8410Sstevel@tonic-gate 
8425880Snordmark 	/*
8435880Snordmark 	 * Run the callback solely based on what is registered for the zone
8445880Snordmark 	 * in zone_zsd. The global list can change independently of this
8455880Snordmark 	 * as keys are registered and unregistered and we don't register new
8465880Snordmark 	 * callbacks for a zone that is in the process of going away.
8475880Snordmark 	 */
8480Sstevel@tonic-gate 	mutex_enter(&zone->zone_lock);
8495880Snordmark 	for (t = list_head(&zone->zone_zsd); t != NULL;
8505880Snordmark 	    t = list_next(&zone->zone_zsd, t)) {
8515880Snordmark 		zone_key_t key = t->zsd_key;
8520Sstevel@tonic-gate 
8530Sstevel@tonic-gate 		/* Skip if no callbacks registered */
8545880Snordmark 
8555880Snordmark 		if (ct == ZSD_SHUTDOWN) {
8565880Snordmark 			if (t->zsd_shutdown != NULL &&
8575880Snordmark 			    (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
8585880Snordmark 				t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
8595880Snordmark 				DTRACE_PROBE2(zsd__shutdown__needed,
8605880Snordmark 				    zone_t *, zone, zone_key_t, key);
8610Sstevel@tonic-gate 			}
8620Sstevel@tonic-gate 		} else {
8635880Snordmark 			if (t->zsd_destroy != NULL &&
8645880Snordmark 			    (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
8655880Snordmark 				t->zsd_flags |= ZSD_DESTROY_NEEDED;
8665880Snordmark 				DTRACE_PROBE2(zsd__destroy__needed,
8675880Snordmark 				    zone_t *, zone, zone_key_t, key);
8680Sstevel@tonic-gate 			}
8690Sstevel@tonic-gate 		}
8700Sstevel@tonic-gate 	}
8715880Snordmark 	mutex_exit(&zone->zone_lock);
8725880Snordmark 
8735880Snordmark 	/* Now call the shutdown and destroy callback for this key */
8745880Snordmark 	zsd_apply_all_keys(zsd_apply_shutdown, zone);
8755880Snordmark 	zsd_apply_all_keys(zsd_apply_destroy, zone);
8765880Snordmark 
8770Sstevel@tonic-gate }
8780Sstevel@tonic-gate 
8790Sstevel@tonic-gate /*
8800Sstevel@tonic-gate  * Called when the zone is going away; free ZSD-related memory, and
8810Sstevel@tonic-gate  * destroy the zone_zsd list.
8820Sstevel@tonic-gate  */
8830Sstevel@tonic-gate static void
8840Sstevel@tonic-gate zone_free_zsd(zone_t *zone)
8850Sstevel@tonic-gate {
8860Sstevel@tonic-gate 	struct zsd_entry *t, *next;
8870Sstevel@tonic-gate 
8880Sstevel@tonic-gate 	/*
8890Sstevel@tonic-gate 	 * Free all the zsd_entry's we had on this zone.
8900Sstevel@tonic-gate 	 */
8915880Snordmark 	mutex_enter(&zone->zone_lock);
8920Sstevel@tonic-gate 	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
8930Sstevel@tonic-gate 		next = list_next(&zone->zone_zsd, t);
8940Sstevel@tonic-gate 		list_remove(&zone->zone_zsd, t);
8955880Snordmark 		ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
8960Sstevel@tonic-gate 		kmem_free(t, sizeof (*t));
8970Sstevel@tonic-gate 	}
8980Sstevel@tonic-gate 	list_destroy(&zone->zone_zsd);
8995880Snordmark 	mutex_exit(&zone->zone_lock);
9005880Snordmark 
9015880Snordmark }
9025880Snordmark 
9035880Snordmark /*
9045880Snordmark  * Apply a function to all zones for particular key value.
9055880Snordmark  *
9065880Snordmark  * The applyfn has to drop zonehash_lock if it does some work, and
9075880Snordmark  * then reacquire it before it returns.
9085880Snordmark  * When the lock is dropped we don't follow list_next even
9095880Snordmark  * if it is possible to do so without any hazards. This is
9105880Snordmark  * because we want the design to allow for the list of zones
9115880Snordmark  * to change in any arbitrary way during the time the
9125880Snordmark  * lock was dropped.
9135880Snordmark  *
9145880Snordmark  * It is safe to restart the loop at list_head since the applyfn
9155880Snordmark  * changes the zsd_flags as it does work, so a subsequent
9165880Snordmark  * pass through will have no effect in applyfn, hence the loop will terminate
9175880Snordmark  * in at worst O(N^2).
9185880Snordmark  */
9195880Snordmark static void
9205880Snordmark zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
9215880Snordmark {
9225880Snordmark 	zone_t *zone;
9235880Snordmark 
9245880Snordmark 	mutex_enter(&zonehash_lock);
9255880Snordmark 	zone = list_head(&zone_active);
9265880Snordmark 	while (zone != NULL) {
9275880Snordmark 		if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
9285880Snordmark 			/* Lock dropped - restart at head */
9295880Snordmark 			zone = list_head(&zone_active);
9305880Snordmark 		} else {
9315880Snordmark 			zone = list_next(&zone_active, zone);
9325880Snordmark 		}
9335880Snordmark 	}
9345880Snordmark 	mutex_exit(&zonehash_lock);
9355880Snordmark }
9365880Snordmark 
9375880Snordmark /*
9385880Snordmark  * Apply a function to all keys for a particular zone.
9395880Snordmark  *
9405880Snordmark  * The applyfn has to drop zonehash_lock if it does some work, and
9415880Snordmark  * then reacquire it before it returns.
9425880Snordmark  * When the lock is dropped we don't follow list_next even
9435880Snordmark  * if it is possible to do so without any hazards. This is
9445880Snordmark  * because we want the design to allow for the list of zsd callbacks
9455880Snordmark  * to change in any arbitrary way during the time the
9465880Snordmark  * lock was dropped.
9475880Snordmark  *
9485880Snordmark  * It is safe to restart the loop at list_head since the applyfn
9495880Snordmark  * changes the zsd_flags as it does work, so a subsequent
9505880Snordmark  * pass through will have no effect in applyfn, hence the loop will terminate
9515880Snordmark  * in at worst O(N^2).
9525880Snordmark  */
9535880Snordmark static void
9545880Snordmark zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
9555880Snordmark {
9565880Snordmark 	struct zsd_entry *t;
9575880Snordmark 
9585880Snordmark 	mutex_enter(&zone->zone_lock);
9595880Snordmark 	t = list_head(&zone->zone_zsd);
9605880Snordmark 	while (t != NULL) {
9615880Snordmark 		if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
9625880Snordmark 			/* Lock dropped - restart at head */
9635880Snordmark 			t = list_head(&zone->zone_zsd);
9645880Snordmark 		} else {
9655880Snordmark 			t = list_next(&zone->zone_zsd, t);
9665880Snordmark 		}
9675880Snordmark 	}
9685880Snordmark 	mutex_exit(&zone->zone_lock);
9695880Snordmark }
9705880Snordmark 
9715880Snordmark /*
9725880Snordmark  * Call the create function for the zone and key if CREATE_NEEDED
9735880Snordmark  * is set.
9745880Snordmark  * If some other thread gets here first and sets CREATE_INPROGRESS, then
9755880Snordmark  * we wait for that thread to complete so that we can ensure that
9765880Snordmark  * all the callbacks are done when we've looped over all zones/keys.
9775880Snordmark  *
9785880Snordmark  * When we call the create function, we drop the global held by the
9795880Snordmark  * caller, and return true to tell the caller it needs to re-evalute the
9805880Snordmark  * state.
9815880Snordmark  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
9825880Snordmark  * remains held on exit.
9835880Snordmark  */
9845880Snordmark static boolean_t
9855880Snordmark zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
9865880Snordmark     zone_t *zone, zone_key_t key)
9875880Snordmark {
9885880Snordmark 	void *result;
9895880Snordmark 	struct zsd_entry *t;
9905880Snordmark 	boolean_t dropped;
9915880Snordmark 
9925880Snordmark 	if (lockp != NULL) {
9935880Snordmark 		ASSERT(MUTEX_HELD(lockp));
9945880Snordmark 	}
9955880Snordmark 	if (zone_lock_held) {
9965880Snordmark 		ASSERT(MUTEX_HELD(&zone->zone_lock));
9975880Snordmark 	} else {
9985880Snordmark 		mutex_enter(&zone->zone_lock);
9995880Snordmark 	}
10005880Snordmark 
10015880Snordmark 	t = zsd_find(&zone->zone_zsd, key);
10025880Snordmark 	if (t == NULL) {
10035880Snordmark 		/*
10045880Snordmark 		 * Somebody else got here first e.g the zone going
10055880Snordmark 		 * away.
10065880Snordmark 		 */
10075880Snordmark 		if (!zone_lock_held)
10085880Snordmark 			mutex_exit(&zone->zone_lock);
10095880Snordmark 		return (B_FALSE);
10105880Snordmark 	}
10115880Snordmark 	dropped = B_FALSE;
10125880Snordmark 	if (zsd_wait_for_inprogress(zone, t, lockp))
10135880Snordmark 		dropped = B_TRUE;
10145880Snordmark 
10155880Snordmark 	if (t->zsd_flags & ZSD_CREATE_NEEDED) {
10165880Snordmark 		t->zsd_flags &= ~ZSD_CREATE_NEEDED;
10175880Snordmark 		t->zsd_flags |= ZSD_CREATE_INPROGRESS;
10185880Snordmark 		DTRACE_PROBE2(zsd__create__inprogress,
10195880Snordmark 		    zone_t *, zone, zone_key_t, key);
10205880Snordmark 		mutex_exit(&zone->zone_lock);
10215880Snordmark 		if (lockp != NULL)
10225880Snordmark 			mutex_exit(lockp);
10235880Snordmark 
10245880Snordmark 		dropped = B_TRUE;
10255880Snordmark 		ASSERT(t->zsd_create != NULL);
10265880Snordmark 		DTRACE_PROBE2(zsd__create__start,
10275880Snordmark 		    zone_t *, zone, zone_key_t, key);
10285880Snordmark 
10295880Snordmark 		result = (*t->zsd_create)(zone->zone_id);
10305880Snordmark 
10315880Snordmark 		DTRACE_PROBE2(zsd__create__end,
10325880Snordmark 		    zone_t *, zone, voidn *, result);
10335880Snordmark 
10345880Snordmark 		ASSERT(result != NULL);
10355880Snordmark 		if (lockp != NULL)
10365880Snordmark 			mutex_enter(lockp);
10375880Snordmark 		mutex_enter(&zone->zone_lock);
10385880Snordmark 		t->zsd_data = result;
10395880Snordmark 		t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
10405880Snordmark 		t->zsd_flags |= ZSD_CREATE_COMPLETED;
10415880Snordmark 		cv_broadcast(&t->zsd_cv);
10425880Snordmark 		DTRACE_PROBE2(zsd__create__completed,
10435880Snordmark 		    zone_t *, zone, zone_key_t, key);
10445880Snordmark 	}
10455880Snordmark 	if (!zone_lock_held)
10465880Snordmark 		mutex_exit(&zone->zone_lock);
10475880Snordmark 	return (dropped);
10485880Snordmark }
10495880Snordmark 
10505880Snordmark /*
10515880Snordmark  * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
10525880Snordmark  * is set.
10535880Snordmark  * If some other thread gets here first and sets *_INPROGRESS, then
10545880Snordmark  * we wait for that thread to complete so that we can ensure that
10555880Snordmark  * all the callbacks are done when we've looped over all zones/keys.
10565880Snordmark  *
10575880Snordmark  * When we call the shutdown function, we drop the global held by the
10585880Snordmark  * caller, and return true to tell the caller it needs to re-evalute the
10595880Snordmark  * state.
10605880Snordmark  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
10615880Snordmark  * remains held on exit.
10625880Snordmark  */
10635880Snordmark static boolean_t
10645880Snordmark zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
10655880Snordmark     zone_t *zone, zone_key_t key)
10665880Snordmark {
10675880Snordmark 	struct zsd_entry *t;
10685880Snordmark 	void *data;
10695880Snordmark 	boolean_t dropped;
10705880Snordmark 
10715880Snordmark 	if (lockp != NULL) {
10725880Snordmark 		ASSERT(MUTEX_HELD(lockp));
10735880Snordmark 	}
10745880Snordmark 	if (zone_lock_held) {
10755880Snordmark 		ASSERT(MUTEX_HELD(&zone->zone_lock));
10765880Snordmark 	} else {
10775880Snordmark 		mutex_enter(&zone->zone_lock);
10785880Snordmark 	}
10795880Snordmark 
10805880Snordmark 	t = zsd_find(&zone->zone_zsd, key);
10815880Snordmark 	if (t == NULL) {
10825880Snordmark 		/*
10835880Snordmark 		 * Somebody else got here first e.g the zone going
10845880Snordmark 		 * away.
10855880Snordmark 		 */
10865880Snordmark 		if (!zone_lock_held)
10875880Snordmark 			mutex_exit(&zone->zone_lock);
10885880Snordmark 		return (B_FALSE);
10895880Snordmark 	}
10905880Snordmark 	dropped = B_FALSE;
10915880Snordmark 	if (zsd_wait_for_creator(zone, t, lockp))
10925880Snordmark 		dropped = B_TRUE;
10935880Snordmark 
10945880Snordmark 	if (zsd_wait_for_inprogress(zone, t, lockp))
10955880Snordmark 		dropped = B_TRUE;
10965880Snordmark 
10975880Snordmark 	if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
10985880Snordmark 		t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
10995880Snordmark 		t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
11005880Snordmark 		DTRACE_PROBE2(zsd__shutdown__inprogress,
11015880Snordmark 		    zone_t *, zone, zone_key_t, key);
11025880Snordmark 		mutex_exit(&zone->zone_lock);
11035880Snordmark 		if (lockp != NULL)
11045880Snordmark 			mutex_exit(lockp);
11055880Snordmark 		dropped = B_TRUE;
11065880Snordmark 
11075880Snordmark 		ASSERT(t->zsd_shutdown != NULL);
11085880Snordmark 		data = t->zsd_data;
11095880Snordmark 
11105880Snordmark 		DTRACE_PROBE2(zsd__shutdown__start,
11115880Snordmark 		    zone_t *, zone, zone_key_t, key);
11125880Snordmark 
11135880Snordmark 		(t->zsd_shutdown)(zone->zone_id, data);
11145880Snordmark 		DTRACE_PROBE2(zsd__shutdown__end,
11155880Snordmark 		    zone_t *, zone, zone_key_t, key);
11165880Snordmark 
11175880Snordmark 		if (lockp != NULL)
11185880Snordmark 			mutex_enter(lockp);
11195880Snordmark 		mutex_enter(&zone->zone_lock);
11205880Snordmark 		t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
11215880Snordmark 		t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
11225880Snordmark 		cv_broadcast(&t->zsd_cv);
11235880Snordmark 		DTRACE_PROBE2(zsd__shutdown__completed,
11245880Snordmark 		    zone_t *, zone, zone_key_t, key);
11255880Snordmark 	}
11265880Snordmark 	if (!zone_lock_held)
11275880Snordmark 		mutex_exit(&zone->zone_lock);
11285880Snordmark 	return (dropped);
11295880Snordmark }
11305880Snordmark 
11315880Snordmark /*
11325880Snordmark  * Call the destroy function for the zone and key if DESTROY_NEEDED
11335880Snordmark  * is set.
11345880Snordmark  * If some other thread gets here first and sets *_INPROGRESS, then
11355880Snordmark  * we wait for that thread to complete so that we can ensure that
11365880Snordmark  * all the callbacks are done when we've looped over all zones/keys.
11375880Snordmark  *
11385880Snordmark  * When we call the destroy function, we drop the global held by the
11395880Snordmark  * caller, and return true to tell the caller it needs to re-evalute the
11405880Snordmark  * state.
11415880Snordmark  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
11425880Snordmark  * remains held on exit.
11435880Snordmark  */
11445880Snordmark static boolean_t
11455880Snordmark zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
11465880Snordmark     zone_t *zone, zone_key_t key)
11475880Snordmark {
11485880Snordmark 	struct zsd_entry *t;
11495880Snordmark 	void *data;
11505880Snordmark 	boolean_t dropped;
11515880Snordmark 
11525880Snordmark 	if (lockp != NULL) {
11535880Snordmark 		ASSERT(MUTEX_HELD(lockp));
11545880Snordmark 	}
11555880Snordmark 	if (zone_lock_held) {
11565880Snordmark 		ASSERT(MUTEX_HELD(&zone->zone_lock));
11575880Snordmark 	} else {
11585880Snordmark 		mutex_enter(&zone->zone_lock);
11595880Snordmark 	}
11605880Snordmark 
11615880Snordmark 	t = zsd_find(&zone->zone_zsd, key);
11625880Snordmark 	if (t == NULL) {
11635880Snordmark 		/*
11645880Snordmark 		 * Somebody else got here first e.g the zone going
11655880Snordmark 		 * away.
11665880Snordmark 		 */
11675880Snordmark 		if (!zone_lock_held)
11685880Snordmark 			mutex_exit(&zone->zone_lock);
11695880Snordmark 		return (B_FALSE);
11705880Snordmark 	}
11715880Snordmark 	dropped = B_FALSE;
11725880Snordmark 	if (zsd_wait_for_creator(zone, t, lockp))
11735880Snordmark 		dropped = B_TRUE;
11745880Snordmark 
11755880Snordmark 	if (zsd_wait_for_inprogress(zone, t, lockp))
11765880Snordmark 		dropped = B_TRUE;
11775880Snordmark 
11785880Snordmark 	if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
11795880Snordmark 		t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
11805880Snordmark 		t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
11815880Snordmark 		DTRACE_PROBE2(zsd__destroy__inprogress,
11825880Snordmark 		    zone_t *, zone, zone_key_t, key);
11835880Snordmark 		mutex_exit(&zone->zone_lock);
11845880Snordmark 		if (lockp != NULL)
11855880Snordmark 			mutex_exit(lockp);
11865880Snordmark 		dropped = B_TRUE;
11875880Snordmark 
11885880Snordmark 		ASSERT(t->zsd_destroy != NULL);
11895880Snordmark 		data = t->zsd_data;
11905880Snordmark 		DTRACE_PROBE2(zsd__destroy__start,
11915880Snordmark 		    zone_t *, zone, zone_key_t, key);
11925880Snordmark 
11935880Snordmark 		(t->zsd_destroy)(zone->zone_id, data);
11945880Snordmark 		DTRACE_PROBE2(zsd__destroy__end,
11955880Snordmark 		    zone_t *, zone, zone_key_t, key);
11965880Snordmark 
11975880Snordmark 		if (lockp != NULL)
11985880Snordmark 			mutex_enter(lockp);
11995880Snordmark 		mutex_enter(&zone->zone_lock);
12005880Snordmark 		t->zsd_data = NULL;
12015880Snordmark 		t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
12025880Snordmark 		t->zsd_flags |= ZSD_DESTROY_COMPLETED;
12035880Snordmark 		cv_broadcast(&t->zsd_cv);
12045880Snordmark 		DTRACE_PROBE2(zsd__destroy__completed,
12055880Snordmark 		    zone_t *, zone, zone_key_t, key);
12065880Snordmark 	}
12075880Snordmark 	if (!zone_lock_held)
12085880Snordmark 		mutex_exit(&zone->zone_lock);
12095880Snordmark 	return (dropped);
12105880Snordmark }
12115880Snordmark 
12125880Snordmark /*
12135880Snordmark  * Wait for any CREATE_NEEDED flag to be cleared.
12145880Snordmark  * Returns true if lockp was temporarily dropped while waiting.
12155880Snordmark  */
12165880Snordmark static boolean_t
12175880Snordmark zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
12185880Snordmark {
12195880Snordmark 	boolean_t dropped = B_FALSE;
12205880Snordmark 
12215880Snordmark 	while (t->zsd_flags & ZSD_CREATE_NEEDED) {
12225880Snordmark 		DTRACE_PROBE2(zsd__wait__for__creator,
12235880Snordmark 		    zone_t *, zone, struct zsd_entry *, t);
12245880Snordmark 		if (lockp != NULL) {
12255880Snordmark 			dropped = B_TRUE;
12265880Snordmark 			mutex_exit(lockp);
12275880Snordmark 		}
12285880Snordmark 		cv_wait(&t->zsd_cv, &zone->zone_lock);
12295880Snordmark 		if (lockp != NULL) {
12305880Snordmark 			/* First drop zone_lock to preserve order */
12315880Snordmark 			mutex_exit(&zone->zone_lock);
12325880Snordmark 			mutex_enter(lockp);
12335880Snordmark 			mutex_enter(&zone->zone_lock);
12345880Snordmark 		}
12355880Snordmark 	}
12365880Snordmark 	return (dropped);
12375880Snordmark }
12385880Snordmark 
12395880Snordmark /*
12405880Snordmark  * Wait for any INPROGRESS flag to be cleared.
12415880Snordmark  * Returns true if lockp was temporarily dropped while waiting.
12425880Snordmark  */
12435880Snordmark static boolean_t
12445880Snordmark zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
12455880Snordmark {
12465880Snordmark 	boolean_t dropped = B_FALSE;
12475880Snordmark 
12485880Snordmark 	while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
12495880Snordmark 		DTRACE_PROBE2(zsd__wait__for__inprogress,
12505880Snordmark 		    zone_t *, zone, struct zsd_entry *, t);
12515880Snordmark 		if (lockp != NULL) {
12525880Snordmark 			dropped = B_TRUE;
12535880Snordmark 			mutex_exit(lockp);
12545880Snordmark 		}
12555880Snordmark 		cv_wait(&t->zsd_cv, &zone->zone_lock);
12565880Snordmark 		if (lockp != NULL) {
12575880Snordmark 			/* First drop zone_lock to preserve order */
12585880Snordmark 			mutex_exit(&zone->zone_lock);
12595880Snordmark 			mutex_enter(lockp);
12605880Snordmark 			mutex_enter(&zone->zone_lock);
12615880Snordmark 		}
12625880Snordmark 	}
12635880Snordmark 	return (dropped);
12640Sstevel@tonic-gate }
12650Sstevel@tonic-gate 
12660Sstevel@tonic-gate /*
1267789Sahrens  * Frees memory associated with the zone dataset list.
1268789Sahrens  */
1269789Sahrens static void
1270789Sahrens zone_free_datasets(zone_t *zone)
1271789Sahrens {
1272789Sahrens 	zone_dataset_t *t, *next;
1273789Sahrens 
1274789Sahrens 	for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1275789Sahrens 		next = list_next(&zone->zone_datasets, t);
1276789Sahrens 		list_remove(&zone->zone_datasets, t);
1277789Sahrens 		kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1278789Sahrens 		kmem_free(t, sizeof (*t));
1279789Sahrens 	}
1280789Sahrens 	list_destroy(&zone->zone_datasets);
1281789Sahrens }
1282789Sahrens 
1283789Sahrens /*
12840Sstevel@tonic-gate  * zone.cpu-shares resource control support.
12850Sstevel@tonic-gate  */
12860Sstevel@tonic-gate /*ARGSUSED*/
12870Sstevel@tonic-gate static rctl_qty_t
12880Sstevel@tonic-gate zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
12890Sstevel@tonic-gate {
12900Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&p->p_lock));
12910Sstevel@tonic-gate 	return (p->p_zone->zone_shares);
12920Sstevel@tonic-gate }
12930Sstevel@tonic-gate 
12940Sstevel@tonic-gate /*ARGSUSED*/
12950Sstevel@tonic-gate static int
12960Sstevel@tonic-gate zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
12970Sstevel@tonic-gate     rctl_qty_t nv)
12980Sstevel@tonic-gate {
12990Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&p->p_lock));
13000Sstevel@tonic-gate 	ASSERT(e->rcep_t == RCENTITY_ZONE);
13010Sstevel@tonic-gate 	if (e->rcep_p.zone == NULL)
13020Sstevel@tonic-gate 		return (0);
13030Sstevel@tonic-gate 
13040Sstevel@tonic-gate 	e->rcep_p.zone->zone_shares = nv;
13050Sstevel@tonic-gate 	return (0);
13060Sstevel@tonic-gate }
13070Sstevel@tonic-gate 
/*
 * Callback table registered for "zone.cpu-shares".  The initializers are
 * positional; judging by the rcop_no_* placeholders the slots are, in
 * order: action, usage, set, test.
 */
static rctl_ops_t zone_cpu_shares_ops = {
	rcop_no_action,
	zone_cpu_shares_usage,
	zone_cpu_shares_set,
	rcop_no_test
};
13140Sstevel@tonic-gate 
13153792Sakolb /*
13163792Sakolb  * zone.cpu-cap resource control support.
13173792Sakolb  */
13183792Sakolb /*ARGSUSED*/
13193792Sakolb static rctl_qty_t
13203792Sakolb zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
13213792Sakolb {
13223792Sakolb 	ASSERT(MUTEX_HELD(&p->p_lock));
13233792Sakolb 	return (cpucaps_zone_get(p->p_zone));
13243792Sakolb }
13253792Sakolb 
13263792Sakolb /*ARGSUSED*/
13273792Sakolb static int
13283792Sakolb zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
13293792Sakolb     rctl_qty_t nv)
13303792Sakolb {
13313792Sakolb 	zone_t *zone = e->rcep_p.zone;
13323792Sakolb 
13333792Sakolb 	ASSERT(MUTEX_HELD(&p->p_lock));
13343792Sakolb 	ASSERT(e->rcep_t == RCENTITY_ZONE);
13353792Sakolb 
13363792Sakolb 	if (zone == NULL)
13373792Sakolb 		return (0);
13383792Sakolb 
13393792Sakolb 	/*
13403792Sakolb 	 * set cap to the new value.
13413792Sakolb 	 */
13423792Sakolb 	return (cpucaps_zone_set(zone, nv));
13433792Sakolb }
13443792Sakolb 
/*
 * Callback table registered for "zone.cpu-cap".  The initializers are
 * positional; judging by the rcop_no_* placeholders the slots are, in
 * order: action, usage, set, test.
 */
static rctl_ops_t zone_cpu_cap_ops = {
	rcop_no_action,
	zone_cpu_cap_get,
	zone_cpu_cap_set,
	rcop_no_test
};
13513792Sakolb 
13520Sstevel@tonic-gate /*ARGSUSED*/
13530Sstevel@tonic-gate static rctl_qty_t
13540Sstevel@tonic-gate zone_lwps_usage(rctl_t *r, proc_t *p)
13550Sstevel@tonic-gate {
13560Sstevel@tonic-gate 	rctl_qty_t nlwps;
13570Sstevel@tonic-gate 	zone_t *zone = p->p_zone;
13580Sstevel@tonic-gate 
13590Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&p->p_lock));
13600Sstevel@tonic-gate 
13610Sstevel@tonic-gate 	mutex_enter(&zone->zone_nlwps_lock);
13620Sstevel@tonic-gate 	nlwps = zone->zone_nlwps;
13630Sstevel@tonic-gate 	mutex_exit(&zone->zone_nlwps_lock);
13640Sstevel@tonic-gate 
13650Sstevel@tonic-gate 	return (nlwps);
13660Sstevel@tonic-gate }
13670Sstevel@tonic-gate 
13680Sstevel@tonic-gate /*ARGSUSED*/
13690Sstevel@tonic-gate static int
13700Sstevel@tonic-gate zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
13710Sstevel@tonic-gate     rctl_qty_t incr, uint_t flags)
13720Sstevel@tonic-gate {
13730Sstevel@tonic-gate 	rctl_qty_t nlwps;
13740Sstevel@tonic-gate 
13750Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&p->p_lock));
13760Sstevel@tonic-gate 	ASSERT(e->rcep_t == RCENTITY_ZONE);
13770Sstevel@tonic-gate 	if (e->rcep_p.zone == NULL)
13780Sstevel@tonic-gate 		return (0);
13790Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
13800Sstevel@tonic-gate 	nlwps = e->rcep_p.zone->zone_nlwps;
13810Sstevel@tonic-gate 
13820Sstevel@tonic-gate 	if (nlwps + incr > rcntl->rcv_value)
13830Sstevel@tonic-gate 		return (1);
13840Sstevel@tonic-gate 
13850Sstevel@tonic-gate 	return (0);
13860Sstevel@tonic-gate }
13870Sstevel@tonic-gate 
13880Sstevel@tonic-gate /*ARGSUSED*/
13890Sstevel@tonic-gate static int
13902768Ssl108498 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
13912768Ssl108498 {
13920Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&p->p_lock));
13930Sstevel@tonic-gate 	ASSERT(e->rcep_t == RCENTITY_ZONE);
13940Sstevel@tonic-gate 	if (e->rcep_p.zone == NULL)
13950Sstevel@tonic-gate 		return (0);
13960Sstevel@tonic-gate 	e->rcep_p.zone->zone_nlwps_ctl = nv;
13970Sstevel@tonic-gate 	return (0);
13980Sstevel@tonic-gate }
13990Sstevel@tonic-gate 
/*
 * Callback table registered for "zone.max-lwps".  The initializers are
 * positional; judging by the rcop_no_* placeholders the slots are, in
 * order: action, usage, set, test.
 */
static rctl_ops_t zone_lwps_ops = {
	rcop_no_action,
	zone_lwps_usage,
	zone_lwps_set,
	zone_lwps_test,
};
14060Sstevel@tonic-gate 
14072677Sml93401 /*ARGSUSED*/
1408*12725SMenno.Lageman@Sun.COM static rctl_qty_t
1409*12725SMenno.Lageman@Sun.COM zone_procs_usage(rctl_t *r, proc_t *p)
1410*12725SMenno.Lageman@Sun.COM {
1411*12725SMenno.Lageman@Sun.COM 	rctl_qty_t nprocs;
1412*12725SMenno.Lageman@Sun.COM 	zone_t *zone = p->p_zone;
1413*12725SMenno.Lageman@Sun.COM 
1414*12725SMenno.Lageman@Sun.COM 	ASSERT(MUTEX_HELD(&p->p_lock));
1415*12725SMenno.Lageman@Sun.COM 
1416*12725SMenno.Lageman@Sun.COM 	mutex_enter(&zone->zone_nlwps_lock);
1417*12725SMenno.Lageman@Sun.COM 	nprocs = zone->zone_nprocs;
1418*12725SMenno.Lageman@Sun.COM 	mutex_exit(&zone->zone_nlwps_lock);
1419*12725SMenno.Lageman@Sun.COM 
1420*12725SMenno.Lageman@Sun.COM 	return (nprocs);
1421*12725SMenno.Lageman@Sun.COM }
1422*12725SMenno.Lageman@Sun.COM 
1423*12725SMenno.Lageman@Sun.COM /*ARGSUSED*/
1424*12725SMenno.Lageman@Sun.COM static int
1425*12725SMenno.Lageman@Sun.COM zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1426*12725SMenno.Lageman@Sun.COM     rctl_qty_t incr, uint_t flags)
1427*12725SMenno.Lageman@Sun.COM {
1428*12725SMenno.Lageman@Sun.COM 	rctl_qty_t nprocs;
1429*12725SMenno.Lageman@Sun.COM 
1430*12725SMenno.Lageman@Sun.COM 	ASSERT(MUTEX_HELD(&p->p_lock));
1431*12725SMenno.Lageman@Sun.COM 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1432*12725SMenno.Lageman@Sun.COM 	if (e->rcep_p.zone == NULL)
1433*12725SMenno.Lageman@Sun.COM 		return (0);
1434*12725SMenno.Lageman@Sun.COM 	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1435*12725SMenno.Lageman@Sun.COM 	nprocs = e->rcep_p.zone->zone_nprocs;
1436*12725SMenno.Lageman@Sun.COM 
1437*12725SMenno.Lageman@Sun.COM 	if (nprocs + incr > rcntl->rcv_value)
1438*12725SMenno.Lageman@Sun.COM 		return (1);
1439*12725SMenno.Lageman@Sun.COM 
1440*12725SMenno.Lageman@Sun.COM 	return (0);
1441*12725SMenno.Lageman@Sun.COM }
1442*12725SMenno.Lageman@Sun.COM 
1443*12725SMenno.Lageman@Sun.COM /*ARGSUSED*/
1444*12725SMenno.Lageman@Sun.COM static int
1445*12725SMenno.Lageman@Sun.COM zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1446*12725SMenno.Lageman@Sun.COM {
1447*12725SMenno.Lageman@Sun.COM 	ASSERT(MUTEX_HELD(&p->p_lock));
1448*12725SMenno.Lageman@Sun.COM 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1449*12725SMenno.Lageman@Sun.COM 	if (e->rcep_p.zone == NULL)
1450*12725SMenno.Lageman@Sun.COM 		return (0);
1451*12725SMenno.Lageman@Sun.COM 	e->rcep_p.zone->zone_nprocs_ctl = nv;
1452*12725SMenno.Lageman@Sun.COM 	return (0);
1453*12725SMenno.Lageman@Sun.COM }
1454*12725SMenno.Lageman@Sun.COM 
/*
 * Callback table registered for "zone.max-processes".  The initializers
 * are positional; judging by the rcop_no_* placeholders the slots are, in
 * order: action, usage, set, test.
 */
static rctl_ops_t zone_procs_ops = {
	rcop_no_action,
	zone_procs_usage,
	zone_procs_set,
	zone_procs_test,
};
1461*12725SMenno.Lageman@Sun.COM 
1462*12725SMenno.Lageman@Sun.COM /*ARGSUSED*/
14632677Sml93401 static int
14642677Sml93401 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
14652677Sml93401     rctl_qty_t incr, uint_t flags)
14662677Sml93401 {
14672677Sml93401 	rctl_qty_t v;
14682677Sml93401 	ASSERT(MUTEX_HELD(&p->p_lock));
14692677Sml93401 	ASSERT(e->rcep_t == RCENTITY_ZONE);
14702677Sml93401 	v = e->rcep_p.zone->zone_shmmax + incr;
14712677Sml93401 	if (v > rval->rcv_value)
14722677Sml93401 		return (1);
14732677Sml93401 	return (0);
14742677Sml93401 }
14752677Sml93401 
/*
 * Callback table for the zone shared-memory size control; only the test
 * slot is implemented.  The initializers are positional; judging by the
 * rcop_no_* placeholders the slots are, in order: action, usage, set, test.
 */
static rctl_ops_t zone_shmmax_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_shmmax_test
};
14822677Sml93401 
14832677Sml93401 /*ARGSUSED*/
14842677Sml93401 static int
14852677Sml93401 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
14862677Sml93401     rctl_qty_t incr, uint_t flags)
14872677Sml93401 {
14882677Sml93401 	rctl_qty_t v;
14892677Sml93401 	ASSERT(MUTEX_HELD(&p->p_lock));
14902677Sml93401 	ASSERT(e->rcep_t == RCENTITY_ZONE);
14912677Sml93401 	v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
14922677Sml93401 	if (v > rval->rcv_value)
14932677Sml93401 		return (1);
14942677Sml93401 	return (0);
14952677Sml93401 }
14962677Sml93401 
/*
 * Callback table for the zone shared-memory ID count control; only the
 * test slot is implemented.  The initializers are positional; judging by
 * the rcop_no_* placeholders the slots are, in order: action, usage, set,
 * test.
 */
static rctl_ops_t zone_shmmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_shmmni_test
};
15032677Sml93401 
15042677Sml93401 /*ARGSUSED*/
15052677Sml93401 static int
15062677Sml93401 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
15072677Sml93401     rctl_qty_t incr, uint_t flags)
15082677Sml93401 {
15092677Sml93401 	rctl_qty_t v;
15102677Sml93401 	ASSERT(MUTEX_HELD(&p->p_lock));
15112677Sml93401 	ASSERT(e->rcep_t == RCENTITY_ZONE);
15122677Sml93401 	v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
15132677Sml93401 	if (v > rval->rcv_value)
15142677Sml93401 		return (1);
15152677Sml93401 	return (0);
15162677Sml93401 }
15172677Sml93401 
/*
 * Callback table for the zone semaphore ID count control; only the test
 * slot is implemented.  The initializers are positional; judging by the
 * rcop_no_* placeholders the slots are, in order: action, usage, set, test.
 */
static rctl_ops_t zone_semmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_semmni_test
};
15242677Sml93401 
15252677Sml93401 /*ARGSUSED*/
15262677Sml93401 static int
15272677Sml93401 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
15282677Sml93401     rctl_qty_t incr, uint_t flags)
15292677Sml93401 {
15302677Sml93401 	rctl_qty_t v;
15312677Sml93401 	ASSERT(MUTEX_HELD(&p->p_lock));
15322677Sml93401 	ASSERT(e->rcep_t == RCENTITY_ZONE);
15332677Sml93401 	v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
15342677Sml93401 	if (v > rval->rcv_value)
15352677Sml93401 		return (1);
15362677Sml93401 	return (0);
15372677Sml93401 }
15382677Sml93401 
/*
 * Callback table for the zone message-queue ID count control; only the
 * test slot is implemented.  The initializers are positional; judging by
 * the rcop_no_* placeholders the slots are, in order: action, usage, set,
 * test.
 */
static rctl_ops_t zone_msgmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_msgmni_test
};
15452677Sml93401 
15462768Ssl108498 /*ARGSUSED*/
15472768Ssl108498 static rctl_qty_t
15482768Ssl108498 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
15492768Ssl108498 {
15502768Ssl108498 	rctl_qty_t q;
15512768Ssl108498 	ASSERT(MUTEX_HELD(&p->p_lock));
15523247Sgjelinek 	mutex_enter(&p->p_zone->zone_mem_lock);
15532768Ssl108498 	q = p->p_zone->zone_locked_mem;
15543247Sgjelinek 	mutex_exit(&p->p_zone->zone_mem_lock);
15552768Ssl108498 	return (q);
15562768Ssl108498 }
15572768Ssl108498 
15582768Ssl108498 /*ARGSUSED*/
15592768Ssl108498 static int
15602768Ssl108498 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
15612768Ssl108498     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
15622768Ssl108498 {
15632768Ssl108498 	rctl_qty_t q;
15643247Sgjelinek 	zone_t *z;
15653247Sgjelinek 
15663247Sgjelinek 	z = e->rcep_p.zone;
15672768Ssl108498 	ASSERT(MUTEX_HELD(&p->p_lock));
15683247Sgjelinek 	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
15693247Sgjelinek 	q = z->zone_locked_mem;
15702768Ssl108498 	if (q + incr > rcntl->rcv_value)
15712768Ssl108498 		return (1);
15722768Ssl108498 	return (0);
15732768Ssl108498 }
15742768Ssl108498 
15752768Ssl108498 /*ARGSUSED*/
15762768Ssl108498 static int
15772768Ssl108498 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
15782768Ssl108498     rctl_qty_t nv)
15792768Ssl108498 {
15802768Ssl108498 	ASSERT(MUTEX_HELD(&p->p_lock));
15812768Ssl108498 	ASSERT(e->rcep_t == RCENTITY_ZONE);
15822768Ssl108498 	if (e->rcep_p.zone == NULL)
15832768Ssl108498 		return (0);
15842768Ssl108498 	e->rcep_p.zone->zone_locked_mem_ctl = nv;
15852768Ssl108498 	return (0);
15862768Ssl108498 }
15872768Ssl108498 
/*
 * Callback table for the zone locked-memory control.  The initializers
 * are positional; judging by the rcop_no_* placeholders the slots are, in
 * order: action, usage, set, test.
 */
static rctl_ops_t zone_locked_mem_ops = {
	rcop_no_action,
	zone_locked_mem_usage,
	zone_locked_mem_set,
	zone_locked_mem_test
};
15942677Sml93401 
15953247Sgjelinek /*ARGSUSED*/
15963247Sgjelinek static rctl_qty_t
15973247Sgjelinek zone_max_swap_usage(rctl_t *rctl, struct proc *p)
15983247Sgjelinek {
15993247Sgjelinek 	rctl_qty_t q;
16003247Sgjelinek 	zone_t *z = p->p_zone;
16013247Sgjelinek 
16023247Sgjelinek 	ASSERT(MUTEX_HELD(&p->p_lock));
16033247Sgjelinek 	mutex_enter(&z->zone_mem_lock);
16043247Sgjelinek 	q = z->zone_max_swap;
16053247Sgjelinek 	mutex_exit(&z->zone_mem_lock);
16063247Sgjelinek 	return (q);
16073247Sgjelinek }
16083247Sgjelinek 
16093247Sgjelinek /*ARGSUSED*/
16103247Sgjelinek static int
16113247Sgjelinek zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
16123247Sgjelinek     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
16133247Sgjelinek {
16143247Sgjelinek 	rctl_qty_t q;
16153247Sgjelinek 	zone_t *z;
16163247Sgjelinek 
16173247Sgjelinek 	z = e->rcep_p.zone;
16183247Sgjelinek 	ASSERT(MUTEX_HELD(&p->p_lock));
16193247Sgjelinek 	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
16203247Sgjelinek 	q = z->zone_max_swap;
16213247Sgjelinek 	if (q + incr > rcntl->rcv_value)
16223247Sgjelinek 		return (1);
16233247Sgjelinek 	return (0);
16243247Sgjelinek }
16253247Sgjelinek 
16263247Sgjelinek /*ARGSUSED*/
16273247Sgjelinek static int
16283247Sgjelinek zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
16293247Sgjelinek     rctl_qty_t nv)
16303247Sgjelinek {
16313247Sgjelinek 	ASSERT(MUTEX_HELD(&p->p_lock));
16323247Sgjelinek 	ASSERT(e->rcep_t == RCENTITY_ZONE);
16333247Sgjelinek 	if (e->rcep_p.zone == NULL)
16343247Sgjelinek 		return (0);
16353247Sgjelinek 	e->rcep_p.zone->zone_max_swap_ctl = nv;
16363247Sgjelinek 	return (0);
16373247Sgjelinek }
16383247Sgjelinek 
/*
 * Callback table for the zone swap control.  The initializers are
 * positional; judging by the rcop_no_* placeholders the slots are, in
 * order: action, usage, set, test.
 */
static rctl_ops_t zone_max_swap_ops = {
	rcop_no_action,
	zone_max_swap_usage,
	zone_max_swap_set,
	zone_max_swap_test
};
16453247Sgjelinek 
164612633Sjohn.levon@sun.com /*ARGSUSED*/
164712633Sjohn.levon@sun.com static rctl_qty_t
164812633Sjohn.levon@sun.com zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
164912633Sjohn.levon@sun.com {
165012633Sjohn.levon@sun.com 	rctl_qty_t q;
165112633Sjohn.levon@sun.com 	zone_t *z = p->p_zone;
165212633Sjohn.levon@sun.com 
165312633Sjohn.levon@sun.com 	ASSERT(MUTEX_HELD(&p->p_lock));
165412633Sjohn.levon@sun.com 	mutex_enter(&z->zone_rctl_lock);
165512633Sjohn.levon@sun.com 	q = z->zone_max_lofi;
165612633Sjohn.levon@sun.com 	mutex_exit(&z->zone_rctl_lock);
165712633Sjohn.levon@sun.com 	return (q);
165812633Sjohn.levon@sun.com }
165912633Sjohn.levon@sun.com 
166012633Sjohn.levon@sun.com /*ARGSUSED*/
166112633Sjohn.levon@sun.com static int
166212633Sjohn.levon@sun.com zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
166312633Sjohn.levon@sun.com     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
166412633Sjohn.levon@sun.com {
166512633Sjohn.levon@sun.com 	rctl_qty_t q;
166612633Sjohn.levon@sun.com 	zone_t *z;
166712633Sjohn.levon@sun.com 
166812633Sjohn.levon@sun.com 	z = e->rcep_p.zone;
166912633Sjohn.levon@sun.com 	ASSERT(MUTEX_HELD(&p->p_lock));
167012633Sjohn.levon@sun.com 	ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
167112633Sjohn.levon@sun.com 	q = z->zone_max_lofi;
167212633Sjohn.levon@sun.com 	if (q + incr > rcntl->rcv_value)
167312633Sjohn.levon@sun.com 		return (1);
167412633Sjohn.levon@sun.com 	return (0);
167512633Sjohn.levon@sun.com }
167612633Sjohn.levon@sun.com 
167712633Sjohn.levon@sun.com /*ARGSUSED*/
167812633Sjohn.levon@sun.com static int
167912633Sjohn.levon@sun.com zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
168012633Sjohn.levon@sun.com     rctl_qty_t nv)
168112633Sjohn.levon@sun.com {
168212633Sjohn.levon@sun.com 	ASSERT(MUTEX_HELD(&p->p_lock));
168312633Sjohn.levon@sun.com 	ASSERT(e->rcep_t == RCENTITY_ZONE);
168412633Sjohn.levon@sun.com 	if (e->rcep_p.zone == NULL)
168512633Sjohn.levon@sun.com 		return (0);
168612633Sjohn.levon@sun.com 	e->rcep_p.zone->zone_max_lofi_ctl = nv;
168712633Sjohn.levon@sun.com 	return (0);
168812633Sjohn.levon@sun.com }
168912633Sjohn.levon@sun.com 
/*
 * Callback table for the zone lofi control.  The initializers are
 * positional; judging by the rcop_no_* placeholders the slots are, in
 * order: action, usage, set, test.
 */
static rctl_ops_t zone_max_lofi_ops = {
	rcop_no_action,
	zone_max_lofi_usage,
	zone_max_lofi_set,
	zone_max_lofi_test
};
169612633Sjohn.levon@sun.com 
16970Sstevel@tonic-gate /*
16980Sstevel@tonic-gate  * Helper function to brand the zone with a unique ID.
16990Sstevel@tonic-gate  */
17000Sstevel@tonic-gate static void
17010Sstevel@tonic-gate zone_uniqid(zone_t *zone)
17020Sstevel@tonic-gate {
17030Sstevel@tonic-gate 	static uint64_t uniqid = 0;
17040Sstevel@tonic-gate 
17050Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&zonehash_lock));
17060Sstevel@tonic-gate 	zone->zone_uniqid = uniqid++;
17070Sstevel@tonic-gate }
17080Sstevel@tonic-gate 
17090Sstevel@tonic-gate /*
17100Sstevel@tonic-gate  * Returns a held pointer to the "kcred" for the specified zone.
17110Sstevel@tonic-gate  */
17120Sstevel@tonic-gate struct cred *
17130Sstevel@tonic-gate zone_get_kcred(zoneid_t zoneid)
17140Sstevel@tonic-gate {
17150Sstevel@tonic-gate 	zone_t *zone;
17160Sstevel@tonic-gate 	cred_t *cr;
17170Sstevel@tonic-gate 
17180Sstevel@tonic-gate 	if ((zone = zone_find_by_id(zoneid)) == NULL)
17190Sstevel@tonic-gate 		return (NULL);
17200Sstevel@tonic-gate 	cr = zone->zone_kcred;
17210Sstevel@tonic-gate 	crhold(cr);
17220Sstevel@tonic-gate 	zone_rele(zone);
17230Sstevel@tonic-gate 	return (cr);
17240Sstevel@tonic-gate }
17250Sstevel@tonic-gate 
17263247Sgjelinek static int
17273247Sgjelinek zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
17283247Sgjelinek {
17293247Sgjelinek 	zone_t *zone = ksp->ks_private;
17303247Sgjelinek 	zone_kstat_t *zk = ksp->ks_data;
17313247Sgjelinek 
17323247Sgjelinek 	if (rw == KSTAT_WRITE)
17333247Sgjelinek 		return (EACCES);
17343247Sgjelinek 
17353247Sgjelinek 	zk->zk_usage.value.ui64 = zone->zone_locked_mem;
17363247Sgjelinek 	zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
17373247Sgjelinek 	return (0);
17383247Sgjelinek }
17393247Sgjelinek 
17403247Sgjelinek static int
1741*12725SMenno.Lageman@Sun.COM zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1742*12725SMenno.Lageman@Sun.COM {
1743*12725SMenno.Lageman@Sun.COM 	zone_t *zone = ksp->ks_private;
1744*12725SMenno.Lageman@Sun.COM 	zone_kstat_t *zk = ksp->ks_data;
1745*12725SMenno.Lageman@Sun.COM 
1746*12725SMenno.Lageman@Sun.COM 	if (rw == KSTAT_WRITE)
1747*12725SMenno.Lageman@Sun.COM 		return (EACCES);
1748*12725SMenno.Lageman@Sun.COM 
1749*12725SMenno.Lageman@Sun.COM 	zk->zk_usage.value.ui64 = zone->zone_nprocs;
1750*12725SMenno.Lageman@Sun.COM 	zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1751*12725SMenno.Lageman@Sun.COM 	return (0);
1752*12725SMenno.Lageman@Sun.COM }
1753*12725SMenno.Lageman@Sun.COM 
1754*12725SMenno.Lageman@Sun.COM static int
17553247Sgjelinek zone_swapresv_kstat_update(kstat_t *ksp, int rw)
17563247Sgjelinek {
17573247Sgjelinek 	zone_t *zone = ksp->ks_private;
17583247Sgjelinek 	zone_kstat_t *zk = ksp->ks_data;
17593247Sgjelinek 
17603247Sgjelinek 	if (rw == KSTAT_WRITE)
17613247Sgjelinek 		return (EACCES);
17623247Sgjelinek 
17633247Sgjelinek 	zk->zk_usage.value.ui64 = zone->zone_max_swap;
17643247Sgjelinek 	zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
17653247Sgjelinek 	return (0);
17663247Sgjelinek }
17673247Sgjelinek 
1768*12725SMenno.Lageman@Sun.COM static kstat_t *
1769*12725SMenno.Lageman@Sun.COM zone_kstat_create_common(zone_t *zone, char *name,
1770*12725SMenno.Lageman@Sun.COM     int (*updatefunc) (kstat_t *, int))
17713247Sgjelinek {
17723247Sgjelinek 	kstat_t *ksp;
17733247Sgjelinek 	zone_kstat_t *zk;
17743247Sgjelinek 
1775*12725SMenno.Lageman@Sun.COM 	ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
17763247Sgjelinek 	    sizeof (zone_kstat_t) / sizeof (kstat_named_t),
17773247Sgjelinek 	    KSTAT_FLAG_VIRTUAL);
17783247Sgjelinek 
17793247Sgjelinek 	if (ksp == NULL)
1780*12725SMenno.Lageman@Sun.COM 		return (NULL);
17813247Sgjelinek 
17823247Sgjelinek 	zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
17833247Sgjelinek 	ksp->ks_data_size += strlen(zone->zone_name) + 1;
17843247Sgjelinek 	kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
17853247Sgjelinek 	kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
17863247Sgjelinek 	kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
17873247Sgjelinek 	kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1788*12725SMenno.Lageman@Sun.COM 	ksp->ks_update = updatefunc;
17893247Sgjelinek 	ksp->ks_private = zone;
17903247Sgjelinek 	kstat_install(ksp);
1791*12725SMenno.Lageman@Sun.COM 	return (ksp);
1792*12725SMenno.Lageman@Sun.COM }
1793*12725SMenno.Lageman@Sun.COM 
/*
 * Create the three per-zone kstats ("lockedmem", "swapresv", "nprocs") and
 * remember each in the zone.  zone_kstat_create_common() may return NULL,
 * in which case the corresponding zone field is simply left NULL; no error
 * is reported from here.
 */
static void
zone_kstat_create(zone_t *zone)
{
	zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
	    "lockedmem", zone_lockedmem_kstat_update);
	zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
	    "swapresv", zone_swapresv_kstat_update);
	zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
	    "nprocs", zone_nprocs_kstat_update);
}
1804*12725SMenno.Lageman@Sun.COM 
1805*12725SMenno.Lageman@Sun.COM static void
1806*12725SMenno.Lageman@Sun.COM zone_kstat_delete_common(kstat_t **pkstat)
1807*12725SMenno.Lageman@Sun.COM {
1808*12725SMenno.Lageman@Sun.COM 	void *data;
1809*12725SMenno.Lageman@Sun.COM 
1810*12725SMenno.Lageman@Sun.COM 	if (*pkstat != NULL) {
1811*12725SMenno.Lageman@Sun.COM 		data = (*pkstat)->ks_data;
1812*12725SMenno.Lageman@Sun.COM 		kstat_delete(*pkstat);
1813*12725SMenno.Lageman@Sun.COM 		kmem_free(data, sizeof (zone_kstat_t));
1814*12725SMenno.Lageman@Sun.COM 		*pkstat = NULL;
1815*12725SMenno.Lageman@Sun.COM 	}
18163247Sgjelinek }
18173247Sgjelinek 
/*
 * Tear down the zone's lockedmem, swapresv and nprocs kstats.  Each field
 * is passed by address so the helper can reset it to NULL; kstats that
 * were never created (NULL) are skipped by the helper.
 */
static void
zone_kstat_delete(zone_t *zone)
{
	zone_kstat_delete_common(&zone->zone_lockedmem_kstat);
	zone_kstat_delete_common(&zone->zone_swapresv_kstat);
	zone_kstat_delete_common(&zone->zone_nprocs_kstat);
}
18253247Sgjelinek 
/*
 * Called very early on in boot to initialize the ZSD list so that
 * zone_key_create() can be called before zone_init().  It also initializes
 * portions of zone0 which may be used before zone_init() is called.  The
 * variable "global_zone" will be set when zone0 is fully initialized by
 * zone_init().
 *
 * In order, this function: initializes the global locks and lists
 * (zonehash_lock, zsd_key_lock, zsd_registered_keys, zone_active,
 * zone_deathrow); initializes zone0's locks, counters, limits and identity
 * fields; puts zone0 on zone_active; and ties p0 to zone0.
 */
void
zone_zsd_init(void)
{
	/* Global locks and lists used by the zone and ZSD code. */
	mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_create(&zone_active, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));
	list_create(&zone_deathrow, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));

	/*
	 * zone0: usage counters start at zero and the matching *_ctl limits
	 * start at their maximum (INT_MAX or UINT64_MAX).  Note that
	 * zone_max_swap is only asserted to be zero, not assigned.
	 */
	mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
	zone0.zone_shares = 1;
	zone0.zone_nlwps = 0;
	zone0.zone_nlwps_ctl = INT_MAX;
	zone0.zone_nprocs = 0;
	zone0.zone_nprocs_ctl = INT_MAX;
	zone0.zone_locked_mem = 0;
	zone0.zone_locked_mem_ctl = UINT64_MAX;
	ASSERT(zone0.zone_max_swap == 0);
	zone0.zone_max_swap_ctl = UINT64_MAX;
	zone0.zone_max_lofi = 0;
	zone0.zone_max_lofi_ctl = UINT64_MAX;
	zone0.zone_shmmax = 0;
	zone0.zone_ipc.ipcq_shmmni = 0;
	zone0.zone_ipc.ipcq_semmni = 0;
	zone0.zone_ipc.ipcq_msgmni = 0;
	/* Identity and state of the global zone. */
	zone0.zone_name = GLOBAL_ZONENAME;
	zone0.zone_nodename = utsname.nodename;
	zone0.zone_domain = srpc_domain;
	zone0.zone_hostid = HW_INVALID_HOSTID;
	zone0.zone_fs_allowed = NULL;
	zone0.zone_ref = 1;
	zone0.zone_id = GLOBAL_ZONEID;
	zone0.zone_status = ZONE_IS_RUNNING;
	zone0.zone_rootpath = "/";
	/* 2 presumably counts "/" plus its terminating NUL - TODO confirm */
	zone0.zone_rootpathlen = 2;
	zone0.zone_psetid = ZONE_PS_INVAL;
	zone0.zone_ncpus = 0;
	zone0.zone_ncpus_online = 0;
	zone0.zone_proc_initpid = 1;
	zone0.zone_initname = initname;
	/* No kstats exist for zone0 at this point. */
	zone0.zone_lockedmem_kstat = NULL;
	zone0.zone_swapresv_kstat = NULL;
	zone0.zone_nprocs_kstat = NULL;
	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_insert_head(&zone_active, &zone0);

	/*
	 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
	 * to anything meaningful.  It is assigned to be 'rootdir' in
	 * vfs_mountroot().
	 */
	zone0.zone_rootvp = NULL;
	zone0.zone_vfslist = NULL;
	zone0.zone_bootargs = initargs;
	zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
	/*
	 * The global zone has all privileges
	 */
	priv_fillset(zone0.zone_privset);
	/*
	 * Add p0 to the global zone
	 */
	zone0.zone_zsched = &p0;
	p0.p_zone = &zone0;
}
19040Sstevel@tonic-gate 
19050Sstevel@tonic-gate /*
19061676Sjpk  * Compute a hash value based on the contents of the label and the DOI.  The
19071676Sjpk  * hash algorithm is somewhat arbitrary, but is based on the observation that
19081676Sjpk  * humans will likely pick labels that differ by amounts that work out to be
19091676Sjpk  * multiples of the number of hash chains, and thus stirring in some primes
19101676Sjpk  * should help.
19111676Sjpk  */
19121676Sjpk static uint_t
19131676Sjpk hash_bylabel(void *hdata, mod_hash_key_t key)
19141676Sjpk {
19151676Sjpk 	const ts_label_t *lab = (ts_label_t *)key;
19161676Sjpk 	const uint32_t *up, *ue;
19171676Sjpk 	uint_t hash;
19181676Sjpk 	int i;
19191676Sjpk 
19201676Sjpk 	_NOTE(ARGUNUSED(hdata));
19211676Sjpk 
19221676Sjpk 	hash = lab->tsl_doi + (lab->tsl_doi << 1);
19231676Sjpk 	/* we depend on alignment of label, but not representation */
19241676Sjpk 	up = (const uint32_t *)&lab->tsl_label;
19251676Sjpk 	ue = up + sizeof (lab->tsl_label) / sizeof (*up);
19261676Sjpk 	i = 1;
19271676Sjpk 	while (up < ue) {
19281676Sjpk 		/* using 2^n + 1, 1 <= n <= 16 as source of many primes */
19291676Sjpk 		hash += *up + (*up << ((i % 16) + 1));
19301676Sjpk 		up++;
19311676Sjpk 		i++;
19321676Sjpk 	}
19331676Sjpk 	return (hash);
19341676Sjpk }
19351676Sjpk 
19361676Sjpk /*
19371676Sjpk  * All that mod_hash cares about here is zero (equal) versus non-zero (not
19381676Sjpk  * equal).  This may need to be changed if less than / greater than is ever
19391676Sjpk  * needed.
19401676Sjpk  */
19411676Sjpk static int
19421676Sjpk hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
19431676Sjpk {
19441676Sjpk 	ts_label_t *lab1 = (ts_label_t *)key1;
19451676Sjpk 	ts_label_t *lab2 = (ts_label_t *)key2;
19461676Sjpk 
19471676Sjpk 	return (label_equal(lab1, lab2) ? 0 : 1);
19481676Sjpk }
19491676Sjpk 
19501676Sjpk /*
19510Sstevel@tonic-gate  * Called by main() to initialize the zones framework.
19520Sstevel@tonic-gate  */
19530Sstevel@tonic-gate void
19540Sstevel@tonic-gate zone_init(void)
19550Sstevel@tonic-gate {
19560Sstevel@tonic-gate 	rctl_dict_entry_t *rde;
19570Sstevel@tonic-gate 	rctl_val_t *dval;
19580Sstevel@tonic-gate 	rctl_set_t *set;
19590Sstevel@tonic-gate 	rctl_alloc_gp_t *gp;
19600Sstevel@tonic-gate 	rctl_entity_p_t e;
19611166Sdstaff 	int res;
19620Sstevel@tonic-gate 
19630Sstevel@tonic-gate 	ASSERT(curproc == &p0);
19640Sstevel@tonic-gate 
19650Sstevel@tonic-gate 	/*
19660Sstevel@tonic-gate 	 * Create ID space for zone IDs.  ID 0 is reserved for the
19670Sstevel@tonic-gate 	 * global zone.
19680Sstevel@tonic-gate 	 */
19690Sstevel@tonic-gate 	zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
19700Sstevel@tonic-gate 
19710Sstevel@tonic-gate 	/*
19720Sstevel@tonic-gate 	 * Initialize generic zone resource controls, if any.
19730Sstevel@tonic-gate 	 */
19740Sstevel@tonic-gate 	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
19750Sstevel@tonic-gate 	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
19761996Sml93401 	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
19773792Sakolb 	    FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
19783792Sakolb 
19793792Sakolb 	rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
19803792Sakolb 	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
19813792Sakolb 	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
19823792Sakolb 	    RCTL_GLOBAL_INFINITE,
19833792Sakolb 	    MAXCAP, MAXCAP, &zone_cpu_cap_ops);
19840Sstevel@tonic-gate 
19850Sstevel@tonic-gate 	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
19860Sstevel@tonic-gate 	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
19870Sstevel@tonic-gate 	    INT_MAX, INT_MAX, &zone_lwps_ops);
1988*12725SMenno.Lageman@Sun.COM 
1989*12725SMenno.Lageman@Sun.COM 	rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
1990*12725SMenno.Lageman@Sun.COM 	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
1991*12725SMenno.Lageman@Sun.COM 	    INT_MAX, INT_MAX, &zone_procs_ops);
1992*12725SMenno.Lageman@Sun.COM 
19930Sstevel@tonic-gate 	/*
19942677Sml93401 	 * System V IPC resource controls
19952677Sml93401 	 */
19962677Sml93401 	rc_zone_msgmni = rctl_register("zone.max-msg-ids",
19972677Sml93401 	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
19982677Sml93401 	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
19992677Sml93401 
20002677Sml93401 	rc_zone_semmni = rctl_register("zone.max-sem-ids",
20012677Sml93401 	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
20022677Sml93401 	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
20032677Sml93401 
20042677Sml93401 	rc_zone_shmmni = rctl_register("zone.max-shm-ids",
20052677Sml93401 	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
20062677Sml93401 	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
20072677Sml93401 
20082677Sml93401 	rc_zone_shmmax = rctl_register("zone.max-shm-memory",
20092677Sml93401 	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
20102677Sml93401 	    RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
20112677Sml93401 
20122677Sml93401 	/*
20130Sstevel@tonic-gate 	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
20140Sstevel@tonic-gate 	 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
20150Sstevel@tonic-gate 	 */
20160Sstevel@tonic-gate 	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
20170Sstevel@tonic-gate 	bzero(dval, sizeof (rctl_val_t));
20180Sstevel@tonic-gate 	dval->rcv_value = 1;
20190Sstevel@tonic-gate 	dval->rcv_privilege = RCPRIV_PRIVILEGED;
20200Sstevel@tonic-gate 	dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
20210Sstevel@tonic-gate 	dval->rcv_action_recip_pid = -1;
20220Sstevel@tonic-gate 
20230Sstevel@tonic-gate 	rde = rctl_dict_lookup("zone.cpu-shares");
20240Sstevel@tonic-gate 	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
20250Sstevel@tonic-gate 
20262768Ssl108498 	rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
20272768Ssl108498 	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
20282768Ssl108498 	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
20292768Ssl108498 	    &zone_locked_mem_ops);
20303247Sgjelinek 
20313247Sgjelinek 	rc_zone_max_swap = rctl_register("zone.max-swap",
20323247Sgjelinek 	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
20333247Sgjelinek 	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
20343247Sgjelinek 	    &zone_max_swap_ops);
20353247Sgjelinek 
203612633Sjohn.levon@sun.com 	rc_zone_max_lofi = rctl_register("zone.max-lofi",
203712633Sjohn.levon@sun.com 	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
203812633Sjohn.levon@sun.com 	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
203912633Sjohn.levon@sun.com 	    &zone_max_lofi_ops);
204012633Sjohn.levon@sun.com 
20410Sstevel@tonic-gate 	/*
20420Sstevel@tonic-gate 	 * Initialize the ``global zone''.
20430Sstevel@tonic-gate 	 */
20440Sstevel@tonic-gate 	set = rctl_set_create();
20450Sstevel@tonic-gate 	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
20460Sstevel@tonic-gate 	mutex_enter(&p0.p_lock);
20470Sstevel@tonic-gate 	e.rcep_p.zone = &zone0;
20480Sstevel@tonic-gate 	e.rcep_t = RCENTITY_ZONE;
20490Sstevel@tonic-gate 	zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
20500Sstevel@tonic-gate 	    gp);
20510Sstevel@tonic-gate 
20520Sstevel@tonic-gate 	zone0.zone_nlwps = p0.p_lwpcnt;
2053*12725SMenno.Lageman@Sun.COM 	zone0.zone_nprocs = 1;
20540Sstevel@tonic-gate 	zone0.zone_ntasks = 1;
20550Sstevel@tonic-gate 	mutex_exit(&p0.p_lock);
20562712Snn35248 	zone0.zone_restart_init = B_TRUE;
20572712Snn35248 	zone0.zone_brand = &native_brand;
20580Sstevel@tonic-gate 	rctl_prealloc_destroy(gp);
20590Sstevel@tonic-gate 	/*
20603247Sgjelinek 	 * pool_default hasn't been initialized yet, so we let pool_init()
20613247Sgjelinek 	 * take care of making sure the global zone is in the default pool.
20620Sstevel@tonic-gate 	 */
20631676Sjpk 
20641676Sjpk 	/*
20653247Sgjelinek 	 * Initialize global zone kstats
20663247Sgjelinek 	 */
20673247Sgjelinek 	zone_kstat_create(&zone0);
20683247Sgjelinek 
20693247Sgjelinek 	/*
20701676Sjpk 	 * Initialize zone label.
20711676Sjpk 	 * mlp are initialized when tnzonecfg is loaded.
20721676Sjpk 	 */
20731676Sjpk 	zone0.zone_slabel = l_admin_low;
20741676Sjpk 	rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
20751676Sjpk 	label_hold(l_admin_low);
20761676Sjpk 
207710910SRobert.Harris@Sun.COM 	/*
207810910SRobert.Harris@Sun.COM 	 * Initialise the lock for the database structure used by mntfs.
207910910SRobert.Harris@Sun.COM 	 */
208010910SRobert.Harris@Sun.COM 	rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
208110910SRobert.Harris@Sun.COM 
20820Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
20830Sstevel@tonic-gate 	zone_uniqid(&zone0);
20840Sstevel@tonic-gate 	ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
20851676Sjpk 
20860Sstevel@tonic-gate 	zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
20870Sstevel@tonic-gate 	    mod_hash_null_valdtor);
20880Sstevel@tonic-gate 	zonehashbyname = mod_hash_create_strhash("zone_by_name",
20890Sstevel@tonic-gate 	    zone_hash_size, mod_hash_null_valdtor);
20901676Sjpk 	/*
20911676Sjpk 	 * maintain zonehashbylabel only for labeled systems
20921676Sjpk 	 */
20931676Sjpk 	if (is_system_labeled())
20941676Sjpk 		zonehashbylabel = mod_hash_create_extended("zone_by_label",
20951676Sjpk 		    zone_hash_size, mod_hash_null_keydtor,
20961676Sjpk 		    mod_hash_null_valdtor, hash_bylabel, NULL,
20971676Sjpk 		    hash_labelkey_cmp, KM_SLEEP);
20980Sstevel@tonic-gate 	zonecount = 1;
20990Sstevel@tonic-gate 
21000Sstevel@tonic-gate 	(void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
21010Sstevel@tonic-gate 	    (mod_hash_val_t)&zone0);
21020Sstevel@tonic-gate 	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
21030Sstevel@tonic-gate 	    (mod_hash_val_t)&zone0);
21041769Scarlsonj 	if (is_system_labeled()) {
21051769Scarlsonj 		zone0.zone_flags |= ZF_HASHED_LABEL;
21061676Sjpk 		(void) mod_hash_insert(zonehashbylabel,
21071676Sjpk 		    (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
21081769Scarlsonj 	}
21091676Sjpk 	mutex_exit(&zonehash_lock);
21101676Sjpk 
21110Sstevel@tonic-gate 	/*
21120Sstevel@tonic-gate 	 * We avoid setting zone_kcred until now, since kcred is initialized
21130Sstevel@tonic-gate 	 * sometime after zone_zsd_init() and before zone_init().
21140Sstevel@tonic-gate 	 */
21150Sstevel@tonic-gate 	zone0.zone_kcred = kcred;
21160Sstevel@tonic-gate 	/*
21170Sstevel@tonic-gate 	 * The global zone is fully initialized (except for zone_rootvp which
21180Sstevel@tonic-gate 	 * will be set when the root filesystem is mounted).
21190Sstevel@tonic-gate 	 */
21200Sstevel@tonic-gate 	global_zone = &zone0;
21211166Sdstaff 
21221166Sdstaff 	/*
21231166Sdstaff 	 * Setup an event channel to send zone status change notifications on
21241166Sdstaff 	 */
21251166Sdstaff 	res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
21261166Sdstaff 	    EVCH_CREAT);
21271166Sdstaff 
21281166Sdstaff 	if (res)
21291166Sdstaff 		panic("Sysevent_evc_bind failed during zone setup.\n");
21303247Sgjelinek 
21310Sstevel@tonic-gate }
21320Sstevel@tonic-gate 
21330Sstevel@tonic-gate static void
21340Sstevel@tonic-gate zone_free(zone_t *zone)
21350Sstevel@tonic-gate {
21360Sstevel@tonic-gate 	ASSERT(zone != global_zone);
21370Sstevel@tonic-gate 	ASSERT(zone->zone_ntasks == 0);
21380Sstevel@tonic-gate 	ASSERT(zone->zone_nlwps == 0);
2139*12725SMenno.Lageman@Sun.COM 	ASSERT(zone->zone_nprocs == 0);
21400Sstevel@tonic-gate 	ASSERT(zone->zone_cred_ref == 0);
21410Sstevel@tonic-gate 	ASSERT(zone->zone_kcred == NULL);
21420Sstevel@tonic-gate 	ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
21430Sstevel@tonic-gate 	    zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
21440Sstevel@tonic-gate 
21453792Sakolb 	/*
21463792Sakolb 	 * Remove any zone caps.
21473792Sakolb 	 */
21483792Sakolb 	cpucaps_zone_remove(zone);
21493792Sakolb 
21503792Sakolb 	ASSERT(zone->zone_cpucap == NULL);
21513792Sakolb 
21520Sstevel@tonic-gate 	/* remove from deathrow list */
21530Sstevel@tonic-gate 	if (zone_status_get(zone) == ZONE_IS_DEAD) {
21540Sstevel@tonic-gate 		ASSERT(zone->zone_ref == 0);
21550Sstevel@tonic-gate 		mutex_enter(&zone_deathrow_lock);
21560Sstevel@tonic-gate 		list_remove(&zone_deathrow, zone);
21570Sstevel@tonic-gate 		mutex_exit(&zone_deathrow_lock);
21580Sstevel@tonic-gate 	}
21590Sstevel@tonic-gate 
21600Sstevel@tonic-gate 	zone_free_zsd(zone);
2161789Sahrens 	zone_free_datasets(zone);
216210616SSebastien.Roy@Sun.COM 	list_destroy(&zone->zone_dl_list);
21630Sstevel@tonic-gate 
21640Sstevel@tonic-gate 	if (zone->zone_rootvp != NULL)
21650Sstevel@tonic-gate 		VN_RELE(zone->zone_rootvp);
21660Sstevel@tonic-gate 	if (zone->zone_rootpath)
21670Sstevel@tonic-gate 		kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
21680Sstevel@tonic-gate 	if (zone->zone_name != NULL)
21690Sstevel@tonic-gate 		kmem_free(zone->zone_name, ZONENAME_MAX);
21701676Sjpk 	if (zone->zone_slabel != NULL)
21711676Sjpk 		label_rele(zone->zone_slabel);
21720Sstevel@tonic-gate 	if (zone->zone_nodename != NULL)
21730Sstevel@tonic-gate 		kmem_free(zone->zone_nodename, _SYS_NMLN);
21740Sstevel@tonic-gate 	if (zone->zone_domain != NULL)
21750Sstevel@tonic-gate 		kmem_free(zone->zone_domain, _SYS_NMLN);
21760Sstevel@tonic-gate 	if (zone->zone_privset != NULL)
21770Sstevel@tonic-gate 		kmem_free(zone->zone_privset, sizeof (priv_set_t));
21780Sstevel@tonic-gate 	if (zone->zone_rctls != NULL)
21790Sstevel@tonic-gate 		rctl_set_free(zone->zone_rctls);
21800Sstevel@tonic-gate 	if (zone->zone_bootargs != NULL)
218112633Sjohn.levon@sun.com 		strfree(zone->zone_bootargs);
21822267Sdp 	if (zone->zone_initname != NULL)
218312633Sjohn.levon@sun.com 		strfree(zone->zone_initname);
218412633Sjohn.levon@sun.com 	if (zone->zone_fs_allowed != NULL)
218512633Sjohn.levon@sun.com 		strfree(zone->zone_fs_allowed);
218612273SCasper.Dik@Sun.COM 	if (zone->zone_pfexecd != NULL)
218712273SCasper.Dik@Sun.COM 		klpd_freelist(&zone->zone_pfexecd);
21880Sstevel@tonic-gate 	id_free(zoneid_space, zone->zone_id);
21890Sstevel@tonic-gate 	mutex_destroy(&zone->zone_lock);
21900Sstevel@tonic-gate 	cv_destroy(&zone->zone_cv);
21911676Sjpk 	rw_destroy(&zone->zone_mlps.mlpl_rwlock);
219210910SRobert.Harris@Sun.COM 	rw_destroy(&zone->zone_mntfs_db_lock);
21930Sstevel@tonic-gate 	kmem_free(zone, sizeof (zone_t));
21940Sstevel@tonic-gate }
21950Sstevel@tonic-gate 
21960Sstevel@tonic-gate /*
21970Sstevel@tonic-gate  * See block comment at the top of this file for information about zone
21980Sstevel@tonic-gate  * status values.
21990Sstevel@tonic-gate  */
22000Sstevel@tonic-gate /*
22010Sstevel@tonic-gate  * Convenience function for setting zone status.
22020Sstevel@tonic-gate  */
22030Sstevel@tonic-gate static void
22040Sstevel@tonic-gate zone_status_set(zone_t *zone, zone_status_t status)
22050Sstevel@tonic-gate {
22061166Sdstaff 
22071166Sdstaff 	nvlist_t *nvl = NULL;
22080Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&zone_status_lock));
22090Sstevel@tonic-gate 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
22100Sstevel@tonic-gate 	    status >= zone_status_get(zone));
22111166Sdstaff 
22121166Sdstaff 	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
22131166Sdstaff 	    nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
22141166Sdstaff 	    nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
22152267Sdp 	    zone_status_table[status]) ||
22161166Sdstaff 	    nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
22172267Sdp 	    zone_status_table[zone->zone_status]) ||
22181166Sdstaff 	    nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
22191166Sdstaff 	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
22201166Sdstaff 	    sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
22212267Sdp 	    ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
22221166Sdstaff #ifdef DEBUG
22231166Sdstaff 		(void) printf(
22241166Sdstaff 		    "Failed to allocate and send zone state change event.\n");
22251166Sdstaff #endif
22261166Sdstaff 	}
22271166Sdstaff 	nvlist_free(nvl);
22281166Sdstaff 
22290Sstevel@tonic-gate 	zone->zone_status = status;
22301166Sdstaff 
22310Sstevel@tonic-gate 	cv_broadcast(&zone->zone_cv);
22320Sstevel@tonic-gate }
22330Sstevel@tonic-gate 
22340Sstevel@tonic-gate /*
22350Sstevel@tonic-gate  * Public function to retrieve the zone status.  The zone status may
22360Sstevel@tonic-gate  * change after it is retrieved.
22370Sstevel@tonic-gate  */
22380Sstevel@tonic-gate zone_status_t
22390Sstevel@tonic-gate zone_status_get(zone_t *zone)
22400Sstevel@tonic-gate {
22410Sstevel@tonic-gate 	return (zone->zone_status);
22420Sstevel@tonic-gate }
22430Sstevel@tonic-gate 
22440Sstevel@tonic-gate static int
22450Sstevel@tonic-gate zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
22460Sstevel@tonic-gate {
224712633Sjohn.levon@sun.com 	char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
22482267Sdp 	int err = 0;
22492267Sdp 
22502267Sdp 	ASSERT(zone != global_zone);
225112633Sjohn.levon@sun.com 	if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
22522267Sdp 		goto done;	/* EFAULT or ENAMETOOLONG */
22532267Sdp 
22542267Sdp 	if (zone->zone_bootargs != NULL)
225512633Sjohn.levon@sun.com 		strfree(zone->zone_bootargs);
225612633Sjohn.levon@sun.com 
225712633Sjohn.levon@sun.com 	zone->zone_bootargs = strdup(buf);
22582267Sdp 
22592267Sdp done:
226012633Sjohn.levon@sun.com 	kmem_free(buf, BOOTARGS_MAX);
22612267Sdp 	return (err);
22622267Sdp }
22632267Sdp 
22642267Sdp static int
22654141Sedp zone_set_brand(zone_t *zone, const char *brand)
22664141Sedp {
22674141Sedp 	struct brand_attr *attrp;
22684141Sedp 	brand_t *bp;
22694141Sedp 
22704141Sedp 	attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
22714141Sedp 	if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
22724141Sedp 		kmem_free(attrp, sizeof (struct brand_attr));
22734141Sedp 		return (EFAULT);
22744141Sedp 	}
22754141Sedp 
22764141Sedp 	bp = brand_register_zone(attrp);
22774141Sedp 	kmem_free(attrp, sizeof (struct brand_attr));
22784141Sedp 	if (bp == NULL)
22794141Sedp 		return (EINVAL);
22804141Sedp 
22814141Sedp 	/*
22824141Sedp 	 * This is the only place where a zone can change it's brand.
22834141Sedp 	 * We already need to hold zone_status_lock to check the zone
22844141Sedp 	 * status, so we'll just use that lock to serialize zone
22854141Sedp 	 * branding requests as well.
22864141Sedp 	 */
22874141Sedp 	mutex_enter(&zone_status_lock);
22884141Sedp 
22894141Sedp 	/* Re-Branding is not allowed and the zone can't be booted yet */
22904141Sedp 	if ((ZONE_IS_BRANDED(zone)) ||
22914141Sedp 	    (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
22924141Sedp 		mutex_exit(&zone_status_lock);
22934141Sedp 		brand_unregister_zone(bp);
22944141Sedp 		return (EINVAL);
22954141Sedp 	}
22964141Sedp 
22974888Seh208807 	/* set up the brand specific data */
22984141Sedp 	zone->zone_brand = bp;
22994888Seh208807 	ZBROP(zone)->b_init_brand_data(zone);
23004888Seh208807 
23014141Sedp 	mutex_exit(&zone_status_lock);
23024141Sedp 	return (0);
23034141Sedp }
23044141Sedp 
23054141Sedp static int
230612633Sjohn.levon@sun.com zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
230712633Sjohn.levon@sun.com {
230812633Sjohn.levon@sun.com 	char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
230912633Sjohn.levon@sun.com 	int err = 0;
231012633Sjohn.levon@sun.com 
231112633Sjohn.levon@sun.com 	ASSERT(zone != global_zone);
231212633Sjohn.levon@sun.com 	if ((err = copyinstr(zone_fs_allowed, buf,
231312633Sjohn.levon@sun.com 	    ZONE_FS_ALLOWED_MAX, NULL)) != 0)
231412633Sjohn.levon@sun.com 		goto done;
231512633Sjohn.levon@sun.com 
231612633Sjohn.levon@sun.com 	if (zone->zone_fs_allowed != NULL)
231712633Sjohn.levon@sun.com 		strfree(zone->zone_fs_allowed);
231812633Sjohn.levon@sun.com 
231912633Sjohn.levon@sun.com 	zone->zone_fs_allowed = strdup(buf);
232012633Sjohn.levon@sun.com 
232112633Sjohn.levon@sun.com done:
232212633Sjohn.levon@sun.com 	kmem_free(buf, ZONE_FS_ALLOWED_MAX);
232312633Sjohn.levon@sun.com 	return (err);
232412633Sjohn.levon@sun.com }
232512633Sjohn.levon@sun.com 
232612633Sjohn.levon@sun.com static int
23272267Sdp zone_set_initname(zone_t *zone, const char *zone_initname)
23282267Sdp {
23292267Sdp 	char initname[INITNAME_SZ];
23300Sstevel@tonic-gate 	size_t len;
23312267Sdp 	int err = 0;
23322267Sdp 
23332267Sdp 	ASSERT(zone != global_zone);
23342267Sdp 	if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
23350Sstevel@tonic-gate 		return (err);	/* EFAULT or ENAMETOOLONG */
23362267Sdp 
23372267Sdp 	if (zone->zone_initname != NULL)
233812633Sjohn.levon@sun.com 		strfree(zone->zone_initname);
23392267Sdp 
23402267Sdp 	zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
23412267Sdp 	(void) strcpy(zone->zone_initname, initname);
23420Sstevel@tonic-gate 	return (0);
23430Sstevel@tonic-gate }
23440Sstevel@tonic-gate 
23453247Sgjelinek static int
23463247Sgjelinek zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
23473247Sgjelinek {
23483247Sgjelinek 	uint64_t mcap;
23493247Sgjelinek 	int err = 0;
23503247Sgjelinek 
23513247Sgjelinek 	if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
23523247Sgjelinek 		zone->zone_phys_mcap = mcap;
23533247Sgjelinek 
23543247Sgjelinek 	return (err);
23553247Sgjelinek }
23563247Sgjelinek 
23573247Sgjelinek static int
23583247Sgjelinek zone_set_sched_class(zone_t *zone, const char *new_class)
23593247Sgjelinek {
23603247Sgjelinek 	char sched_class[PC_CLNMSZ];
23613247Sgjelinek 	id_t classid;
23623247Sgjelinek 	int err;
23633247Sgjelinek 
23643247Sgjelinek 	ASSERT(zone != global_zone);
23653247Sgjelinek 	if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
23663247Sgjelinek 		return (err);	/* EFAULT or ENAMETOOLONG */
23673247Sgjelinek 
236811173SJonathan.Adams@Sun.COM 	if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
23693247Sgjelinek 		return (set_errno(EINVAL));
23703247Sgjelinek 	zone->zone_defaultcid = classid;
23713247Sgjelinek 	ASSERT(zone->zone_defaultcid > 0 &&
23723247Sgjelinek 	    zone->zone_defaultcid < loaded_classes);
23733247Sgjelinek 
23743247Sgjelinek 	return (0);
23753247Sgjelinek }
23763247Sgjelinek 
23770Sstevel@tonic-gate /*
23780Sstevel@tonic-gate  * Block indefinitely waiting for (zone_status >= status)
23790Sstevel@tonic-gate  */
23800Sstevel@tonic-gate void
23810Sstevel@tonic-gate zone_status_wait(zone_t *zone, zone_status_t status)
23820Sstevel@tonic-gate {
23830Sstevel@tonic-gate 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
23840Sstevel@tonic-gate 
23850Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
23860Sstevel@tonic-gate 	while (zone->zone_status < status) {
23870Sstevel@tonic-gate 		cv_wait(&zone->zone_cv, &zone_status_lock);
23880Sstevel@tonic-gate 	}
23890Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
23900Sstevel@tonic-gate }
23910Sstevel@tonic-gate 
23920Sstevel@tonic-gate /*
23930Sstevel@tonic-gate  * Private CPR-safe version of zone_status_wait().
23940Sstevel@tonic-gate  */
23950Sstevel@tonic-gate static void
23960Sstevel@tonic-gate zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
23970Sstevel@tonic-gate {
23980Sstevel@tonic-gate 	callb_cpr_t cprinfo;
23990Sstevel@tonic-gate 
24000Sstevel@tonic-gate 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
24010Sstevel@tonic-gate 
24020Sstevel@tonic-gate 	CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
24030Sstevel@tonic-gate 	    str);
24040Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
24050Sstevel@tonic-gate 	while (zone->zone_status < status) {
24060Sstevel@tonic-gate 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
24070Sstevel@tonic-gate 		cv_wait(&zone->zone_cv, &zone_status_lock);
24080Sstevel@tonic-gate 		CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
24090Sstevel@tonic-gate 	}
24100Sstevel@tonic-gate 	/*
24110Sstevel@tonic-gate 	 * zone_status_lock is implicitly released by the following.
24120Sstevel@tonic-gate 	 */
24130Sstevel@tonic-gate 	CALLB_CPR_EXIT(&cprinfo);
24140Sstevel@tonic-gate }
24150Sstevel@tonic-gate 
24160Sstevel@tonic-gate /*
24170Sstevel@tonic-gate  * Block until zone enters requested state or signal is received.  Return (0)
24180Sstevel@tonic-gate  * if signaled, non-zero otherwise.
24190Sstevel@tonic-gate  */
24200Sstevel@tonic-gate int
24210Sstevel@tonic-gate zone_status_wait_sig(zone_t *zone, zone_status_t status)
24220Sstevel@tonic-gate {
24230Sstevel@tonic-gate 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
24240Sstevel@tonic-gate 
24250Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
24260Sstevel@tonic-gate 	while (zone->zone_status < status) {
24270Sstevel@tonic-gate 		if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
24280Sstevel@tonic-gate 			mutex_exit(&zone_status_lock);
24290Sstevel@tonic-gate 			return (0);
24300Sstevel@tonic-gate 		}
24310Sstevel@tonic-gate 	}
24320Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
24330Sstevel@tonic-gate 	return (1);
24340Sstevel@tonic-gate }
24350Sstevel@tonic-gate 
24360Sstevel@tonic-gate /*
24370Sstevel@tonic-gate  * Block until the zone enters the requested state or the timeout expires,
24380Sstevel@tonic-gate  * whichever happens first.  Return (-1) if operation timed out, time remaining
24390Sstevel@tonic-gate  * otherwise.
24400Sstevel@tonic-gate  */
24410Sstevel@tonic-gate clock_t
24420Sstevel@tonic-gate zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
24430Sstevel@tonic-gate {
24440Sstevel@tonic-gate 	clock_t timeleft = 0;
24450Sstevel@tonic-gate 
24460Sstevel@tonic-gate 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
24470Sstevel@tonic-gate 
24480Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
24490Sstevel@tonic-gate 	while (zone->zone_status < status && timeleft != -1) {
24500Sstevel@tonic-gate 		timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
24510Sstevel@tonic-gate 	}
24520Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
24530Sstevel@tonic-gate 	return (timeleft);
24540Sstevel@tonic-gate }
24550Sstevel@tonic-gate 
24560Sstevel@tonic-gate /*
24570Sstevel@tonic-gate  * Block until the zone enters the requested state, the current process is
24580Sstevel@tonic-gate  * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
24590Sstevel@tonic-gate  * operation timed out, 0 if signaled, time remaining otherwise.
24600Sstevel@tonic-gate  */
24610Sstevel@tonic-gate clock_t
24620Sstevel@tonic-gate zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
24630Sstevel@tonic-gate {
246411066Srafael.vanoni@sun.com 	clock_t timeleft = tim - ddi_get_lbolt();
24650Sstevel@tonic-gate 
24660Sstevel@tonic-gate 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
24670Sstevel@tonic-gate 
24680Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
24690Sstevel@tonic-gate 	while (zone->zone_status < status) {
24700Sstevel@tonic-gate 		timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
24710Sstevel@tonic-gate 		    tim);
24720Sstevel@tonic-gate 		if (timeleft <= 0)
24730Sstevel@tonic-gate 			break;
24740Sstevel@tonic-gate 	}
24750Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
24760Sstevel@tonic-gate 	return (timeleft);
24770Sstevel@tonic-gate }
24780Sstevel@tonic-gate 
/*
 * A zone carries two reference counts: zone_cred_ref counts references
 * held by credential structures, and zone_ref counts everything else.
 * Keeping them apart lets a zone be rebooted while cred references are
 * still outstanding, which matters because certain drivers cache dblks
 * (and therefore, implicitly, creds).  Zone destruction waits for zone_ref
 * to drop to 0 (actually 1), but not for zone_cred_ref.  The zone structure
 * itself is freed later, once zone_cred_ref reaches 0; in the meantime
 * nothing other than the zone id and privilege set should be accessed,
 * since the zone is "dead".
 *
 * zone_wait_for_cred is a debugging flag.  Setting it to a non-zero value
 * forces halt/reboot to block until zone_cred_ref has dropped to 0 as
 * well, which can help flush out sources of cached creds that may be less
 * innocuous than the driver case.
 */

int zone_wait_for_cred = 0;
24970Sstevel@tonic-gate 
24980Sstevel@tonic-gate static void
24990Sstevel@tonic-gate zone_hold_locked(zone_t *z)
25000Sstevel@tonic-gate {
25010Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&z->zone_lock));
25020Sstevel@tonic-gate 	z->zone_ref++;
25030Sstevel@tonic-gate 	ASSERT(z->zone_ref != 0);
25040Sstevel@tonic-gate }
25050Sstevel@tonic-gate 
25060Sstevel@tonic-gate void
25070Sstevel@tonic-gate zone_hold(zone_t *z)
25080Sstevel@tonic-gate {
25090Sstevel@tonic-gate 	mutex_enter(&z->zone_lock);
25100Sstevel@tonic-gate 	zone_hold_locked(z);
25110Sstevel@tonic-gate 	mutex_exit(&z->zone_lock);
25120Sstevel@tonic-gate }
25130Sstevel@tonic-gate 
/*
 * A zone is ready to be destroyed once its non-cred reference count has
 * dropped to 1 and, if zone_wait_for_cred is set, its cred reference count
 * has dropped to 0 as well.
 */
#define	ZONE_IS_UNREF(zone)	((zone)->zone_ref == 1 && \
	    (zone_wait_for_cred == 0 || (zone)->zone_cred_ref == 0))
25210Sstevel@tonic-gate 
25220Sstevel@tonic-gate void
25230Sstevel@tonic-gate zone_rele(zone_t *z)
25240Sstevel@tonic-gate {
25250Sstevel@tonic-gate 	boolean_t wakeup;
25260Sstevel@tonic-gate 
25270Sstevel@tonic-gate 	mutex_enter(&z->zone_lock);
25280Sstevel@tonic-gate 	ASSERT(z->zone_ref != 0);
25290Sstevel@tonic-gate 	z->zone_ref--;
25300Sstevel@tonic-gate 	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
25310Sstevel@tonic-gate 		/* no more refs, free the structure */
25320Sstevel@tonic-gate 		mutex_exit(&z->zone_lock);
25330Sstevel@tonic-gate 		zone_free(z);
25340Sstevel@tonic-gate 		return;
25350Sstevel@tonic-gate 	}
25360Sstevel@tonic-gate 	/* signal zone_destroy so the zone can finish halting */
25370Sstevel@tonic-gate 	wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
25380Sstevel@tonic-gate 	mutex_exit(&z->zone_lock);
25390Sstevel@tonic-gate 
25400Sstevel@tonic-gate 	if (wakeup) {
25410Sstevel@tonic-gate 		/*
25420Sstevel@tonic-gate 		 * Grabbing zonehash_lock here effectively synchronizes with
25430Sstevel@tonic-gate 		 * zone_destroy() to avoid missed signals.
25440Sstevel@tonic-gate 		 */
25450Sstevel@tonic-gate 		mutex_enter(&zonehash_lock);
25460Sstevel@tonic-gate 		cv_broadcast(&zone_destroy_cv);
25470Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
25480Sstevel@tonic-gate 	}
25490Sstevel@tonic-gate }
25500Sstevel@tonic-gate 
25510Sstevel@tonic-gate void
25520Sstevel@tonic-gate zone_cred_hold(zone_t *z)
25530Sstevel@tonic-gate {
25540Sstevel@tonic-gate 	mutex_enter(&z->zone_lock);
25550Sstevel@tonic-gate 	z->zone_cred_ref++;
25560Sstevel@tonic-gate 	ASSERT(z->zone_cred_ref != 0);
25570Sstevel@tonic-gate 	mutex_exit(&z->zone_lock);
25580Sstevel@tonic-gate }
25590Sstevel@tonic-gate 
25600Sstevel@tonic-gate void
25610Sstevel@tonic-gate zone_cred_rele(zone_t *z)
25620Sstevel@tonic-gate {
25630Sstevel@tonic-gate 	boolean_t wakeup;
25640Sstevel@tonic-gate 
25650Sstevel@tonic-gate 	mutex_enter(&z->zone_lock);
25660Sstevel@tonic-gate 	ASSERT(z->zone_cred_ref != 0);
25670Sstevel@tonic-gate 	z->zone_cred_ref--;
25680Sstevel@tonic-gate 	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
25690Sstevel@tonic-gate 		/* no more refs, free the structure */
25700Sstevel@tonic-gate 		mutex_exit(&z->zone_lock);
25710Sstevel@tonic-gate 		zone_free(z);
25720Sstevel@tonic-gate 		return;
25730Sstevel@tonic-gate 	}
25740Sstevel@tonic-gate 	/*
25750Sstevel@tonic-gate 	 * If zone_destroy is waiting for the cred references to drain
25760Sstevel@tonic-gate 	 * out, and they have, signal it.
25770Sstevel@tonic-gate 	 */
25780Sstevel@tonic-gate 	wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
25790Sstevel@tonic-gate 	    zone_status_get(z) >= ZONE_IS_DEAD);
25800Sstevel@tonic-gate 	mutex_exit(&z->zone_lock);
25810Sstevel@tonic-gate 
25820Sstevel@tonic-gate 	if (wakeup) {
25830Sstevel@tonic-gate 		/*
25840Sstevel@tonic-gate 		 * Grabbing zonehash_lock here effectively synchronizes with
25850Sstevel@tonic-gate 		 * zone_destroy() to avoid missed signals.
25860Sstevel@tonic-gate 		 */
25870Sstevel@tonic-gate 		mutex_enter(&zonehash_lock);
25880Sstevel@tonic-gate 		cv_broadcast(&zone_destroy_cv);
25890Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
25900Sstevel@tonic-gate 	}
25910Sstevel@tonic-gate }
25920Sstevel@tonic-gate 
25930Sstevel@tonic-gate void
25940Sstevel@tonic-gate zone_task_hold(zone_t *z)
25950Sstevel@tonic-gate {
25960Sstevel@tonic-gate 	mutex_enter(&z->zone_lock);
25970Sstevel@tonic-gate 	z->zone_ntasks++;
25980Sstevel@tonic-gate 	ASSERT(z->zone_ntasks != 0);
25990Sstevel@tonic-gate 	mutex_exit(&z->zone_lock);
26000Sstevel@tonic-gate }
26010Sstevel@tonic-gate 
26020Sstevel@tonic-gate void
26030Sstevel@tonic-gate zone_task_rele(zone_t *zone)
26040Sstevel@tonic-gate {
26050Sstevel@tonic-gate 	uint_t refcnt;
26060Sstevel@tonic-gate 
26070Sstevel@tonic-gate 	mutex_enter(&zone->zone_lock);
26080Sstevel@tonic-gate 	ASSERT(zone->zone_ntasks != 0);
26090Sstevel@tonic-gate 	refcnt = --zone->zone_ntasks;
26100Sstevel@tonic-gate 	if (refcnt > 1)	{	/* Common case */
26110Sstevel@tonic-gate 		mutex_exit(&zone->zone_lock);
26120Sstevel@tonic-gate 		return;
26130Sstevel@tonic-gate 	}
26140Sstevel@tonic-gate 	zone_hold_locked(zone);	/* so we can use the zone_t later */
26150Sstevel@tonic-gate 	mutex_exit(&zone->zone_lock);
26160Sstevel@tonic-gate 	if (refcnt == 1) {
26170Sstevel@tonic-gate 		/*
26180Sstevel@tonic-gate 		 * See if the zone is shutting down.
26190Sstevel@tonic-gate 		 */
26200Sstevel@tonic-gate 		mutex_enter(&zone_status_lock);
26210Sstevel@tonic-gate 		if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
26220Sstevel@tonic-gate 			goto out;
26230Sstevel@tonic-gate 		}
26240Sstevel@tonic-gate 
26250Sstevel@tonic-gate 		/*
26260Sstevel@tonic-gate 		 * Make sure the ntasks didn't change since we
26270Sstevel@tonic-gate 		 * dropped zone_lock.
26280Sstevel@tonic-gate 		 */
26290Sstevel@tonic-gate 		mutex_enter(&zone->zone_lock);
26300Sstevel@tonic-gate 		if (refcnt != zone->zone_ntasks) {
26310Sstevel@tonic-gate 			mutex_exit(&zone->zone_lock);
26320Sstevel@tonic-gate 			goto out;
26330Sstevel@tonic-gate 		}
26340Sstevel@tonic-gate 		mutex_exit(&zone->zone_lock);
26350Sstevel@tonic-gate 
26360Sstevel@tonic-gate 		/*
26370Sstevel@tonic-gate 		 * No more user processes in the zone.  The zone is empty.
26380Sstevel@tonic-gate 		 */
26390Sstevel@tonic-gate 		zone_status_set(zone, ZONE_IS_EMPTY);
26400Sstevel@tonic-gate 		goto out;
26410Sstevel@tonic-gate 	}
26420Sstevel@tonic-gate 
26430Sstevel@tonic-gate 	ASSERT(refcnt == 0);
26440Sstevel@tonic-gate 	/*
26450Sstevel@tonic-gate 	 * zsched has exited; the zone is dead.
26460Sstevel@tonic-gate 	 */
26470Sstevel@tonic-gate 	zone->zone_zsched = NULL;		/* paranoia */
26480Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
26490Sstevel@tonic-gate 	zone_status_set(zone, ZONE_IS_DEAD);
26500Sstevel@tonic-gate out:
26510Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
26520Sstevel@tonic-gate 	zone_rele(zone);
26530Sstevel@tonic-gate }
26540Sstevel@tonic-gate 
26550Sstevel@tonic-gate zoneid_t
26560Sstevel@tonic-gate getzoneid(void)
26570Sstevel@tonic-gate {
26580Sstevel@tonic-gate 	return (curproc->p_zone->zone_id);
26590Sstevel@tonic-gate }
26600Sstevel@tonic-gate 
26610Sstevel@tonic-gate /*
26620Sstevel@tonic-gate  * Internal versions of zone_find_by_*().  These don't zone_hold() or
26630Sstevel@tonic-gate  * check the validity of a zone's state.
26640Sstevel@tonic-gate  */
26650Sstevel@tonic-gate static zone_t *
26660Sstevel@tonic-gate zone_find_all_by_id(zoneid_t zoneid)
26670Sstevel@tonic-gate {
26680Sstevel@tonic-gate 	mod_hash_val_t hv;
26690Sstevel@tonic-gate 	zone_t *zone = NULL;
26700Sstevel@tonic-gate 
26710Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&zonehash_lock));
26720Sstevel@tonic-gate 
26730Sstevel@tonic-gate 	if (mod_hash_find(zonehashbyid,
26740Sstevel@tonic-gate 	    (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
26750Sstevel@tonic-gate 		zone = (zone_t *)hv;
26760Sstevel@tonic-gate 	return (zone);
26770Sstevel@tonic-gate }
26780Sstevel@tonic-gate 
26790Sstevel@tonic-gate static zone_t *
26801676Sjpk zone_find_all_by_label(const ts_label_t *label)
26811676Sjpk {
26821676Sjpk 	mod_hash_val_t hv;
26831676Sjpk 	zone_t *zone = NULL;
26841676Sjpk 
26851676Sjpk 	ASSERT(MUTEX_HELD(&zonehash_lock));
26861676Sjpk 
26871676Sjpk 	/*
26881676Sjpk 	 * zonehashbylabel is not maintained for unlabeled systems
26891676Sjpk 	 */
26901676Sjpk 	if (!is_system_labeled())
26911676Sjpk 		return (NULL);
26921676Sjpk 	if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
26931676Sjpk 		zone = (zone_t *)hv;
26941676Sjpk 	return (zone);
26951676Sjpk }
26961676Sjpk 
26971676Sjpk static zone_t *
26980Sstevel@tonic-gate zone_find_all_by_name(char *name)
26990Sstevel@tonic-gate {
27000Sstevel@tonic-gate 	mod_hash_val_t hv;
27010Sstevel@tonic-gate 	zone_t *zone = NULL;
27020Sstevel@tonic-gate 
27030Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&zonehash_lock));
27040Sstevel@tonic-gate 
27050Sstevel@tonic-gate 	if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
27060Sstevel@tonic-gate 		zone = (zone_t *)hv;
27070Sstevel@tonic-gate 	return (zone);
27080Sstevel@tonic-gate }
27090Sstevel@tonic-gate 
27100Sstevel@tonic-gate /*
27110Sstevel@tonic-gate  * Public interface for looking up a zone by zoneid.  Only returns the zone if
27120Sstevel@tonic-gate  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
27130Sstevel@tonic-gate  * Caller must call zone_rele() once it is done with the zone.
27140Sstevel@tonic-gate  *
27150Sstevel@tonic-gate  * The zone may begin the zone_destroy() sequence immediately after this
27160Sstevel@tonic-gate  * function returns, but may be safely used until zone_rele() is called.
27170Sstevel@tonic-gate  */
27180Sstevel@tonic-gate zone_t *
27190Sstevel@tonic-gate zone_find_by_id(zoneid_t zoneid)
27200Sstevel@tonic-gate {
27210Sstevel@tonic-gate 	zone_t *zone;
27220Sstevel@tonic-gate 	zone_status_t status;
27230Sstevel@tonic-gate 
27240Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
27250Sstevel@tonic-gate 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
27260Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
27270Sstevel@tonic-gate 		return (NULL);
27280Sstevel@tonic-gate 	}
27290Sstevel@tonic-gate 	status = zone_status_get(zone);
27300Sstevel@tonic-gate 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
27310Sstevel@tonic-gate 		/*
27320Sstevel@tonic-gate 		 * For all practical purposes the zone doesn't exist.
27330Sstevel@tonic-gate 		 */
27340Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
27350Sstevel@tonic-gate 		return (NULL);
27360Sstevel@tonic-gate 	}
27370Sstevel@tonic-gate 	zone_hold(zone);
27380Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
27390Sstevel@tonic-gate 	return (zone);
27400Sstevel@tonic-gate }
27410Sstevel@tonic-gate 
27420Sstevel@tonic-gate /*
27431676Sjpk  * Similar to zone_find_by_id, but using zone label as the key.
27441676Sjpk  */
27451676Sjpk zone_t *
27461676Sjpk zone_find_by_label(const ts_label_t *label)
27471676Sjpk {
27481676Sjpk 	zone_t *zone;
27492110Srica 	zone_status_t status;
27501676Sjpk 
27511676Sjpk 	mutex_enter(&zonehash_lock);
27521676Sjpk 	if ((zone = zone_find_all_by_label(label)) == NULL) {
27531676Sjpk 		mutex_exit(&zonehash_lock);
27541676Sjpk 		return (NULL);
27551676Sjpk 	}
27562110Srica 
27572110Srica 	status = zone_status_get(zone);
27582110Srica 	if (status > ZONE_IS_DOWN) {
27591676Sjpk 		/*
27601676Sjpk 		 * For all practical purposes the zone doesn't exist.
27611676Sjpk 		 */
27622110Srica 		mutex_exit(&zonehash_lock);
27632110Srica 		return (NULL);
27641676Sjpk 	}
27652110Srica 	zone_hold(zone);
27661676Sjpk 	mutex_exit(&zonehash_lock);
27671676Sjpk 	return (zone);
27681676Sjpk }
27691676Sjpk 
27701676Sjpk /*
27710Sstevel@tonic-gate  * Similar to zone_find_by_id, but using zone name as the key.
27720Sstevel@tonic-gate  */
27730Sstevel@tonic-gate zone_t *
27740Sstevel@tonic-gate zone_find_by_name(char *name)
27750Sstevel@tonic-gate {
27760Sstevel@tonic-gate 	zone_t *zone;
27770Sstevel@tonic-gate 	zone_status_t status;
27780Sstevel@tonic-gate 
27790Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
27800Sstevel@tonic-gate 	if ((zone = zone_find_all_by_name(name)) == NULL) {
27810Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
27820Sstevel@tonic-gate 		return (NULL);
27830Sstevel@tonic-gate 	}
27840Sstevel@tonic-gate 	status = zone_status_get(zone);
27850Sstevel@tonic-gate 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
27860Sstevel@tonic-gate 		/*
27870Sstevel@tonic-gate 		 * For all practical purposes the zone doesn't exist.
27880Sstevel@tonic-gate 		 */
27890Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
27900Sstevel@tonic-gate 		return (NULL);
27910Sstevel@tonic-gate 	}
27920Sstevel@tonic-gate 	zone_hold(zone);
27930Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
27940Sstevel@tonic-gate 	return (zone);
27950Sstevel@tonic-gate }
27960Sstevel@tonic-gate 
27970Sstevel@tonic-gate /*
27980Sstevel@tonic-gate  * Similar to zone_find_by_id(), using the path as a key.  For instance,
27990Sstevel@tonic-gate  * if there is a zone "foo" rooted at /foo/root, and the path argument
28000Sstevel@tonic-gate  * is "/foo/root/proc", it will return the held zone_t corresponding to
28010Sstevel@tonic-gate  * zone "foo".
28020Sstevel@tonic-gate  *
28030Sstevel@tonic-gate  * zone_find_by_path() always returns a non-NULL value, since at the
28040Sstevel@tonic-gate  * very least every path will be contained in the global zone.
28050Sstevel@tonic-gate  *
28060Sstevel@tonic-gate  * As with the other zone_find_by_*() functions, the caller is
28070Sstevel@tonic-gate  * responsible for zone_rele()ing the return value of this function.
28080Sstevel@tonic-gate  */
28090Sstevel@tonic-gate zone_t *
28100Sstevel@tonic-gate zone_find_by_path(const char *path)
28110Sstevel@tonic-gate {
28120Sstevel@tonic-gate 	zone_t *zone;
28130Sstevel@tonic-gate 	zone_t *zret = NULL;
28140Sstevel@tonic-gate 	zone_status_t status;
28150Sstevel@tonic-gate 
28160Sstevel@tonic-gate 	if (path == NULL) {
28170Sstevel@tonic-gate 		/*
28180Sstevel@tonic-gate 		 * Call from rootconf().
28190Sstevel@tonic-gate 		 */
28200Sstevel@tonic-gate 		zone_hold(global_zone);
28210Sstevel@tonic-gate 		return (global_zone);
28220Sstevel@tonic-gate 	}
28230Sstevel@tonic-gate 	ASSERT(*path == '/');
28240Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
28250Sstevel@tonic-gate 	for (zone = list_head(&zone_active); zone != NULL;
28260Sstevel@tonic-gate 	    zone = list_next(&zone_active, zone)) {
28270Sstevel@tonic-gate 		if (ZONE_PATH_VISIBLE(path, zone))
28280Sstevel@tonic-gate 			zret = zone;
28290Sstevel@tonic-gate 	}
28300Sstevel@tonic-gate 	ASSERT(zret != NULL);
28310Sstevel@tonic-gate 	status = zone_status_get(zret);
28320Sstevel@tonic-gate 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
28330Sstevel@tonic-gate 		/*
28340Sstevel@tonic-gate 		 * Zone practically doesn't exist.
28350Sstevel@tonic-gate 		 */
28360Sstevel@tonic-gate 		zret = global_zone;
28370Sstevel@tonic-gate 	}
28380Sstevel@tonic-gate 	zone_hold(zret);
28390Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
28400Sstevel@tonic-gate 	return (zret);
28410Sstevel@tonic-gate }
28420Sstevel@tonic-gate 
28430Sstevel@tonic-gate /*
28440Sstevel@tonic-gate  * Get the number of cpus visible to this zone.  The system-wide global
28450Sstevel@tonic-gate  * 'ncpus' is returned if pools are disabled, the caller is in the
28460Sstevel@tonic-gate  * global zone, or a NULL zone argument is passed in.
28470Sstevel@tonic-gate  */
28480Sstevel@tonic-gate int
28490Sstevel@tonic-gate zone_ncpus_get(zone_t *zone)
28500Sstevel@tonic-gate {
28510Sstevel@tonic-gate 	int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
28520Sstevel@tonic-gate 
28530Sstevel@tonic-gate 	return (myncpus != 0 ? myncpus : ncpus);
28540Sstevel@tonic-gate }
28550Sstevel@tonic-gate 
28560Sstevel@tonic-gate /*
28570Sstevel@tonic-gate  * Get the number of online cpus visible to this zone.  The system-wide
28580Sstevel@tonic-gate  * global 'ncpus_online' is returned if pools are disabled, the caller
28590Sstevel@tonic-gate  * is in the global zone, or a NULL zone argument is passed in.
28600Sstevel@tonic-gate  */
28610Sstevel@tonic-gate int
28620Sstevel@tonic-gate zone_ncpus_online_get(zone_t *zone)
28630Sstevel@tonic-gate {
28640Sstevel@tonic-gate 	int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
28650Sstevel@tonic-gate 
28660Sstevel@tonic-gate 	return (myncpus_online != 0 ? myncpus_online : ncpus_online);
28670Sstevel@tonic-gate }
28680Sstevel@tonic-gate 
28690Sstevel@tonic-gate /*
28700Sstevel@tonic-gate  * Return the pool to which the zone is currently bound.
28710Sstevel@tonic-gate  */
28720Sstevel@tonic-gate pool_t *
28730Sstevel@tonic-gate zone_pool_get(zone_t *zone)
28740Sstevel@tonic-gate {
28750Sstevel@tonic-gate 	ASSERT(pool_lock_held());
28760Sstevel@tonic-gate 
28770Sstevel@tonic-gate 	return (zone->zone_pool);
28780Sstevel@tonic-gate }
28790Sstevel@tonic-gate 
28800Sstevel@tonic-gate /*
28810Sstevel@tonic-gate  * Set the zone's pool pointer and update the zone's visibility to match
28820Sstevel@tonic-gate  * the resources in the new pool.
28830Sstevel@tonic-gate  */
28840Sstevel@tonic-gate void
28850Sstevel@tonic-gate zone_pool_set(zone_t *zone, pool_t *pool)
28860Sstevel@tonic-gate {
28870Sstevel@tonic-gate 	ASSERT(pool_lock_held());
28880Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock));
28890Sstevel@tonic-gate 
28900Sstevel@tonic-gate 	zone->zone_pool = pool;
28910Sstevel@tonic-gate 	zone_pset_set(zone, pool->pool_pset->pset_id);
28920Sstevel@tonic-gate }
28930Sstevel@tonic-gate 
28940Sstevel@tonic-gate /*
28950Sstevel@tonic-gate  * Return the cached value of the id of the processor set to which the
28960Sstevel@tonic-gate  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
28970Sstevel@tonic-gate  * facility is disabled.
28980Sstevel@tonic-gate  */
28990Sstevel@tonic-gate psetid_t
29000Sstevel@tonic-gate zone_pset_get(zone_t *zone)
29010Sstevel@tonic-gate {
29020Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock));
29030Sstevel@tonic-gate 
29040Sstevel@tonic-gate 	return (zone->zone_psetid);
29050Sstevel@tonic-gate }
29060Sstevel@tonic-gate 
29070Sstevel@tonic-gate /*
29080Sstevel@tonic-gate  * Set the cached value of the id of the processor set to which the zone
29090Sstevel@tonic-gate  * is currently bound.  Also update the zone's visibility to match the
29100Sstevel@tonic-gate  * resources in the new processor set.
29110Sstevel@tonic-gate  */
29120Sstevel@tonic-gate void
29130Sstevel@tonic-gate zone_pset_set(zone_t *zone, psetid_t newpsetid)
29140Sstevel@tonic-gate {
29150Sstevel@tonic-gate 	psetid_t oldpsetid;
29160Sstevel@tonic-gate 
29170Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock));
29180Sstevel@tonic-gate 	oldpsetid = zone_pset_get(zone);
29190Sstevel@tonic-gate 
29200Sstevel@tonic-gate 	if (oldpsetid == newpsetid)
29210Sstevel@tonic-gate 		return;
29220Sstevel@tonic-gate 	/*
29230Sstevel@tonic-gate 	 * Global zone sees all.
29240Sstevel@tonic-gate 	 */
29250Sstevel@tonic-gate 	if (zone != global_zone) {
29260Sstevel@tonic-gate 		zone->zone_psetid = newpsetid;
29270Sstevel@tonic-gate 		if (newpsetid != ZONE_PS_INVAL)
29280Sstevel@tonic-gate 			pool_pset_visibility_add(newpsetid, zone);
29290Sstevel@tonic-gate 		if (oldpsetid != ZONE_PS_INVAL)
29300Sstevel@tonic-gate 			pool_pset_visibility_remove(oldpsetid, zone);
29310Sstevel@tonic-gate 	}
29320Sstevel@tonic-gate 	/*
29330Sstevel@tonic-gate 	 * Disabling pools, so we should start using the global values
29340Sstevel@tonic-gate 	 * for ncpus and ncpus_online.
29350Sstevel@tonic-gate 	 */
29360Sstevel@tonic-gate 	if (newpsetid == ZONE_PS_INVAL) {
29370Sstevel@tonic-gate 		zone->zone_ncpus = 0;
29380Sstevel@tonic-gate 		zone->zone_ncpus_online = 0;
29390Sstevel@tonic-gate 	}
29400Sstevel@tonic-gate }
29410Sstevel@tonic-gate 
29420Sstevel@tonic-gate /*
29430Sstevel@tonic-gate  * Walk the list of active zones and issue the provided callback for
29440Sstevel@tonic-gate  * each of them.
29450Sstevel@tonic-gate  *
29460Sstevel@tonic-gate  * Caller must not be holding any locks that may be acquired under
29470Sstevel@tonic-gate  * zonehash_lock.  See comment at the beginning of the file for a list of
29480Sstevel@tonic-gate  * common locks and their interactions with zones.
29490Sstevel@tonic-gate  */
29500Sstevel@tonic-gate int
29510Sstevel@tonic-gate zone_walk(int (*cb)(zone_t *, void *), void *data)
29520Sstevel@tonic-gate {
29530Sstevel@tonic-gate 	zone_t *zone;
29540Sstevel@tonic-gate 	int ret = 0;
29550Sstevel@tonic-gate 	zone_status_t status;
29560Sstevel@tonic-gate 
29570Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
29580Sstevel@tonic-gate 	for (zone = list_head(&zone_active); zone != NULL;
29590Sstevel@tonic-gate 	    zone = list_next(&zone_active, zone)) {
29600Sstevel@tonic-gate 		/*
29610Sstevel@tonic-gate 		 * Skip zones that shouldn't be externally visible.
29620Sstevel@tonic-gate 		 */
29630Sstevel@tonic-gate 		status = zone_status_get(zone);
29640Sstevel@tonic-gate 		if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
29650Sstevel@tonic-gate 			continue;
29660Sstevel@tonic-gate 		/*
29670Sstevel@tonic-gate 		 * Bail immediately if any callback invocation returns a
29680Sstevel@tonic-gate 		 * non-zero value.
29690Sstevel@tonic-gate 		 */
29700Sstevel@tonic-gate 		ret = (*cb)(zone, data);
29710Sstevel@tonic-gate 		if (ret != 0)
29720Sstevel@tonic-gate 			break;
29730Sstevel@tonic-gate 	}
29740Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
29750Sstevel@tonic-gate 	return (ret);
29760Sstevel@tonic-gate }
29770Sstevel@tonic-gate 
29780Sstevel@tonic-gate static int
29790Sstevel@tonic-gate zone_set_root(zone_t *zone, const char *upath)
29800Sstevel@tonic-gate {
29810Sstevel@tonic-gate 	vnode_t *vp;
29820Sstevel@tonic-gate 	int trycount;
29830Sstevel@tonic-gate 	int error = 0;
29840Sstevel@tonic-gate 	char *path;
29850Sstevel@tonic-gate 	struct pathname upn, pn;
29860Sstevel@tonic-gate 	size_t pathlen;
29870Sstevel@tonic-gate 
29880Sstevel@tonic-gate 	if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
29890Sstevel@tonic-gate 		return (error);
29900Sstevel@tonic-gate 
29910Sstevel@tonic-gate 	pn_alloc(&pn);
29920Sstevel@tonic-gate 
29930Sstevel@tonic-gate 	/* prevent infinite loop */
29940Sstevel@tonic-gate 	trycount = 10;
29950Sstevel@tonic-gate 	for (;;) {
29960Sstevel@tonic-gate 		if (--trycount <= 0) {
29970Sstevel@tonic-gate 			error = ESTALE;
29980Sstevel@tonic-gate 			goto out;
29990Sstevel@tonic-gate 		}
30000Sstevel@tonic-gate 
30010Sstevel@tonic-gate 		if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
30020Sstevel@tonic-gate 			/*
30030Sstevel@tonic-gate 			 * VOP_ACCESS() may cover 'vp' with a new
30040Sstevel@tonic-gate 			 * filesystem, if 'vp' is an autoFS vnode.
30050Sstevel@tonic-gate 			 * Get the new 'vp' if so.
30060Sstevel@tonic-gate 			 */
30075331Samw 			if ((error =
30085331Samw 			    VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
30094417Seh208807 			    (!vn_ismntpt(vp) ||
30100Sstevel@tonic-gate 			    (error = traverse(&vp)) == 0)) {
30110Sstevel@tonic-gate 				pathlen = pn.pn_pathlen + 2;
30120Sstevel@tonic-gate 				path = kmem_alloc(pathlen, KM_SLEEP);
30130Sstevel@tonic-gate 				(void) strncpy(path, pn.pn_path,
30140Sstevel@tonic-gate 				    pn.pn_pathlen + 1);
30150Sstevel@tonic-gate 				path[pathlen - 2] = '/';
30160Sstevel@tonic-gate 				path[pathlen - 1] = '\0';
30170Sstevel@tonic-gate 				pn_free(&pn);
30180Sstevel@tonic-gate 				pn_free(&upn);
30190Sstevel@tonic-gate 
30200Sstevel@tonic-gate 				/* Success! */
30210Sstevel@tonic-gate 				break;
30220Sstevel@tonic-gate 			}
30230Sstevel@tonic-gate 			VN_RELE(vp);
30240Sstevel@tonic-gate 		}
30250Sstevel@tonic-gate 		if (error != ESTALE)
30260Sstevel@tonic-gate 			goto out;
30270Sstevel@tonic-gate 	}
30280Sstevel@tonic-gate 
30290Sstevel@tonic-gate 	ASSERT(error == 0);
30300Sstevel@tonic-gate 	zone->zone_rootvp = vp;		/* we hold a reference to vp */
30310Sstevel@tonic-gate 	zone->zone_rootpath = path;
30320Sstevel@tonic-gate 	zone->zone_rootpathlen = pathlen;
30331769Scarlsonj 	if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
30341769Scarlsonj 		zone->zone_flags |= ZF_IS_SCRATCH;
30350Sstevel@tonic-gate 	return (0);
30360Sstevel@tonic-gate 
30370Sstevel@tonic-gate out:
30380Sstevel@tonic-gate 	pn_free(&pn);
30390Sstevel@tonic-gate 	pn_free(&upn);
30400Sstevel@tonic-gate 	return (error);
30410Sstevel@tonic-gate }
30420Sstevel@tonic-gate 
/*
 * File-local ASCII alphanumeric test: non-zero for '0'-'9', 'a'-'z' and
 * 'A'-'Z'.  The argument is evaluated more than once, so it must not have
 * side effects.
 */
#define	isalnum(c)	(('0' <= (c) && (c) <= '9') || \
			('a' <= (c) && (c) <= 'z') || \
			('A' <= (c) && (c) <= 'Z'))
30460Sstevel@tonic-gate 
30470Sstevel@tonic-gate static int
30480Sstevel@tonic-gate zone_set_name(zone_t *zone, const char *uname)
30490Sstevel@tonic-gate {
30500Sstevel@tonic-gate 	char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
30510Sstevel@tonic-gate 	size_t len;
30520Sstevel@tonic-gate 	int i, err;
30530Sstevel@tonic-gate 
30540Sstevel@tonic-gate 	if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
30550Sstevel@tonic-gate 		kmem_free(kname, ZONENAME_MAX);
30560Sstevel@tonic-gate 		return (err);	/* EFAULT or ENAMETOOLONG */
30570Sstevel@tonic-gate 	}
30580Sstevel@tonic-gate 
30590Sstevel@tonic-gate 	/* must be less than ZONENAME_MAX */
30600Sstevel@tonic-gate 	if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
30610Sstevel@tonic-gate 		kmem_free(kname, ZONENAME_MAX);
30620Sstevel@tonic-gate 		return (EINVAL);
30630Sstevel@tonic-gate 	}
30640Sstevel@tonic-gate 
30650Sstevel@tonic-gate 	/*
30660Sstevel@tonic-gate 	 * Name must start with an alphanumeric and must contain only
30670Sstevel@tonic-gate 	 * alphanumerics, '-', '_' and '.'.
30680Sstevel@tonic-gate 	 */
30690Sstevel@tonic-gate 	if (!isalnum(kname[0])) {
30700Sstevel@tonic-gate 		kmem_free(kname, ZONENAME_MAX);
30710Sstevel@tonic-gate 		return (EINVAL);
30720Sstevel@tonic-gate 	}
30730Sstevel@tonic-gate 	for (i = 1; i < len - 1; i++) {
30740Sstevel@tonic-gate 		if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
30750Sstevel@tonic-gate 		    kname[i] != '.') {
30760Sstevel@tonic-gate 			kmem_free(kname, ZONENAME_MAX);
30770Sstevel@tonic-gate 			return (EINVAL);
30780Sstevel@tonic-gate 		}
30790Sstevel@tonic-gate 	}
30800Sstevel@tonic-gate 
30810Sstevel@tonic-gate 	zone->zone_name = kname;
30820Sstevel@tonic-gate 	return (0);
30830Sstevel@tonic-gate }
30840Sstevel@tonic-gate 
30850Sstevel@tonic-gate /*
30868662SJordan.Vaughan@Sun.com  * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
30878662SJordan.Vaughan@Sun.com  * is NULL or it points to a zone with no hostid emulation, then the machine's
30888662SJordan.Vaughan@Sun.com  * hostid (i.e., the global zone's hostid) is returned.  This function returns
30898662SJordan.Vaughan@Sun.com  * zero if neither the zone nor the host machine (global zone) have hostids.  It
30908662SJordan.Vaughan@Sun.com  * returns HW_INVALID_HOSTID if the function attempts to return the machine's
30918662SJordan.Vaughan@Sun.com  * hostid and the machine's hostid is invalid.
30928662SJordan.Vaughan@Sun.com  */
30938662SJordan.Vaughan@Sun.com uint32_t
30948662SJordan.Vaughan@Sun.com zone_get_hostid(zone_t *zonep)
30958662SJordan.Vaughan@Sun.com {
30968662SJordan.Vaughan@Sun.com 	unsigned long machine_hostid;
30978662SJordan.Vaughan@Sun.com 
30988662SJordan.Vaughan@Sun.com 	if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
30998662SJordan.Vaughan@Sun.com 		if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
31008662SJordan.Vaughan@Sun.com 			return (HW_INVALID_HOSTID);
31018662SJordan.Vaughan@Sun.com 		return ((uint32_t)machine_hostid);
31028662SJordan.Vaughan@Sun.com 	}
31038662SJordan.Vaughan@Sun.com 	return (zonep->zone_hostid);
31048662SJordan.Vaughan@Sun.com }
31058662SJordan.Vaughan@Sun.com 
31068662SJordan.Vaughan@Sun.com /*
31070Sstevel@tonic-gate  * Similar to thread_create(), but makes sure the thread is in the appropriate
31080Sstevel@tonic-gate  * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
31090Sstevel@tonic-gate  */
31100Sstevel@tonic-gate /*ARGSUSED*/
31110Sstevel@tonic-gate kthread_t *
31120Sstevel@tonic-gate zthread_create(
31130Sstevel@tonic-gate     caddr_t stk,
31140Sstevel@tonic-gate     size_t stksize,
31150Sstevel@tonic-gate     void (*proc)(),
31160Sstevel@tonic-gate     void *arg,
31170Sstevel@tonic-gate     size_t len,
31180Sstevel@tonic-gate     pri_t pri)
31190Sstevel@tonic-gate {
31200Sstevel@tonic-gate 	kthread_t *t;
31210Sstevel@tonic-gate 	zone_t *zone = curproc->p_zone;
31220Sstevel@tonic-gate 	proc_t *pp = zone->zone_zsched;
31230Sstevel@tonic-gate 
31240Sstevel@tonic-gate 	zone_hold(zone);	/* Reference to be dropped when thread exits */
31250Sstevel@tonic-gate 
31260Sstevel@tonic-gate 	/*
31270Sstevel@tonic-gate 	 * No-one should be trying to create threads if the zone is shutting
31280Sstevel@tonic-gate 	 * down and there aren't any kernel threads around.  See comment
31290Sstevel@tonic-gate 	 * in zthread_exit().
31300Sstevel@tonic-gate 	 */
31310Sstevel@tonic-gate 	ASSERT(!(zone->zone_kthreads == NULL &&
31320Sstevel@tonic-gate 	    zone_status_get(zone) >= ZONE_IS_EMPTY));
31330Sstevel@tonic-gate 	/*
31340Sstevel@tonic-gate 	 * Create a thread, but don't let it run until we've finished setting
31350Sstevel@tonic-gate 	 * things up.
31360Sstevel@tonic-gate 	 */
31370Sstevel@tonic-gate 	t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
31380Sstevel@tonic-gate 	ASSERT(t->t_forw == NULL);
31390Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
31400Sstevel@tonic-gate 	if (zone->zone_kthreads == NULL) {
31410Sstevel@tonic-gate 		t->t_forw = t->t_back = t;
31420Sstevel@tonic-gate 	} else {
31430Sstevel@tonic-gate 		kthread_t *tx = zone->zone_kthreads;
31440Sstevel@tonic-gate 
31450Sstevel@tonic-gate 		t->t_forw = tx;
31460Sstevel@tonic-gate 		t->t_back = tx->t_back;
31470Sstevel@tonic-gate 		tx->t_back->t_forw = t;
31480Sstevel@tonic-gate 		tx->t_back = t;
31490Sstevel@tonic-gate 	}
31500Sstevel@tonic-gate 	zone->zone_kthreads = t;
31510Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
31520Sstevel@tonic-gate 
31530Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
31540Sstevel@tonic-gate 	t->t_proc_flag |= TP_ZTHREAD;
31550Sstevel@tonic-gate 	project_rele(t->t_proj);
31560Sstevel@tonic-gate 	t->t_proj = project_hold(pp->p_task->tk_proj);
31570Sstevel@tonic-gate 
31580Sstevel@tonic-gate 	/*
31590Sstevel@tonic-gate 	 * Setup complete, let it run.
31600Sstevel@tonic-gate 	 */
31610Sstevel@tonic-gate 	thread_lock(t);
31620Sstevel@tonic-gate 	t->t_schedflag |= TS_ALLSTART;
31630Sstevel@tonic-gate 	setrun_locked(t);
31640Sstevel@tonic-gate 	thread_unlock(t);
31650Sstevel@tonic-gate 
31660Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
31670Sstevel@tonic-gate 
31680Sstevel@tonic-gate 	return (t);
31690Sstevel@tonic-gate }
31700Sstevel@tonic-gate 
/*
 * Similar to thread_exit().  Must be called by threads created via
 * zthread_create().
 */
31750Sstevel@tonic-gate void
31760Sstevel@tonic-gate zthread_exit(void)
31770Sstevel@tonic-gate {
31780Sstevel@tonic-gate 	kthread_t *t = curthread;
31790Sstevel@tonic-gate 	proc_t *pp = curproc;
31800Sstevel@tonic-gate 	zone_t *zone = pp->p_zone;
31810Sstevel@tonic-gate 
31820Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
31830Sstevel@tonic-gate 
31840Sstevel@tonic-gate 	/*
31850Sstevel@tonic-gate 	 * Reparent to p0
31860Sstevel@tonic-gate 	 */
31871075Sjosephb 	kpreempt_disable();
31880Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
31890Sstevel@tonic-gate 	t->t_proc_flag &= ~TP_ZTHREAD;
31900Sstevel@tonic-gate 	t->t_procp = &p0;
31910Sstevel@tonic-gate 	hat_thread_exit(t);
31920Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
31931075Sjosephb 	kpreempt_enable();
31940Sstevel@tonic-gate 
31950Sstevel@tonic-gate 	if (t->t_back == t) {
31960Sstevel@tonic-gate 		ASSERT(t->t_forw == t);
31970Sstevel@tonic-gate 		/*
31980Sstevel@tonic-gate 		 * If the zone is empty, once the thread count
31990Sstevel@tonic-gate 		 * goes to zero no further kernel threads can be
32000Sstevel@tonic-gate 		 * created.  This is because if the creator is a process
32010Sstevel@tonic-gate 		 * in the zone, then it must have exited before the zone
32020Sstevel@tonic-gate 		 * state could be set to ZONE_IS_EMPTY.
32030Sstevel@tonic-gate 		 * Otherwise, if the creator is a kernel thread in the
32040Sstevel@tonic-gate 		 * zone, the thread count is non-zero.
32050Sstevel@tonic-gate 		 *
32060Sstevel@tonic-gate 		 * This really means that non-zone kernel threads should
32070Sstevel@tonic-gate 		 * not create zone kernel threads.
32080Sstevel@tonic-gate 		 */
32090Sstevel@tonic-gate 		zone->zone_kthreads = NULL;
32100Sstevel@tonic-gate 		if (zone_status_get(zone) == ZONE_IS_EMPTY) {
32110Sstevel@tonic-gate 			zone_status_set(zone, ZONE_IS_DOWN);
32123792Sakolb 			/*
32133792Sakolb 			 * Remove any CPU caps on this zone.
32143792Sakolb 			 */
32153792Sakolb 			cpucaps_zone_remove(zone);
32160Sstevel@tonic-gate 		}
32170Sstevel@tonic-gate 	} else {
32180Sstevel@tonic-gate 		t->t_forw->t_back = t->t_back;
32190Sstevel@tonic-gate 		t->t_back->t_forw = t->t_forw;
32200Sstevel@tonic-gate 		if (zone->zone_kthreads == t)
32210Sstevel@tonic-gate 			zone->zone_kthreads = t->t_forw;
32220Sstevel@tonic-gate 	}
32230Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
32240Sstevel@tonic-gate 	zone_rele(zone);
32250Sstevel@tonic-gate 	thread_exit();
32260Sstevel@tonic-gate 	/* NOTREACHED */
32270Sstevel@tonic-gate }
32280Sstevel@tonic-gate 
32290Sstevel@tonic-gate static void
32300Sstevel@tonic-gate zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
32310Sstevel@tonic-gate {
32320Sstevel@tonic-gate 	vnode_t *oldvp;
32330Sstevel@tonic-gate 
32340Sstevel@tonic-gate 	/* we're going to hold a reference here to the directory */
32350Sstevel@tonic-gate 	VN_HOLD(vp);
32360Sstevel@tonic-gate 
323711861SMarek.Pospisil@Sun.COM 	/* update abs cwd/root path see c2/audit.c */
323811861SMarek.Pospisil@Sun.COM 	if (AU_AUDITING())
32390Sstevel@tonic-gate 		audit_chdirec(vp, vpp);
32400Sstevel@tonic-gate 
32410Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
32420Sstevel@tonic-gate 	oldvp = *vpp;
32430Sstevel@tonic-gate 	*vpp = vp;
32440Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
32450Sstevel@tonic-gate 	if (oldvp != NULL)
32460Sstevel@tonic-gate 		VN_RELE(oldvp);
32470Sstevel@tonic-gate }
32480Sstevel@tonic-gate 
32490Sstevel@tonic-gate /*
32500Sstevel@tonic-gate  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
32510Sstevel@tonic-gate  */
32520Sstevel@tonic-gate static int
32530Sstevel@tonic-gate nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
32540Sstevel@tonic-gate {
32550Sstevel@tonic-gate 	nvpair_t *nvp = NULL;
32560Sstevel@tonic-gate 	boolean_t priv_set = B_FALSE;
32570Sstevel@tonic-gate 	boolean_t limit_set = B_FALSE;
32580Sstevel@tonic-gate 	boolean_t action_set = B_FALSE;
32590Sstevel@tonic-gate 
32600Sstevel@tonic-gate 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
32610Sstevel@tonic-gate 		const char *name;
32620Sstevel@tonic-gate 		uint64_t ui64;
32630Sstevel@tonic-gate 
32640Sstevel@tonic-gate 		name = nvpair_name(nvp);
32650Sstevel@tonic-gate 		if (nvpair_type(nvp) != DATA_TYPE_UINT64)
32660Sstevel@tonic-gate 			return (EINVAL);
32670Sstevel@tonic-gate 		(void) nvpair_value_uint64(nvp, &ui64);
32680Sstevel@tonic-gate 		if (strcmp(name, "privilege") == 0) {
32690Sstevel@tonic-gate 			/*
32700Sstevel@tonic-gate 			 * Currently only privileged values are allowed, but
32710Sstevel@tonic-gate 			 * this may change in the future.
32720Sstevel@tonic-gate 			 */
32730Sstevel@tonic-gate 			if (ui64 != RCPRIV_PRIVILEGED)
32740Sstevel@tonic-gate 				return (EINVAL);
32750Sstevel@tonic-gate 			rv->rcv_privilege = ui64;
32760Sstevel@tonic-gate 			priv_set = B_TRUE;
32770Sstevel@tonic-gate 		} else if (strcmp(name, "limit") == 0) {
32780Sstevel@tonic-gate 			rv->rcv_value = ui64;
32790Sstevel@tonic-gate 			limit_set = B_TRUE;
32800Sstevel@tonic-gate 		} else if (strcmp(name, "action") == 0) {
32810Sstevel@tonic-gate 			if (ui64 != RCTL_LOCAL_NOACTION &&
32820Sstevel@tonic-gate 			    ui64 != RCTL_LOCAL_DENY)
32830Sstevel@tonic-gate 				return (EINVAL);
32840Sstevel@tonic-gate 			rv->rcv_flagaction = ui64;
32850Sstevel@tonic-gate 			action_set = B_TRUE;
32860Sstevel@tonic-gate 		} else {
32870Sstevel@tonic-gate 			return (EINVAL);
32880Sstevel@tonic-gate 		}
32890Sstevel@tonic-gate 	}
32900Sstevel@tonic-gate 
32910Sstevel@tonic-gate 	if (!(priv_set && limit_set && action_set))
32920Sstevel@tonic-gate 		return (EINVAL);
32930Sstevel@tonic-gate 	rv->rcv_action_signal = 0;
32940Sstevel@tonic-gate 	rv->rcv_action_recipient = NULL;
32950Sstevel@tonic-gate 	rv->rcv_action_recip_pid = -1;
32960Sstevel@tonic-gate 	rv->rcv_firing_time = 0;
32970Sstevel@tonic-gate 
32980Sstevel@tonic-gate 	return (0);
32990Sstevel@tonic-gate }
33000Sstevel@tonic-gate 
33012267Sdp /*
33022267Sdp  * Non-global zone version of start_init.
33032267Sdp  */
33040Sstevel@tonic-gate void
33052267Sdp zone_start_init(void)
33060Sstevel@tonic-gate {
33070Sstevel@tonic-gate 	proc_t *p = ttoproc(curthread);
33082712Snn35248 	zone_t *z = p->p_zone;
33092267Sdp 
33102267Sdp 	ASSERT(!INGLOBALZONE(curproc));
33110Sstevel@tonic-gate 
33120Sstevel@tonic-gate 	/*
33132712Snn35248 	 * For all purposes (ZONE_ATTR_INITPID and restart_init),
33142712Snn35248 	 * storing just the pid of init is sufficient.
33152712Snn35248 	 */
33162712Snn35248 	z->zone_proc_initpid = p->p_pid;
33172712Snn35248 
33182712Snn35248 	/*
33192267Sdp 	 * We maintain zone_boot_err so that we can return the cause of the
33202267Sdp 	 * failure back to the caller of the zone_boot syscall.
33210Sstevel@tonic-gate 	 */
33222267Sdp 	p->p_zone->zone_boot_err = start_init_common();
33230Sstevel@tonic-gate 
33248364SJordan.Vaughan@Sun.com 	/*
33258364SJordan.Vaughan@Sun.com 	 * We will prevent booting zones from becoming running zones if the
33268364SJordan.Vaughan@Sun.com 	 * global zone is shutting down.
33278364SJordan.Vaughan@Sun.com 	 */
33280Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
33298364SJordan.Vaughan@Sun.com 	if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
33308364SJordan.Vaughan@Sun.com 	    ZONE_IS_SHUTTING_DOWN) {
33310Sstevel@tonic-gate 		/*
33320Sstevel@tonic-gate 		 * Make sure we are still in the booting state-- we could have
33330Sstevel@tonic-gate 		 * raced and already be shutting down, or even further along.
33340Sstevel@tonic-gate 		 */
33353792Sakolb 		if (zone_status_get(z) == ZONE_IS_BOOTING) {
33362712Snn35248 			zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
33373792Sakolb 		}
33380Sstevel@tonic-gate 		mutex_exit(&zone_status_lock);
33390Sstevel@tonic-gate 		/* It's gone bad, dispose of the process */
33402712Snn35248 		if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3341390Sraf 			mutex_enter(&p->p_lock);
3342390Sraf 			ASSERT(p->p_flag & SEXITLWPS);
33430Sstevel@tonic-gate 			lwp_exit();
33440Sstevel@tonic-gate 		}
33450Sstevel@tonic-gate 	} else {
33462712Snn35248 		if (zone_status_get(z) == ZONE_IS_BOOTING)
33472712Snn35248 			zone_status_set(z, ZONE_IS_RUNNING);
33480Sstevel@tonic-gate 		mutex_exit(&zone_status_lock);
33490Sstevel@tonic-gate 		/* cause the process to return to userland. */
33500Sstevel@tonic-gate 		lwp_rtt();
33510Sstevel@tonic-gate 	}
33520Sstevel@tonic-gate }
33530Sstevel@tonic-gate 
33540Sstevel@tonic-gate struct zsched_arg {
33550Sstevel@tonic-gate 	zone_t *zone;
33560Sstevel@tonic-gate 	nvlist_t *nvlist;
33570Sstevel@tonic-gate };
33580Sstevel@tonic-gate 
33590Sstevel@tonic-gate /*
33600Sstevel@tonic-gate  * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
33610Sstevel@tonic-gate  * anything to do with scheduling, but rather with the fact that
33620Sstevel@tonic-gate  * per-zone kernel threads are parented to zsched, just like regular
33630Sstevel@tonic-gate  * kernel threads are parented to sched (p0).
33640Sstevel@tonic-gate  *
33650Sstevel@tonic-gate  * zsched is also responsible for launching init for the zone.
33660Sstevel@tonic-gate  */
33670Sstevel@tonic-gate static void
33680Sstevel@tonic-gate zsched(void *arg)
33690Sstevel@tonic-gate {
33700Sstevel@tonic-gate 	struct zsched_arg *za = arg;
33710Sstevel@tonic-gate 	proc_t *pp = curproc;
33720Sstevel@tonic-gate 	proc_t *initp = proc_init;
33730Sstevel@tonic-gate 	zone_t *zone = za->zone;
33740Sstevel@tonic-gate 	cred_t *cr, *oldcred;
33750Sstevel@tonic-gate 	rctl_set_t *set;
33760Sstevel@tonic-gate 	rctl_alloc_gp_t *gp;
33770Sstevel@tonic-gate 	contract_t *ct = NULL;
33780Sstevel@tonic-gate 	task_t *tk, *oldtk;
33790Sstevel@tonic-gate 	rctl_entity_p_t e;
33800Sstevel@tonic-gate 	kproject_t *pj;
33810Sstevel@tonic-gate 
33820Sstevel@tonic-gate 	nvlist_t *nvl = za->nvlist;
33830Sstevel@tonic-gate 	nvpair_t *nvp = NULL;
33840Sstevel@tonic-gate 
33853446Smrj 	bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
33863446Smrj 	bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
33873446Smrj 	PTOU(pp)->u_argc = 0;
33883446Smrj 	PTOU(pp)->u_argv = NULL;
33893446Smrj 	PTOU(pp)->u_envp = NULL;
33900Sstevel@tonic-gate 	closeall(P_FINFO(pp));
33910Sstevel@tonic-gate 
33920Sstevel@tonic-gate 	/*
33930Sstevel@tonic-gate 	 * We are this zone's "zsched" process.  As the zone isn't generally
33940Sstevel@tonic-gate 	 * visible yet we don't need to grab any locks before initializing its
33950Sstevel@tonic-gate 	 * zone_proc pointer.
33960Sstevel@tonic-gate 	 */
33970Sstevel@tonic-gate 	zone_hold(zone);  /* this hold is released by zone_destroy() */
33980Sstevel@tonic-gate 	zone->zone_zsched = pp;
33990Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
34000Sstevel@tonic-gate 	pp->p_zone = zone;
34010Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
34020Sstevel@tonic-gate 
34030Sstevel@tonic-gate 	/*
34040Sstevel@tonic-gate 	 * Disassociate process from its 'parent'; parent ourselves to init
34050Sstevel@tonic-gate 	 * (pid 1) and change other values as needed.
34060Sstevel@tonic-gate 	 */
34070Sstevel@tonic-gate 	sess_create();
34080Sstevel@tonic-gate 
34090Sstevel@tonic-gate 	mutex_enter(&pidlock);
34100Sstevel@tonic-gate 	proc_detach(pp);
34110Sstevel@tonic-gate 	pp->p_ppid = 1;
34120Sstevel@tonic-gate 	pp->p_flag |= SZONETOP;
34130Sstevel@tonic-gate 	pp->p_ancpid = 1;
34140Sstevel@tonic-gate 	pp->p_parent = initp;
34150Sstevel@tonic-gate 	pp->p_psibling = NULL;
34160Sstevel@tonic-gate 	if (initp->p_child)
34170Sstevel@tonic-gate 		initp->p_child->p_psibling = pp;
34180Sstevel@tonic-gate 	pp->p_sibling = initp->p_child;
34190Sstevel@tonic-gate 	initp->p_child = pp;
34200Sstevel@tonic-gate 
34210Sstevel@tonic-gate 	/* Decrement what newproc() incremented. */
34220Sstevel@tonic-gate 	upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
34230Sstevel@tonic-gate 	/*
34240Sstevel@tonic-gate 	 * Our credentials are about to become kcred-like, so we don't care
34250Sstevel@tonic-gate 	 * about the caller's ruid.
34260Sstevel@tonic-gate 	 */
34270Sstevel@tonic-gate 	upcount_inc(crgetruid(kcred), zone->zone_id);
34280Sstevel@tonic-gate 	mutex_exit(&pidlock);
34290Sstevel@tonic-gate 
34300Sstevel@tonic-gate 	/*
3431*12725SMenno.Lageman@Sun.COM 	 * getting out of global zone, so decrement lwp and process counts
34320Sstevel@tonic-gate 	 */
34330Sstevel@tonic-gate 	pj = pp->p_task->tk_proj;
34340Sstevel@tonic-gate 	mutex_enter(&global_zone->zone_nlwps_lock);
34350Sstevel@tonic-gate 	pj->kpj_nlwps -= pp->p_lwpcnt;
34360Sstevel@tonic-gate 	global_zone->zone_nlwps -= pp->p_lwpcnt;
3437*12725SMenno.Lageman@Sun.COM 	pj->kpj_nprocs--;
3438*12725SMenno.Lageman@Sun.COM 	global_zone->zone_nprocs--;
34390Sstevel@tonic-gate 	mutex_exit(&global_zone->zone_nlwps_lock);
34400Sstevel@tonic-gate 
34410Sstevel@tonic-gate 	/*
34422768Ssl108498 	 * Decrement locked memory counts on old zone and project.
34432768Ssl108498 	 */
34443247Sgjelinek 	mutex_enter(&global_zone->zone_mem_lock);
34452768Ssl108498 	global_zone->zone_locked_mem -= pp->p_locked_mem;
34462768Ssl108498 	pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
34473247Sgjelinek 	mutex_exit(&global_zone->zone_mem_lock);
34482768Ssl108498 
34492768Ssl108498 	/*
34500Sstevel@tonic-gate 	 * Create and join a new task in project '0' of this zone.
34510Sstevel@tonic-gate 	 *
34520Sstevel@tonic-gate 	 * We don't need to call holdlwps() since we know we're the only lwp in
34530Sstevel@tonic-gate 	 * this process.
34540Sstevel@tonic-gate 	 *
34550Sstevel@tonic-gate 	 * task_join() returns with p_lock held.
34560Sstevel@tonic-gate 	 */
34570Sstevel@tonic-gate 	tk = task_create(0, zone);
34580Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
34590Sstevel@tonic-gate 	oldtk = task_join(tk, 0);
34602768Ssl108498 
34612768Ssl108498 	pj = pp->p_task->tk_proj;
34622768Ssl108498 
34633247Sgjelinek 	mutex_enter(&zone->zone_mem_lock);
34642768Ssl108498 	zone->zone_locked_mem += pp->p_locked_mem;
34652768Ssl108498 	pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
34663247Sgjelinek 	mutex_exit(&zone->zone_mem_lock);
34670Sstevel@tonic-gate 
34680Sstevel@tonic-gate 	/*
3469*12725SMenno.Lageman@Sun.COM 	 * add lwp and process counts to zsched's zone, and increment
3470*12725SMenno.Lageman@Sun.COM 	 * project's task and process count due to the task created in
3471*12725SMenno.Lageman@Sun.COM 	 * the above task_create.
34720Sstevel@tonic-gate 	 */
34730Sstevel@tonic-gate 	mutex_enter(&zone->zone_nlwps_lock);
34740Sstevel@tonic-gate 	pj->kpj_nlwps += pp->p_lwpcnt;
34750Sstevel@tonic-gate 	pj->kpj_ntasks += 1;
34760Sstevel@tonic-gate 	zone->zone_nlwps += pp->p_lwpcnt;
3477*12725SMenno.Lageman@Sun.COM 	pj->kpj_nprocs++;
3478*12725SMenno.Lageman@Sun.COM 	zone->zone_nprocs++;
34790Sstevel@tonic-gate 	mutex_exit(&zone->zone_nlwps_lock);
34800Sstevel@tonic-gate 
34812768Ssl108498 	mutex_exit(&curproc->p_lock);
34822768Ssl108498 	mutex_exit(&cpu_lock);
34832768Ssl108498 	task_rele(oldtk);
34842768Ssl108498 
34850Sstevel@tonic-gate 	/*
34860Sstevel@tonic-gate 	 * The process was created by a process in the global zone, hence the
34870Sstevel@tonic-gate 	 * credentials are wrong.  We might as well have kcred-ish credentials.
34880Sstevel@tonic-gate 	 */
34890Sstevel@tonic-gate 	cr = zone->zone_kcred;
34900Sstevel@tonic-gate 	crhold(cr);
34910Sstevel@tonic-gate 	mutex_enter(&pp->p_crlock);
34920Sstevel@tonic-gate 	oldcred = pp->p_cred;
34930Sstevel@tonic-gate 	pp->p_cred = cr;
34940Sstevel@tonic-gate 	mutex_exit(&pp->p_crlock);
34950Sstevel@tonic-gate 	crfree(oldcred);
34960Sstevel@tonic-gate 
34970Sstevel@tonic-gate 	/*
34980Sstevel@tonic-gate 	 * Hold credentials again (for thread)
34990Sstevel@tonic-gate 	 */
35000Sstevel@tonic-gate 	crhold(cr);
35010Sstevel@tonic-gate 
35020Sstevel@tonic-gate 	/*
35030Sstevel@tonic-gate 	 * p_lwpcnt can't change since this is a kernel process.
35040Sstevel@tonic-gate 	 */
35050Sstevel@tonic-gate 	crset(pp, cr);
35060Sstevel@tonic-gate 
35070Sstevel@tonic-gate 	/*
35080Sstevel@tonic-gate 	 * Chroot
35090Sstevel@tonic-gate 	 */
35100Sstevel@tonic-gate 	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
35110Sstevel@tonic-gate 	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
35120Sstevel@tonic-gate 
35130Sstevel@tonic-gate 	/*
35140Sstevel@tonic-gate 	 * Initialize zone's rctl set.
35150Sstevel@tonic-gate 	 */
35160Sstevel@tonic-gate 	set = rctl_set_create();
35170Sstevel@tonic-gate 	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
35180Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
35190Sstevel@tonic-gate 	e.rcep_p.zone = zone;
35200Sstevel@tonic-gate 	e.rcep_t = RCENTITY_ZONE;
35210Sstevel@tonic-gate 	zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
35220Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
35230Sstevel@tonic-gate 	rctl_prealloc_destroy(gp);
35240Sstevel@tonic-gate 
35250Sstevel@tonic-gate 	/*
35260Sstevel@tonic-gate 	 * Apply the rctls passed in to zone_create().  This is basically a list
35270Sstevel@tonic-gate 	 * assignment: all of the old values are removed and the new ones
35280Sstevel@tonic-gate 	 * inserted.  That is, if an empty list is passed in, all values are
35290Sstevel@tonic-gate 	 * removed.
35300Sstevel@tonic-gate 	 */
35310Sstevel@tonic-gate 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
35320Sstevel@tonic-gate 		rctl_dict_entry_t *rde;
35330Sstevel@tonic-gate 		rctl_hndl_t hndl;
35340Sstevel@tonic-gate 		char *name;
35350Sstevel@tonic-gate 		nvlist_t **nvlarray;
35360Sstevel@tonic-gate 		uint_t i, nelem;
35370Sstevel@tonic-gate 		int error;	/* For ASSERT()s */
35380Sstevel@tonic-gate 
35390Sstevel@tonic-gate 		name = nvpair_name(nvp);
35400Sstevel@tonic-gate 		hndl = rctl_hndl_lookup(name);
35410Sstevel@tonic-gate 		ASSERT(hndl != -1);
35420Sstevel@tonic-gate 		rde = rctl_dict_lookup_hndl(hndl);
35430Sstevel@tonic-gate 		ASSERT(rde != NULL);
35440Sstevel@tonic-gate 
35450Sstevel@tonic-gate 		for (; /* ever */; ) {
35460Sstevel@tonic-gate 			rctl_val_t oval;
35470Sstevel@tonic-gate 
35480Sstevel@tonic-gate 			mutex_enter(&pp->p_lock);
35490Sstevel@tonic-gate 			error = rctl_local_get(hndl, NULL, &oval, pp);
35500Sstevel@tonic-gate 			mutex_exit(&pp->p_lock);
35510Sstevel@tonic-gate 			ASSERT(error == 0);	/* Can't fail for RCTL_FIRST */
35520Sstevel@tonic-gate 			ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
35530Sstevel@tonic-gate 			if (oval.rcv_privilege == RCPRIV_SYSTEM)
35540Sstevel@tonic-gate 				break;
35550Sstevel@tonic-gate 			mutex_enter(&pp->p_lock);
35560Sstevel@tonic-gate 			error = rctl_local_delete(hndl, &oval, pp);
35570Sstevel@tonic-gate 			mutex_exit(&pp->p_lock);
35580Sstevel@tonic-gate 			ASSERT(error == 0);
35590Sstevel@tonic-gate 		}
35600Sstevel@tonic-gate 		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
35610Sstevel@tonic-gate 		ASSERT(error == 0);
35620Sstevel@tonic-gate 		for (i = 0; i < nelem; i++) {
35630Sstevel@tonic-gate 			rctl_val_t *nvalp;
35640Sstevel@tonic-gate 
35650Sstevel@tonic-gate 			nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
35660Sstevel@tonic-gate 			error = nvlist2rctlval(nvlarray[i], nvalp);
35670Sstevel@tonic-gate 			ASSERT(error == 0);
35680Sstevel@tonic-gate 			/*
35690Sstevel@tonic-gate 			 * rctl_local_insert can fail if the value being
35700Sstevel@tonic-gate 			 * inserted is a duplicate; this is OK.
35710Sstevel@tonic-gate 			 */
35720Sstevel@tonic-gate 			mutex_enter(&pp->p_lock);
35730Sstevel@tonic-gate 			if (rctl_local_insert(hndl, nvalp, pp) != 0)
35740Sstevel@tonic-gate 				kmem_cache_free(rctl_val_cache, nvalp);
35750Sstevel@tonic-gate 			mutex_exit(&pp->p_lock);
35760Sstevel@tonic-gate 		}
35770Sstevel@tonic-gate 	}
35780Sstevel@tonic-gate 	/*
35790Sstevel@tonic-gate 	 * Tell the world that we're done setting up.
35800Sstevel@tonic-gate 	 *
35815880Snordmark 	 * At this point we want to set the zone status to ZONE_IS_INITIALIZED
35820Sstevel@tonic-gate 	 * and atomically set the zone's processor set visibility.  Once
35830Sstevel@tonic-gate 	 * we drop pool_lock() this zone will automatically get updated
35840Sstevel@tonic-gate 	 * to reflect any future changes to the pools configuration.
35855880Snordmark 	 *
35865880Snordmark 	 * Note that after we drop the locks below (zonehash_lock in
35875880Snordmark 	 * particular) other operations such as a zone_getattr call can
35885880Snordmark 	 * now proceed and observe the zone. That is the reason for doing a
35895880Snordmark 	 * state transition to the INITIALIZED state.
35900Sstevel@tonic-gate 	 */
35910Sstevel@tonic-gate 	pool_lock();
35920Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
35930Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
35940Sstevel@tonic-gate 	zone_uniqid(zone);
35950Sstevel@tonic-gate 	zone_zsd_configure(zone);
35960Sstevel@tonic-gate 	if (pool_state == POOL_ENABLED)
35970Sstevel@tonic-gate 		zone_pset_set(zone, pool_default->pool_pset->pset_id);
35980Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
35990Sstevel@tonic-gate 	ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
36005880Snordmark 	zone_status_set(zone, ZONE_IS_INITIALIZED);
36010Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
36020Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
36030Sstevel@tonic-gate 	mutex_exit(&cpu_lock);
36040Sstevel@tonic-gate 	pool_unlock();
36050Sstevel@tonic-gate 
36065880Snordmark 	/* Now call the create callback for this key */
36075880Snordmark 	zsd_apply_all_keys(zsd_apply_create, zone);
36085880Snordmark 
36095880Snordmark 	/* The callbacks are complete. Mark ZONE_IS_READY */
36105880Snordmark 	mutex_enter(&zone_status_lock);
36115880Snordmark 	ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
36125880Snordmark 	zone_status_set(zone, ZONE_IS_READY);
36135880Snordmark 	mutex_exit(&zone_status_lock);
36145880Snordmark 
36150Sstevel@tonic-gate 	/*
36160Sstevel@tonic-gate 	 * Once we see the zone transition to the ZONE_IS_BOOTING state,
36170Sstevel@tonic-gate 	 * we launch init, and set the state to running.
36180Sstevel@tonic-gate 	 */
36190Sstevel@tonic-gate 	zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
36200Sstevel@tonic-gate 
36210Sstevel@tonic-gate 	if (zone_status_get(zone) == ZONE_IS_BOOTING) {
36220Sstevel@tonic-gate 		id_t cid;
36230Sstevel@tonic-gate 
36240Sstevel@tonic-gate 		/*
36250Sstevel@tonic-gate 		 * Ok, this is a little complicated.  We need to grab the
36260Sstevel@tonic-gate 		 * zone's pool's scheduling class ID; note that by now, we
36270Sstevel@tonic-gate 		 * are already bound to a pool if we need to be (zoneadmd
36280Sstevel@tonic-gate 		 * will have done that to us while we're in the READY
36290Sstevel@tonic-gate 		 * state).  *But* the scheduling class for the zone's 'init'
36300Sstevel@tonic-gate 		 * must be explicitly passed to newproc, which doesn't
36310Sstevel@tonic-gate 		 * respect pool bindings.
36320Sstevel@tonic-gate 		 *
36330Sstevel@tonic-gate 		 * We hold the pool_lock across the call to newproc() to
36340Sstevel@tonic-gate 		 * close the obvious race: the pool's scheduling class
36350Sstevel@tonic-gate 		 * could change before we manage to create the LWP with
36360Sstevel@tonic-gate 		 * classid 'cid'.
36370Sstevel@tonic-gate 		 */
36380Sstevel@tonic-gate 		pool_lock();
36393247Sgjelinek 		if (zone->zone_defaultcid > 0)
36403247Sgjelinek 			cid = zone->zone_defaultcid;
36413247Sgjelinek 		else
36423247Sgjelinek 			cid = pool_get_class(zone->zone_pool);
36430Sstevel@tonic-gate 		if (cid == -1)
36440Sstevel@tonic-gate 			cid = defaultcid;
36450Sstevel@tonic-gate 
36460Sstevel@tonic-gate 		/*
36470Sstevel@tonic-gate 		 * If this fails, zone_boot will ultimately fail.  The
36480Sstevel@tonic-gate 		 * state of the zone will be set to SHUTTING_DOWN-- userland
36490Sstevel@tonic-gate 		 * will have to tear down the zone, and fail, or try again.
36500Sstevel@tonic-gate 		 */
36512267Sdp 		if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
365211173SJonathan.Adams@Sun.COM 		    minclsyspri - 1, &ct, 0)) != 0) {
36530Sstevel@tonic-gate 			mutex_enter(&zone_status_lock);
36540Sstevel@tonic-gate 			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
36550Sstevel@tonic-gate 			mutex_exit(&zone_status_lock);
36560Sstevel@tonic-gate 		}
36570Sstevel@tonic-gate 		pool_unlock();
36580Sstevel@tonic-gate 	}
36590Sstevel@tonic-gate 
36600Sstevel@tonic-gate 	/*
36610Sstevel@tonic-gate 	 * Wait for zone_destroy() to be called.  This is what we spend
36620Sstevel@tonic-gate 	 * most of our life doing.
36630Sstevel@tonic-gate 	 */
36640Sstevel@tonic-gate 	zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
36650Sstevel@tonic-gate 
36660Sstevel@tonic-gate 	if (ct)
36670Sstevel@tonic-gate 		/*
36680Sstevel@tonic-gate 		 * At this point the process contract should be empty.
36690Sstevel@tonic-gate 		 * (Though if it isn't, it's not the end of the world.)
36700Sstevel@tonic-gate 		 */
36710Sstevel@tonic-gate 		VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
36720Sstevel@tonic-gate 
36730Sstevel@tonic-gate 	/*
36740Sstevel@tonic-gate 	 * Allow kcred to be freed when all referring processes
36750Sstevel@tonic-gate 	 * (including this one) go away.  We can't just do this in
36760Sstevel@tonic-gate 	 * zone_free because we need to wait for the zone_cred_ref to
36770Sstevel@tonic-gate 	 * drop to 0 before calling zone_free, and the existence of
36780Sstevel@tonic-gate 	 * zone_kcred will prevent that.  Thus, we call crfree here to
36790Sstevel@tonic-gate 	 * balance the crdup in zone_create.  The crhold calls earlier
36800Sstevel@tonic-gate 	 * in zsched will be dropped when the thread and process exit.
36810Sstevel@tonic-gate 	 */
36820Sstevel@tonic-gate 	crfree(zone->zone_kcred);
36830Sstevel@tonic-gate 	zone->zone_kcred = NULL;
36840Sstevel@tonic-gate 
36850Sstevel@tonic-gate 	exit(CLD_EXITED, 0);
36860Sstevel@tonic-gate }
36870Sstevel@tonic-gate 
36880Sstevel@tonic-gate /*
36890Sstevel@tonic-gate  * Helper function to determine if there are any submounts of the
36900Sstevel@tonic-gate  * provided path.  Used to make sure the zone doesn't "inherit" any
36910Sstevel@tonic-gate  * mounts from before it is created.
36920Sstevel@tonic-gate  */
36930Sstevel@tonic-gate static uint_t
36940Sstevel@tonic-gate zone_mount_count(const char *rootpath)
36950Sstevel@tonic-gate {
36960Sstevel@tonic-gate 	vfs_t *vfsp;
36970Sstevel@tonic-gate 	uint_t count = 0;
36980Sstevel@tonic-gate 	size_t rootpathlen = strlen(rootpath);
36990Sstevel@tonic-gate 
37000Sstevel@tonic-gate 	/*
37010Sstevel@tonic-gate 	 * Holding zonehash_lock prevents race conditions with
37020Sstevel@tonic-gate 	 * vfs_list_add()/vfs_list_remove() since we serialize with
37030Sstevel@tonic-gate 	 * zone_find_by_path().
37040Sstevel@tonic-gate 	 */
37050Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&zonehash_lock));
37060Sstevel@tonic-gate 	/*
37070Sstevel@tonic-gate 	 * The rootpath must end with a '/'
37080Sstevel@tonic-gate 	 */
37090Sstevel@tonic-gate 	ASSERT(rootpath[rootpathlen - 1] == '/');
37100Sstevel@tonic-gate 
37110Sstevel@tonic-gate 	/*
37120Sstevel@tonic-gate 	 * This intentionally does not count the rootpath itself if that
37130Sstevel@tonic-gate 	 * happens to be a mount point.
37140Sstevel@tonic-gate 	 */
37150Sstevel@tonic-gate 	vfs_list_read_lock();
37160Sstevel@tonic-gate 	vfsp = rootvfs;
37170Sstevel@tonic-gate 	do {
37180Sstevel@tonic-gate 		if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
37190Sstevel@tonic-gate 		    rootpathlen) == 0)
37200Sstevel@tonic-gate 			count++;
37210Sstevel@tonic-gate 		vfsp = vfsp->vfs_next;
37220Sstevel@tonic-gate 	} while (vfsp != rootvfs);
37230Sstevel@tonic-gate 	vfs_list_unlock();
37240Sstevel@tonic-gate 	return (count);
37250Sstevel@tonic-gate }
37260Sstevel@tonic-gate 
37270Sstevel@tonic-gate /*
37280Sstevel@tonic-gate  * Helper function to make sure that a zone created on 'rootpath'
37290Sstevel@tonic-gate  * wouldn't end up containing other zones' rootpaths.
37300Sstevel@tonic-gate  */
37310Sstevel@tonic-gate static boolean_t
37320Sstevel@tonic-gate zone_is_nested(const char *rootpath)
37330Sstevel@tonic-gate {
37340Sstevel@tonic-gate 	zone_t *zone;
37350Sstevel@tonic-gate 	size_t rootpathlen = strlen(rootpath);
37360Sstevel@tonic-gate 	size_t len;
37370Sstevel@tonic-gate 
37380Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&zonehash_lock));
37390Sstevel@tonic-gate 
37408799SDhanaraj.M@Sun.COM 	/*
37418799SDhanaraj.M@Sun.COM 	 * zone_set_root() appended '/' and '\0' at the end of rootpath
37428799SDhanaraj.M@Sun.COM 	 */
37438799SDhanaraj.M@Sun.COM 	if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
37448799SDhanaraj.M@Sun.COM 	    (rootpath[1] == '/') && (rootpath[2] == '\0'))
37458799SDhanaraj.M@Sun.COM 		return (B_TRUE);
37468799SDhanaraj.M@Sun.COM 
37470Sstevel@tonic-gate 	for (zone = list_head(&zone_active); zone != NULL;
37480Sstevel@tonic-gate 	    zone = list_next(&zone_active, zone)) {
37490Sstevel@tonic-gate 		if (zone == global_zone)
37500Sstevel@tonic-gate 			continue;
37510Sstevel@tonic-gate 		len = strlen(zone->zone_rootpath);
37520Sstevel@tonic-gate 		if (strncmp(rootpath, zone->zone_rootpath,
37530Sstevel@tonic-gate 		    MIN(rootpathlen, len)) == 0)
37540Sstevel@tonic-gate 			return (B_TRUE);
37550Sstevel@tonic-gate 	}
37560Sstevel@tonic-gate 	return (B_FALSE);
37570Sstevel@tonic-gate }
37580Sstevel@tonic-gate 
37590Sstevel@tonic-gate static int
3760813Sdp zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
3761813Sdp     size_t zone_privssz)
37620Sstevel@tonic-gate {
37630Sstevel@tonic-gate 	priv_set_t *privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
37640Sstevel@tonic-gate 
3765813Sdp 	if (zone_privssz < sizeof (priv_set_t))
3766813Sdp 		return (set_errno(ENOMEM));
3767813Sdp 
37680Sstevel@tonic-gate 	if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
37690Sstevel@tonic-gate 		kmem_free(privs, sizeof (priv_set_t));
37700Sstevel@tonic-gate 		return (EFAULT);
37710Sstevel@tonic-gate 	}
37720Sstevel@tonic-gate 
37730Sstevel@tonic-gate 	zone->zone_privset = privs;
37740Sstevel@tonic-gate 	return (0);
37750Sstevel@tonic-gate }
37760Sstevel@tonic-gate 
37770Sstevel@tonic-gate /*
37780Sstevel@tonic-gate  * We make creative use of nvlists to pass in rctls from userland.  The list is
37790Sstevel@tonic-gate  * a list of the following structures:
37800Sstevel@tonic-gate  *
37810Sstevel@tonic-gate  * (name = rctl_name, value = nvpair_list_array)
37820Sstevel@tonic-gate  *
37830Sstevel@tonic-gate  * Where each element of the nvpair_list_array is of the form:
37840Sstevel@tonic-gate  *
37850Sstevel@tonic-gate  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
37860Sstevel@tonic-gate  * 	(name = "limit", value = uint64_t),
37870Sstevel@tonic-gate  * 	(name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
37880Sstevel@tonic-gate  */
37890Sstevel@tonic-gate static int
37900Sstevel@tonic-gate parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
37910Sstevel@tonic-gate {
37920Sstevel@tonic-gate 	nvpair_t *nvp = NULL;
37930Sstevel@tonic-gate 	nvlist_t *nvl = NULL;
37940Sstevel@tonic-gate 	char *kbuf;
37950Sstevel@tonic-gate 	int error;
37960Sstevel@tonic-gate 	rctl_val_t rv;
37970Sstevel@tonic-gate 
37980Sstevel@tonic-gate 	*nvlp = NULL;
37990Sstevel@tonic-gate 
38000Sstevel@tonic-gate 	if (buflen == 0)
38010Sstevel@tonic-gate 		return (0);
38020Sstevel@tonic-gate 
38030Sstevel@tonic-gate 	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
38040Sstevel@tonic-gate 		return (ENOMEM);
38050Sstevel@tonic-gate 	if (copyin(ubuf, kbuf, buflen)) {
38060Sstevel@tonic-gate 		error = EFAULT;
38070Sstevel@tonic-gate 		goto out;
38080Sstevel@tonic-gate 	}
38090Sstevel@tonic-gate 	if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
38100Sstevel@tonic-gate 		/*
38110Sstevel@tonic-gate 		 * nvl may have been allocated/free'd, but the value set to
38120Sstevel@tonic-gate 		 * non-NULL, so we reset it here.
38130Sstevel@tonic-gate 		 */
38140Sstevel@tonic-gate 		nvl = NULL;
38150Sstevel@tonic-gate 		error = EINVAL;
38160Sstevel@tonic-gate 		goto out;
38170Sstevel@tonic-gate 	}
38180Sstevel@tonic-gate 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
38190Sstevel@tonic-gate 		rctl_dict_entry_t *rde;
38200Sstevel@tonic-gate 		rctl_hndl_t hndl;
38210Sstevel@tonic-gate 		nvlist_t **nvlarray;
38220Sstevel@tonic-gate 		uint_t i, nelem;
38230Sstevel@tonic-gate 		char *name;
38240Sstevel@tonic-gate 
38250Sstevel@tonic-gate 		error = EINVAL;
38260Sstevel@tonic-gate 		name = nvpair_name(nvp);
38270Sstevel@tonic-gate 		if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
38280Sstevel@tonic-gate 		    != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
38290Sstevel@tonic-gate 			goto out;
38300Sstevel@tonic-gate 		}
38310Sstevel@tonic-gate 		if ((hndl = rctl_hndl_lookup(name)) == -1) {
38320Sstevel@tonic-gate 			goto out;
38330Sstevel@tonic-gate 		}
38340Sstevel@tonic-gate 		rde = rctl_dict_lookup_hndl(hndl);
38350Sstevel@tonic-gate 		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
38360Sstevel@tonic-gate 		ASSERT(error == 0);
38370Sstevel@tonic-gate 		for (i = 0; i < nelem; i++) {
38380Sstevel@tonic-gate 			if (error = nvlist2rctlval(nvlarray[i], &rv))
38390Sstevel@tonic-gate 				goto out;
38400Sstevel@tonic-gate 		}
38410Sstevel@tonic-gate 		if (rctl_invalid_value(rde, &rv)) {
38420Sstevel@tonic-gate 			error = EINVAL;
38430Sstevel@tonic-gate 			goto out;
38440Sstevel@tonic-gate 		}
38450Sstevel@tonic-gate 	}
38460Sstevel@tonic-gate 	error = 0;
38470Sstevel@tonic-gate 	*nvlp = nvl;
38480Sstevel@tonic-gate out:
38490Sstevel@tonic-gate 	kmem_free(kbuf, buflen);
38500Sstevel@tonic-gate 	if (error && nvl != NULL)
38510Sstevel@tonic-gate 		nvlist_free(nvl);
38520Sstevel@tonic-gate 	return (error);
38530Sstevel@tonic-gate }
38540Sstevel@tonic-gate 
38550Sstevel@tonic-gate int
38560Sstevel@tonic-gate zone_create_error(int er_error, int er_ext, int *er_out) {
38570Sstevel@tonic-gate 	if (er_out != NULL) {
38580Sstevel@tonic-gate 		if (copyout(&er_ext, er_out, sizeof (int))) {
38590Sstevel@tonic-gate 			return (set_errno(EFAULT));
38600Sstevel@tonic-gate 		}
38610Sstevel@tonic-gate 	}
38620Sstevel@tonic-gate 	return (set_errno(er_error));
38630Sstevel@tonic-gate }
38640Sstevel@tonic-gate 
38651676Sjpk static int
38661676Sjpk zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
38671676Sjpk {
38681676Sjpk 	ts_label_t *tsl;
38691676Sjpk 	bslabel_t blab;
38701676Sjpk 
38711676Sjpk 	/* Get label from user */
38721676Sjpk 	if (copyin(lab, &blab, sizeof (blab)) != 0)
38731676Sjpk 		return (EFAULT);
38741676Sjpk 	tsl = labelalloc(&blab, doi, KM_NOSLEEP);
38751676Sjpk 	if (tsl == NULL)
38761676Sjpk 		return (ENOMEM);
38771676Sjpk 
38781676Sjpk 	zone->zone_slabel = tsl;
38791676Sjpk 	return (0);
38801676Sjpk }
38811676Sjpk 
38820Sstevel@tonic-gate /*
3883789Sahrens  * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
3884789Sahrens  */
3885789Sahrens static int
3886789Sahrens parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
3887789Sahrens {
3888789Sahrens 	char *kbuf;
3889789Sahrens 	char *dataset, *next;
3890789Sahrens 	zone_dataset_t *zd;
3891789Sahrens 	size_t len;
3892789Sahrens 
3893789Sahrens 	if (ubuf == NULL || buflen == 0)
3894789Sahrens 		return (0);
3895789Sahrens 
3896789Sahrens 	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
3897789Sahrens 		return (ENOMEM);
3898789Sahrens 
3899789Sahrens 	if (copyin(ubuf, kbuf, buflen) != 0) {
3900789Sahrens 		kmem_free(kbuf, buflen);
3901789Sahrens 		return (EFAULT);
3902789Sahrens 	}
3903789Sahrens 
3904789Sahrens 	dataset = next = kbuf;
3905789Sahrens 	for (;;) {
3906789Sahrens 		zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
3907789Sahrens 
3908789Sahrens 		next = strchr(dataset, ',');
3909789Sahrens 
3910789Sahrens 		if (next == NULL)
3911789Sahrens 			len = strlen(dataset);
3912789Sahrens 		else
3913789Sahrens 			len = next - dataset;
3914789Sahrens 
3915789Sahrens 		zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
3916789Sahrens 		bcopy(dataset, zd->zd_dataset, len);
3917789Sahrens 		zd->zd_dataset[len] = '\0';
3918789Sahrens 
3919789Sahrens 		list_insert_head(&zone->zone_datasets, zd);
3920789Sahrens 
3921789Sahrens 		if (next == NULL)
3922789Sahrens 			break;
3923789Sahrens 
3924789Sahrens 		dataset = next + 1;
3925789Sahrens 	}
3926789Sahrens 
3927789Sahrens 	kmem_free(kbuf, buflen);
3928789Sahrens 	return (0);
3929789Sahrens }
3930789Sahrens 
3931789Sahrens /*
39320Sstevel@tonic-gate  * System call to create/initialize a new zone named 'zone_name', rooted
39330Sstevel@tonic-gate  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
39341676Sjpk  * and initialized with the zone-wide rctls described in 'rctlbuf', and
39351676Sjpk  * with labeling set by 'match', 'doi', and 'label'.
39360Sstevel@tonic-gate  *
39370Sstevel@tonic-gate  * If extended error is non-null, we may use it to return more detailed
39380Sstevel@tonic-gate  * error information.
39390Sstevel@tonic-gate  */
39400Sstevel@tonic-gate static zoneid_t
39410Sstevel@tonic-gate zone_create(const char *zone_name, const char *zone_root,
3942813Sdp     const priv_set_t *zone_privs, size_t zone_privssz,
3943813Sdp     caddr_t rctlbuf, size_t rctlbufsz,
39441676Sjpk     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
39453448Sdh155122     int match, uint32_t doi, const bslabel_t *label,
39463448Sdh155122     int flags)
39470Sstevel@tonic-gate {
39480Sstevel@tonic-gate 	struct zsched_arg zarg;
39490Sstevel@tonic-gate 	nvlist_t *rctls = NULL;
39500Sstevel@tonic-gate 	proc_t *pp = curproc;
39510Sstevel@tonic-gate 	zone_t *zone, *ztmp;
39520Sstevel@tonic-gate 	zoneid_t zoneid;
39530Sstevel@tonic-gate 	int error;
39540Sstevel@tonic-gate 	int error2 = 0;
39550Sstevel@tonic-gate 	char *str;
39560Sstevel@tonic-gate 	cred_t *zkcr;
39571769Scarlsonj 	boolean_t insert_label_hash;
39580Sstevel@tonic-gate 
39590Sstevel@tonic-gate 	if (secpolicy_zone_config(CRED()) != 0)
39600Sstevel@tonic-gate 		return (set_errno(EPERM));
39610Sstevel@tonic-gate 
39620Sstevel@tonic-gate 	/* can't boot zone from within chroot environment */
39630Sstevel@tonic-gate 	if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
39640Sstevel@tonic-gate 		return (zone_create_error(ENOTSUP, ZE_CHROOTED,
3965813Sdp 		    extended_error));
39660Sstevel@tonic-gate 
39670Sstevel@tonic-gate 	zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
39680Sstevel@tonic-gate 	zoneid = zone->zone_id = id_alloc(zoneid_space);
39690Sstevel@tonic-gate 	zone->zone_status = ZONE_IS_UNINITIALIZED;
39700Sstevel@tonic-gate 	zone->zone_pool = pool_default;
39710Sstevel@tonic-gate 	zone->zone_pool_mod = gethrtime();
39720Sstevel@tonic-gate 	zone->zone_psetid = ZONE_PS_INVAL;
39730Sstevel@tonic-gate 	zone->zone_ncpus = 0;
39740Sstevel@tonic-gate 	zone->zone_ncpus_online = 0;
39752712Snn35248 	zone->zone_restart_init = B_TRUE;
39762712Snn35248 	zone->zone_brand = &native_brand;
39772712Snn35248 	zone->zone_initname = NULL;
39780Sstevel@tonic-gate 	mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
39790Sstevel@tonic-gate 	mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
39803247Sgjelinek 	mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
39810Sstevel@tonic-gate 	cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
39820Sstevel@tonic-gate 	list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
39830Sstevel@tonic-gate 	    offsetof(struct zsd_entry, zsd_linkage));
3984789Sahrens 	list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
3985789Sahrens 	    offsetof(zone_dataset_t, zd_linkage));
398610616SSebastien.Roy@Sun.COM 	list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
398710616SSebastien.Roy@Sun.COM 	    offsetof(zone_dl_t, zdl_linkage));
39881676Sjpk 	rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
398910910SRobert.Harris@Sun.COM 	rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
39900Sstevel@tonic-gate 
39913448Sdh155122 	if (flags & ZCF_NET_EXCL) {
39923448Sdh155122 		zone->zone_flags |= ZF_NET_EXCL;
39933448Sdh155122 	}
39943448Sdh155122 
39950Sstevel@tonic-gate 	if ((error = zone_set_name(zone, zone_name)) != 0) {
39960Sstevel@tonic-gate 		zone_free(zone);
39970Sstevel@tonic-gate 		return (zone_create_error(error, 0, extended_error));
39980Sstevel@tonic-gate 	}
39990Sstevel@tonic-gate 
40000Sstevel@tonic-gate 	if ((error = zone_set_root(zone, zone_root)) != 0) {
40010Sstevel@tonic-gate 		zone_free(zone);
40020Sstevel@tonic-gate 		return (zone_create_error(error, 0, extended_error));
40030Sstevel@tonic-gate 	}
4004813Sdp 	if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
40050Sstevel@tonic-gate 		zone_free(zone);
40060Sstevel@tonic-gate 		return (zone_create_error(error, 0, extended_error));
40070Sstevel@tonic-gate 	}
40080Sstevel@tonic-gate 
40090Sstevel@tonic-gate 	/* initialize node name to be the same as zone name */
40100Sstevel@tonic-gate 	zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
40110Sstevel@tonic-gate 	(void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
40120Sstevel@tonic-gate 	zone->zone_nodename[_SYS_NMLN - 1] = '\0';
40130Sstevel@tonic-gate 
40140Sstevel@tonic-gate 	zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
40150Sstevel@tonic-gate 	zone->zone_domain[0] = '\0';
40168662SJordan.Vaughan@Sun.com 	zone->zone_hostid = HW_INVALID_HOSTID;
40170Sstevel@tonic-gate 	zone->zone_shares = 1;
40182677Sml93401 	zone->zone_shmmax = 0;
40192677Sml93401 	zone->zone_ipc.ipcq_shmmni = 0;
40202677Sml93401 	zone->zone_ipc.ipcq_semmni = 0;
40212677Sml93401 	zone->zone_ipc.ipcq_msgmni = 0;
40220Sstevel@tonic-gate 	zone->zone_bootargs = NULL;
402312633Sjohn.levon@sun.com 	zone->zone_fs_allowed = NULL;
40242267Sdp 	zone->zone_initname =
40252267Sdp 	    kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
40262267Sdp 	(void) strcpy(zone->zone_initname, zone_default_initname);
40273247Sgjelinek 	zone->zone_nlwps = 0;
40283247Sgjelinek 	zone->zone_nlwps_ctl = INT_MAX;
4029*12725SMenno.Lageman@Sun.COM 	zone->zone_nprocs = 0;
4030*12725SMenno.Lageman@Sun.COM 	zone->zone_nprocs_ctl = INT_MAX;
40312768Ssl108498 	zone->zone_locked_mem = 0;
40322768Ssl108498 	zone->zone_locked_mem_ctl = UINT64_MAX;
40333247Sgjelinek 	zone->zone_max_swap = 0;
40343247Sgjelinek 	zone->zone_max_swap_ctl = UINT64_MAX;
403512633Sjohn.levon@sun.com 	zone->zone_max_lofi = 0;
403612633Sjohn.levon@sun.com 	zone->zone_max_lofi_ctl = UINT64_MAX;
40373247Sgjelinek 	zone0.zone_lockedmem_kstat = NULL;
40383247Sgjelinek 	zone0.zone_swapresv_kstat = NULL;
40390Sstevel@tonic-gate 
40400Sstevel@tonic-gate 	/*
40410Sstevel@tonic-gate 	 * Zsched initializes the rctls.
40420Sstevel@tonic-gate 	 */
40430Sstevel@tonic-gate 	zone->zone_rctls = NULL;
40440Sstevel@tonic-gate 
40450Sstevel@tonic-gate 	if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
40460Sstevel@tonic-gate 		zone_free(zone);
40470Sstevel@tonic-gate 		return (zone_create_error(error, 0, extended_error));
40480Sstevel@tonic-gate 	}
40490Sstevel@tonic-gate 
4050789Sahrens 	if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4051789Sahrens 		zone_free(zone);
4052789Sahrens 		return (set_errno(error));
4053789Sahrens 	}
4054789Sahrens 
40550Sstevel@tonic-gate 	/*
40561676Sjpk 	 * Read in the trusted system parameters:
40571676Sjpk 	 * match flag and sensitivity label.
40581676Sjpk 	 */
40591676Sjpk 	zone->zone_match = match;
40601769Scarlsonj 	if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
40614462Skp158701 		/* Fail if requested to set doi to anything but system's doi */
40624462Skp158701 		if (doi != 0 && doi != default_doi) {
40634462Skp158701 			zone_free(zone);
40644462Skp158701 			return (set_errno(EINVAL));
40654462Skp158701 		}
40664462Skp158701 		/* Always apply system's doi to the zone */
40674462Skp158701 		error = zone_set_label(zone, label, default_doi);
40681676Sjpk 		if (error != 0) {
40691676Sjpk 			zone_free(zone);
40701676Sjpk 			return (set_errno(error));
40711676Sjpk 		}
40721769Scarlsonj 		insert_label_hash = B_TRUE;
40731676Sjpk 	} else {
40741676Sjpk 		/* all zones get an admin_low label if system is not labeled */
40751676Sjpk 		zone->zone_slabel = l_admin_low;
40761676Sjpk 		label_hold(l_admin_low);
40771769Scarlsonj 		insert_label_hash = B_FALSE;
40781676Sjpk 	}
40791676Sjpk 
40801676Sjpk 	/*
40810Sstevel@tonic-gate 	 * Stop all lwps since that's what normally happens as part of fork().
40820Sstevel@tonic-gate 	 * This needs to happen before we grab any locks to avoid deadlock
40830Sstevel@tonic-gate 	 * (another lwp in the process could be waiting for the held lock).
40840Sstevel@tonic-gate 	 */
40850Sstevel@tonic-gate 	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
40860Sstevel@tonic-gate 		zone_free(zone);
40870Sstevel@tonic-gate 		if (rctls)
40880Sstevel@tonic-gate 			nvlist_free(rctls);
40890Sstevel@tonic-gate 		return (zone_create_error(error, 0, extended_error));
40900Sstevel@tonic-gate 	}
40910Sstevel@tonic-gate 
40920Sstevel@tonic-gate 	if (block_mounts() == 0) {
40930Sstevel@tonic-gate 		mutex_enter(&pp->p_lock);
40940Sstevel@tonic-gate 		if (curthread != pp->p_agenttp)
40950Sstevel@tonic-gate 			continuelwps(pp);
40960Sstevel@tonic-gate 		mutex_exit(&pp->p_lock);
40970Sstevel@tonic-gate 		zone_free(zone);
40980Sstevel@tonic-gate 		if (rctls)
40990Sstevel@tonic-gate 			nvlist_free(rctls);
41000Sstevel@tonic-gate 		return (zone_create_error(error, 0, extended_error));
41010Sstevel@tonic-gate 	}
41020Sstevel@tonic-gate 
41030Sstevel@tonic-gate 	/*
41040Sstevel@tonic-gate 	 * Set up credential for kernel access.  After this, any errors
41050Sstevel@tonic-gate 	 * should go through the dance in errout rather than calling
41060Sstevel@tonic-gate 	 * zone_free directly.
41070Sstevel@tonic-gate 	 */
41080Sstevel@tonic-gate 	zone->zone_kcred = crdup(kcred);
41090Sstevel@tonic-gate 	crsetzone(zone->zone_kcred, zone);
41100Sstevel@tonic-gate 	priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
41110Sstevel@tonic-gate 	priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
41120Sstevel@tonic-gate 	priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
41130Sstevel@tonic-gate 	priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
41140Sstevel@tonic-gate 
41150Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
41160Sstevel@tonic-gate 	/*
41170Sstevel@tonic-gate 	 * Make sure zone doesn't already exist.
41181676Sjpk 	 *
41191676Sjpk 	 * If the system and zone are labeled,
41201676Sjpk 	 * make sure no other zone exists that has the same label.
41210Sstevel@tonic-gate 	 */
41221676Sjpk 	if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
41231769Scarlsonj 	    (insert_label_hash &&
41241676Sjpk 	    (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
41250Sstevel@tonic-gate 		zone_status_t status;
41260Sstevel@tonic-gate 
41270Sstevel@tonic-gate 		status = zone_status_get(ztmp);
41280Sstevel@tonic-gate 		if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
41290Sstevel@tonic-gate 			error = EEXIST;
41300Sstevel@tonic-gate 		else
41310Sstevel@tonic-gate 			error = EBUSY;
41324791Ston 
41334791Ston 		if (insert_label_hash)
41344791Ston 			error2 = ZE_LABELINUSE;
41354791Ston 
41360Sstevel@tonic-gate 		goto errout;
41370Sstevel@tonic-gate 	}
41380Sstevel@tonic-gate 
41390Sstevel@tonic-gate 	/*
41400Sstevel@tonic-gate 	 * Don't allow zone creations which would cause one zone's rootpath to
41410Sstevel@tonic-gate 	 * be accessible from that of another (non-global) zone.
41420Sstevel@tonic-gate 	 */
41430Sstevel@tonic-gate 	if (zone_is_nested(zone->zone_rootpath)) {
41440Sstevel@tonic-gate 		error = EBUSY;
41450Sstevel@tonic-gate 		goto errout;
41460Sstevel@tonic-gate 	}
41470Sstevel@tonic-gate 
41480Sstevel@tonic-gate 	ASSERT(zonecount != 0);		/* check for leaks */
41490Sstevel@tonic-gate 	if (zonecount + 1 > maxzones) {
41500Sstevel@tonic-gate 		error = ENOMEM;
41510Sstevel@tonic-gate 		goto errout;
41520Sstevel@tonic-gate 	}
41530Sstevel@tonic-gate 
41540Sstevel@tonic-gate 	if (zone_mount_count(zone->zone_rootpath) != 0) {
41550Sstevel@tonic-gate 		error = EBUSY;
41560Sstevel@tonic-gate 		error2 = ZE_AREMOUNTS;
41570Sstevel@tonic-gate 		goto errout;
41580Sstevel@tonic-gate 	}
41590Sstevel@tonic-gate 
41600Sstevel@tonic-gate 	/*
41610Sstevel@tonic-gate 	 * Zone is still incomplete, but we need to drop all locks while
41620Sstevel@tonic-gate 	 * zsched() initializes this zone's kernel process.  We
41630Sstevel@tonic-gate 	 * optimistically add the zone to the hashtable and associated
41640Sstevel@tonic-gate 	 * lists so a parallel zone_create() doesn't try to create the
41650Sstevel@tonic-gate 	 * same zone.
41660Sstevel@tonic-gate 	 */
41670Sstevel@tonic-gate 	zonecount++;
41680Sstevel@tonic-gate 	(void) mod_hash_insert(zonehashbyid,
41690Sstevel@tonic-gate 	    (mod_hash_key_t)(uintptr_t)zone->zone_id,
41700Sstevel@tonic-gate 	    (mod_hash_val_t)(uintptr_t)zone);
41710Sstevel@tonic-gate 	str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
41720Sstevel@tonic-gate 	(void) strcpy(str, zone->zone_name);
41730Sstevel@tonic-gate 	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
41740Sstevel@tonic-gate 	    (mod_hash_val_t)(uintptr_t)zone);
41751769Scarlsonj 	if (insert_label_hash) {
41761676Sjpk 		(void) mod_hash_insert(zonehashbylabel,
41771676Sjpk 		    (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
41781769Scarlsonj 		zone->zone_flags |= ZF_HASHED_LABEL;
41791676Sjpk 	}
41801676Sjpk 
41810Sstevel@tonic-gate 	/*
41820Sstevel@tonic-gate 	 * Insert into active list.  At this point there are no 'hold's
41830Sstevel@tonic-gate 	 * on the zone, but everyone else knows not to use it, so we can
41840Sstevel@tonic-gate 	 * continue to use it.  zsched() will do a zone_hold() if the
41850Sstevel@tonic-gate 	 * newproc() is successful.
41860Sstevel@tonic-gate 	 */
41870Sstevel@tonic-gate 	list_insert_tail(&zone_active, zone);
41880Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
41890Sstevel@tonic-gate 
41900Sstevel@tonic-gate 	zarg.zone = zone;
41910Sstevel@tonic-gate 	zarg.nvlist = rctls;
41920Sstevel@tonic-gate 	/*
41930Sstevel@tonic-gate 	 * The process, task, and project rctls are probably wrong;
41940Sstevel@tonic-gate 	 * we need an interface to get the default values of all rctls,
41950Sstevel@tonic-gate 	 * and initialize zsched appropriately.  I'm not sure that that
41960Sstevel@tonic-gate 	 * makes much of a difference, though.
41970Sstevel@tonic-gate 	 */
419811173SJonathan.Adams@Sun.COM 	error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
419911173SJonathan.Adams@Sun.COM 	if (error != 0) {
42000Sstevel@tonic-gate 		/*
42010Sstevel@tonic-gate 		 * We need to undo all globally visible state.
42020Sstevel@tonic-gate 		 */
42030Sstevel@tonic-gate 		mutex_enter(&zonehash_lock);
42040Sstevel@tonic-gate 		list_remove(&zone_active, zone);
42051769Scarlsonj 		if (zone->zone_flags & ZF_HASHED_LABEL) {
42061676Sjpk 			ASSERT(zone->zone_slabel != NULL);
42071676Sjpk 			(void) mod_hash_destroy(zonehashbylabel,
42081676Sjpk 			    (mod_hash_key_t)zone->zone_slabel);
42091676Sjpk 		}
42100Sstevel@tonic-gate 		(void) mod_hash_destroy(zonehashbyname,
42110Sstevel@tonic-gate 		    (mod_hash_key_t)(uintptr_t)zone->zone_name);
42120Sstevel@tonic-gate 		(void) mod_hash_destroy(zonehashbyid,
42130Sstevel@tonic-gate 		    (mod_hash_key_t)(uintptr_t)zone->zone_id);
42140Sstevel@tonic-gate 		ASSERT(zonecount > 1);
42150Sstevel@tonic-gate 		zonecount--;
42160Sstevel@tonic-gate 		goto errout;
42170Sstevel@tonic-gate 	}
42180Sstevel@tonic-gate 
42190Sstevel@tonic-gate 	/*
42200Sstevel@tonic-gate 	 * Zone creation can't fail from now on.
42210Sstevel@tonic-gate 	 */
42220Sstevel@tonic-gate 
42230Sstevel@tonic-gate 	/*
42243247Sgjelinek 	 * Create zone kstats
42253247Sgjelinek 	 */
42263247Sgjelinek 	zone_kstat_create(zone);
42273247Sgjelinek 
42283247Sgjelinek 	/*
42290Sstevel@tonic-gate 	 * Let the other lwps continue.
42300Sstevel@tonic-gate 	 */
42310Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
42320Sstevel@tonic-gate 	if (curthread != pp->p_agenttp)
42330Sstevel@tonic-gate 		continuelwps(pp);
42340Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
42350Sstevel@tonic-gate 
42360Sstevel@tonic-gate 	/*
42370Sstevel@tonic-gate 	 * Wait for zsched to finish initializing the zone.
42380Sstevel@tonic-gate 	 */
42390Sstevel@tonic-gate 	zone_status_wait(zone, ZONE_IS_READY);
42400Sstevel@tonic-gate 	/*
42410Sstevel@tonic-gate 	 * The zone is fully visible, so we can let mounts progress.
42420Sstevel@tonic-gate 	 */
42430Sstevel@tonic-gate 	resume_mounts();
42440Sstevel@tonic-gate 	if (rctls)
42450Sstevel@tonic-gate 		nvlist_free(rctls);
42460Sstevel@tonic-gate 
42470Sstevel@tonic-gate 	return (zoneid);
42480Sstevel@tonic-gate 
42490Sstevel@tonic-gate errout:
42500Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
42510Sstevel@tonic-gate 	/*
42520Sstevel@tonic-gate 	 * Let the other lwps continue.
42530Sstevel@tonic-gate 	 */
42540Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
42550Sstevel@tonic-gate 	if (curthread != pp->p_agenttp)
42560Sstevel@tonic-gate 		continuelwps(pp);
42570Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
42580Sstevel@tonic-gate 
42590Sstevel@tonic-gate 	resume_mounts();
42600Sstevel@tonic-gate 	if (rctls)
42610Sstevel@tonic-gate 		nvlist_free(rctls);
42620Sstevel@tonic-gate 	/*
42630Sstevel@tonic-gate 	 * There is currently one reference to the zone, a cred_ref from
42640Sstevel@tonic-gate 	 * zone_kcred.  To free the zone, we call crfree, which will call
42650Sstevel@tonic-gate 	 * zone_cred_rele, which will call zone_free.
42660Sstevel@tonic-gate 	 */
42670Sstevel@tonic-gate 	ASSERT(zone->zone_cred_ref == 1);	/* for zone_kcred */
42680Sstevel@tonic-gate 	ASSERT(zone->zone_kcred->cr_ref == 1);
42690Sstevel@tonic-gate 	ASSERT(zone->zone_ref == 0);
42700Sstevel@tonic-gate 	zkcr = zone->zone_kcred;
42710Sstevel@tonic-gate 	zone->zone_kcred = NULL;
42720Sstevel@tonic-gate 	crfree(zkcr);				/* triggers call to zone_free */
42730Sstevel@tonic-gate 	return (zone_create_error(error, error2, extended_error));
42740Sstevel@tonic-gate }
42750Sstevel@tonic-gate 
42760Sstevel@tonic-gate /*
42770Sstevel@tonic-gate  * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
42782267Sdp  * the heavy lifting.  initname is the path to the program to launch
42792267Sdp  * at the "top" of the zone; if this is NULL, we use the system default,
42802267Sdp  * which is stored at zone_default_initname.
42810Sstevel@tonic-gate  */
42820Sstevel@tonic-gate static int
42832267Sdp zone_boot(zoneid_t zoneid)
42840Sstevel@tonic-gate {
42850Sstevel@tonic-gate 	int err;
42860Sstevel@tonic-gate 	zone_t *zone;
42870Sstevel@tonic-gate 
42880Sstevel@tonic-gate 	if (secpolicy_zone_config(CRED()) != 0)
42890Sstevel@tonic-gate 		return (set_errno(EPERM));
42900Sstevel@tonic-gate 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
42910Sstevel@tonic-gate 		return (set_errno(EINVAL));
42920Sstevel@tonic-gate 
42930Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
42940Sstevel@tonic-gate 	/*
42950Sstevel@tonic-gate 	 * Look for zone under hash lock to prevent races with calls to
42960Sstevel@tonic-gate 	 * zone_shutdown, zone_destroy, etc.
42970Sstevel@tonic-gate 	 */
42980Sstevel@tonic-gate 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
42990Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
43000Sstevel@tonic-gate 		return (set_errno(EINVAL));
43010Sstevel@tonic-gate 	}
43020Sstevel@tonic-gate 
43030Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
43040Sstevel@tonic-gate 	if (zone_status_get(zone) != ZONE_IS_READY) {
43050Sstevel@tonic-gate 		mutex_exit(&zone_status_lock);
43060Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
43070Sstevel@tonic-gate 		return (set_errno(EINVAL));
43080Sstevel@tonic-gate 	}
43090Sstevel@tonic-gate 	zone_status_set(zone, ZONE_IS_BOOTING);
43100Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
43110Sstevel@tonic-gate 
43120Sstevel@tonic-gate 	zone_hold(zone);	/* so we can use the zone_t later */
43130Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
43140Sstevel@tonic-gate 
43150Sstevel@tonic-gate 	if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
43160Sstevel@tonic-gate 		zone_rele(zone);
43170Sstevel@tonic-gate 		return (set_errno(EINTR));
43180Sstevel@tonic-gate 	}
43190Sstevel@tonic-gate 
43200Sstevel@tonic-gate 	/*
43210Sstevel@tonic-gate 	 * Boot (starting init) might have failed, in which case the zone
43220Sstevel@tonic-gate 	 * will go to the SHUTTING_DOWN state; an appropriate errno will
43230Sstevel@tonic-gate 	 * be placed in zone->zone_boot_err, and so we return that.
43240Sstevel@tonic-gate 	 */
43250Sstevel@tonic-gate 	err = zone->zone_boot_err;
43260Sstevel@tonic-gate 	zone_rele(zone);
43270Sstevel@tonic-gate 	return (err ? set_errno(err) : 0);
43280Sstevel@tonic-gate }
43290Sstevel@tonic-gate 
43300Sstevel@tonic-gate /*
43310Sstevel@tonic-gate  * Kills all user processes in the zone, waiting for them all to exit
43320Sstevel@tonic-gate  * before returning.
43330Sstevel@tonic-gate  */
43340Sstevel@tonic-gate static int
43350Sstevel@tonic-gate zone_empty(zone_t *zone)
43360Sstevel@tonic-gate {
43370Sstevel@tonic-gate 	int waitstatus;
43380Sstevel@tonic-gate 
43390Sstevel@tonic-gate 	/*
43400Sstevel@tonic-gate 	 * We need to drop zonehash_lock before killing all
43410Sstevel@tonic-gate 	 * processes, otherwise we'll deadlock with zone_find_*
43420Sstevel@tonic-gate 	 * which can be called from the exit path.
43430Sstevel@tonic-gate 	 */
43440Sstevel@tonic-gate 	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
434511066Srafael.vanoni@sun.com 	while ((waitstatus = zone_status_timedwait_sig(zone,
434611066Srafael.vanoni@sun.com 	    ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
43470Sstevel@tonic-gate 		killall(zone->zone_id);
43480Sstevel@tonic-gate 	}
43490Sstevel@tonic-gate 	/*
43500Sstevel@tonic-gate 	 * return EINTR if we were signaled
43510Sstevel@tonic-gate 	 */
43520Sstevel@tonic-gate 	if (waitstatus == 0)
43530Sstevel@tonic-gate 		return (EINTR);
43540Sstevel@tonic-gate 	return (0);
43550Sstevel@tonic-gate }
43560Sstevel@tonic-gate 
43570Sstevel@tonic-gate /*
43581676Sjpk  * This function implements the policy for zone visibility.
43591676Sjpk  *
43601676Sjpk  * In standard Solaris, a non-global zone can only see itself.
43611676Sjpk  *
43621676Sjpk  * In Trusted Extensions, a labeled zone can lookup any zone whose label
43631676Sjpk  * it dominates. For this test, the label of the global zone is treated as
43641676Sjpk  * admin_high so it is special-cased instead of being checked for dominance.
43651676Sjpk  *
43661676Sjpk  * Returns true if zone attributes are viewable, false otherwise.
43671676Sjpk  */
43681676Sjpk static boolean_t
43691676Sjpk zone_list_access(zone_t *zone)
43701676Sjpk {
43711676Sjpk 
43721676Sjpk 	if (curproc->p_zone == global_zone ||
43731676Sjpk 	    curproc->p_zone == zone) {
43741676Sjpk 		return (B_TRUE);
43751769Scarlsonj 	} else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
43761676Sjpk 		bslabel_t *curproc_label;
43771676Sjpk 		bslabel_t *zone_label;
43781676Sjpk 
43791676Sjpk 		curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
43801676Sjpk 		zone_label = label2bslabel(zone->zone_slabel);
43811676Sjpk 
43821676Sjpk 		if (zone->zone_id != GLOBAL_ZONEID &&
43831676Sjpk 		    bldominates(curproc_label, zone_label)) {
43841676Sjpk 			return (B_TRUE);
43851676Sjpk 		} else {
43861676Sjpk 			return (B_FALSE);
43871676Sjpk 		}
43881676Sjpk 	} else {
43891676Sjpk 		return (B_FALSE);
43901676Sjpk 	}
43911676Sjpk }
43921676Sjpk 
43931676Sjpk /*
43940Sstevel@tonic-gate  * Systemcall to start the zone's halt sequence.  By the time this
43950Sstevel@tonic-gate  * function successfully returns, all user processes and kernel threads
43960Sstevel@tonic-gate  * executing in it will have exited, ZSD shutdown callbacks executed,
43970Sstevel@tonic-gate  * and the zone status set to ZONE_IS_DOWN.
43980Sstevel@tonic-gate  *
43990Sstevel@tonic-gate  * It is possible that the call will interrupt itself if the caller is the
44000Sstevel@tonic-gate  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
44010Sstevel@tonic-gate  */
44020Sstevel@tonic-gate static int
44030Sstevel@tonic-gate zone_shutdown(zoneid_t zoneid)
44040Sstevel@tonic-gate {
44050Sstevel@tonic-gate 	int error;
44060Sstevel@tonic-gate 	zone_t *zone;
44070Sstevel@tonic-gate 	zone_status_t status;
44080Sstevel@tonic-gate 
44090Sstevel@tonic-gate 	if (secpolicy_zone_config(CRED()) != 0)
44100Sstevel@tonic-gate 		return (set_errno(EPERM));
44110Sstevel@tonic-gate 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
44120Sstevel@tonic-gate 		return (set_errno(EINVAL));
44130Sstevel@tonic-gate 
44140Sstevel@tonic-gate 	/*
44150Sstevel@tonic-gate 	 * Block mounts so that VFS_MOUNT() can get an accurate view of
44160Sstevel@tonic-gate 	 * the zone's status with regards to ZONE_IS_SHUTTING down.
44170Sstevel@tonic-gate 	 *
44180Sstevel@tonic-gate 	 * e.g. NFS can fail the mount if it determines that the zone
44190Sstevel@tonic-gate 	 * has already begun the shutdown sequence.
44200Sstevel@tonic-gate 	 */
44210Sstevel@tonic-gate 	if (block_mounts() == 0)
44220Sstevel@tonic-gate 		return (set_errno(EINTR));
44230Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
44240Sstevel@tonic-gate 	/*
44250Sstevel@tonic-gate 	 * Look for zone under hash lock to prevent races with other
44260Sstevel@tonic-gate 	 * calls to zone_shutdown and zone_destroy.
44270Sstevel@tonic-gate 	 */
44280Sstevel@tonic-gate 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
44290Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
44300Sstevel@tonic-gate 		resume_mounts();
44310Sstevel@tonic-gate 		return (set_errno(EINVAL));
44320Sstevel@tonic-gate 	}
44330Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
44340Sstevel@tonic-gate 	status = zone_status_get(zone);
44350Sstevel@tonic-gate 	/*
44360Sstevel@tonic-gate 	 * Fail if the zone isn't fully initialized yet.
44370Sstevel@tonic-gate 	 */
44380Sstevel@tonic-gate 	if (status < ZONE_IS_READY) {
44390Sstevel@tonic-gate 		mutex_exit(&zone_status_lock);
44400Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
44410Sstevel@tonic-gate 		resume_mounts();
44420Sstevel@tonic-gate 		return (set_errno(EINVAL));
44430Sstevel@tonic-gate 	}
44440Sstevel@tonic-gate 	/*
44450Sstevel@tonic-gate 	 * If conditions required for zone_shutdown() to return have been met,
44460Sstevel@tonic-gate 	 * return success.
44470Sstevel@tonic-gate 	 */
44480Sstevel@tonic-gate 	if (status >= ZONE_IS_DOWN) {
44490Sstevel@tonic-gate 		mutex_exit(&zone_status_lock);
44500Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
44510Sstevel@tonic-gate 		resume_mounts();
44520Sstevel@tonic-gate 		return (0);
44530Sstevel@tonic-gate 	}
44540Sstevel@tonic-gate 	/*
44550Sstevel@tonic-gate 	 * If zone_shutdown() hasn't been called before, go through the motions.
44560Sstevel@tonic-gate 	 * If it has, there's nothing to do but wait for the kernel threads to
44570Sstevel@tonic-gate 	 * drain.
44580Sstevel@tonic-gate 	 */
44590Sstevel@tonic-gate 	if (status < ZONE_IS_EMPTY) {
44600Sstevel@tonic-gate 		uint_t ntasks;
44610Sstevel@tonic-gate 
44620Sstevel@tonic-gate 		mutex_enter(&zone->zone_lock);
44630Sstevel@tonic-gate 		if ((ntasks = zone->zone_ntasks) != 1) {
44640Sstevel@tonic-gate 			/*
44650Sstevel@tonic-gate 			 * There's still stuff running.
44660Sstevel@tonic-gate 			 */
44670Sstevel@tonic-gate 			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
44680Sstevel@tonic-gate 		}
44690Sstevel@tonic-gate 		mutex_exit(&zone->zone_lock);
44700Sstevel@tonic-gate 		if (ntasks == 1) {
44710Sstevel@tonic-gate 			/*
44720Sstevel@tonic-gate 			 * The only way to create another task is through
44730Sstevel@tonic-gate 			 * zone_enter(), which will block until we drop
44740Sstevel@tonic-gate 			 * zonehash_lock.  The zone is empty.
44750Sstevel@tonic-gate 			 */
44760Sstevel@tonic-gate 			if (zone->zone_kthreads == NULL) {
44770Sstevel@tonic-gate 				/*
44780Sstevel@tonic-gate 				 * Skip ahead to ZONE_IS_DOWN
44790Sstevel@tonic-gate 				 */
44800Sstevel@tonic-gate 				zone_status_set(zone, ZONE_IS_DOWN);
44810Sstevel@tonic-gate 			} else {
44820Sstevel@tonic-gate 				zone_status_set(zone, ZONE_IS_EMPTY);
44830Sstevel@tonic-gate 			}
44840Sstevel@tonic-gate 		}
44850Sstevel@tonic-gate 	}
44860Sstevel@tonic-gate 	zone_hold(zone);	/* so we can use the zone_t later */
44870Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
44880Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
44890Sstevel@tonic-gate 	resume_mounts();
44900Sstevel@tonic-gate 
44910Sstevel@tonic-gate 	if (error = zone_empty(zone)) {
44920Sstevel@tonic-gate 		zone_rele(zone);
44930Sstevel@tonic-gate 		return (set_errno(error));
44940Sstevel@tonic-gate 	}
44950Sstevel@tonic-gate 	/*
44960Sstevel@tonic-gate 	 * After the zone status goes to ZONE_IS_DOWN this zone will no
44970Sstevel@tonic-gate 	 * longer be notified of changes to the pools configuration, so
44980Sstevel@tonic-gate 	 * in order to not end up with a stale pool pointer, we point
44990Sstevel@tonic-gate 	 * ourselves at the default pool and remove all resource
45000Sstevel@tonic-gate 	 * visibility.  This is especially important as the zone_t may
45010Sstevel@tonic-gate 	 * languish on the deathrow for a very long time waiting for
45020Sstevel@tonic-gate 	 * cred's to drain out.
45030Sstevel@tonic-gate 	 *
45040Sstevel@tonic-gate 	 * This rebinding of the zone can happen multiple times
45050Sstevel@tonic-gate 	 * (presumably due to interrupted or parallel systemcalls)
45060Sstevel@tonic-gate 	 * without any adverse effects.
45070Sstevel@tonic-gate 	 */
45080Sstevel@tonic-gate 	if (pool_lock_intr() != 0) {
45090Sstevel@tonic-gate 		zone_rele(zone);
45100Sstevel@tonic-gate 		return (set_errno(EINTR));
45110Sstevel@tonic-gate 	}
45120Sstevel@tonic-gate 	if (pool_state == POOL_ENABLED) {
45130Sstevel@tonic-gate 		mutex_enter(&cpu_lock);
45140Sstevel@tonic-gate 		zone_pool_set(zone, pool_default);
45150Sstevel@tonic-gate 		/*
45160Sstevel@tonic-gate 		 * The zone no longer needs to be able to see any cpus.
45170Sstevel@tonic-gate 		 */
45180Sstevel@tonic-gate 		zone_pset_set(zone, ZONE_PS_INVAL);
45190Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
45200Sstevel@tonic-gate 	}
45210Sstevel@tonic-gate 	pool_unlock();
45220Sstevel@tonic-gate 
45230Sstevel@tonic-gate 	/*
45240Sstevel@tonic-gate 	 * ZSD shutdown callbacks can be executed multiple times, hence
45250Sstevel@tonic-gate 	 * it is safe to not be holding any locks across this call.
45260Sstevel@tonic-gate 	 */
45270Sstevel@tonic-gate 	zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
45280Sstevel@tonic-gate 
45290Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
45300Sstevel@tonic-gate 	if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
45310Sstevel@tonic-gate 		zone_status_set(zone, ZONE_IS_DOWN);
45320Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
45330Sstevel@tonic-gate 
45340Sstevel@tonic-gate 	/*
45350Sstevel@tonic-gate 	 * Wait for kernel threads to drain.
45360Sstevel@tonic-gate 	 */
45370Sstevel@tonic-gate 	if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
45380Sstevel@tonic-gate 		zone_rele(zone);
45390Sstevel@tonic-gate 		return (set_errno(EINTR));
45400Sstevel@tonic-gate 	}
45412712Snn35248 
45423671Ssl108498 	/*
45433671Ssl108498 	 * Zone can be become down/destroyable even if the above wait
45443671Ssl108498 	 * returns EINTR, so any code added here may never execute.
45453671Ssl108498 	 * (i.e. don't add code here)
45463671Ssl108498 	 */
45472712Snn35248 
45480Sstevel@tonic-gate 	zone_rele(zone);
45490Sstevel@tonic-gate 	return (0);
45500Sstevel@tonic-gate }
45510Sstevel@tonic-gate 
45520Sstevel@tonic-gate /*
45530Sstevel@tonic-gate  * Systemcall entry point to finalize the zone halt process.  The caller
45542677Sml93401  * must have already successfully called zone_shutdown().
45550Sstevel@tonic-gate  *
45560Sstevel@tonic-gate  * Upon successful completion, the zone will have been fully destroyed:
45570Sstevel@tonic-gate  * zsched will have exited, destructor callbacks executed, and the zone
45580Sstevel@tonic-gate  * removed from the list of active zones.
45590Sstevel@tonic-gate  */
45600Sstevel@tonic-gate static int
45610Sstevel@tonic-gate zone_destroy(zoneid_t zoneid)
45620Sstevel@tonic-gate {
45630Sstevel@tonic-gate 	uint64_t uniqid;
45640Sstevel@tonic-gate 	zone_t *zone;
45650Sstevel@tonic-gate 	zone_status_t status;
45660Sstevel@tonic-gate 
45670Sstevel@tonic-gate 	if (secpolicy_zone_config(CRED()) != 0)
45680Sstevel@tonic-gate 		return (set_errno(EPERM));
45690Sstevel@tonic-gate 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
45700Sstevel@tonic-gate 		return (set_errno(EINVAL));
45710Sstevel@tonic-gate 
45720Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
45730Sstevel@tonic-gate 	/*
45740Sstevel@tonic-gate 	 * Look for zone under hash lock to prevent races with other
45750Sstevel@tonic-gate 	 * calls to zone_destroy.
45760Sstevel@tonic-gate 	 */
45770Sstevel@tonic-gate 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
45780Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
45790Sstevel@tonic-gate 		return (set_errno(EINVAL));
45800Sstevel@tonic-gate 	}
45810Sstevel@tonic-gate 
45820Sstevel@tonic-gate 	if (zone_mount_count(zone->zone_rootpath) != 0) {
45830Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
45840Sstevel@tonic-gate 		return (set_errno(EBUSY));
45850Sstevel@tonic-gate 	}
45860Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
45870Sstevel@tonic-gate 	status = zone_status_get(zone);
45880Sstevel@tonic-gate 	if (status < ZONE_IS_DOWN) {
45890Sstevel@tonic-gate 		mutex_exit(&zone_status_lock);
45900Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
45910Sstevel@tonic-gate 		return (set_errno(EBUSY));
45920Sstevel@tonic-gate 	} else if (status == ZONE_IS_DOWN) {
45930Sstevel@tonic-gate 		zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
45940Sstevel@tonic-gate 	}
45950Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
45960Sstevel@tonic-gate 	zone_hold(zone);
45970Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
45980Sstevel@tonic-gate 
45990Sstevel@tonic-gate 	/*
46000Sstevel@tonic-gate 	 * wait for zsched to exit
46010Sstevel@tonic-gate 	 */
46020Sstevel@tonic-gate 	zone_status_wait(zone, ZONE_IS_DEAD);
46030Sstevel@tonic-gate 	zone_zsd_callbacks(zone, ZSD_DESTROY);
46043448Sdh155122 	zone->zone_netstack = NULL;
46050Sstevel@tonic-gate 	uniqid = zone->zone_uniqid;
46060Sstevel@tonic-gate 	zone_rele(zone);
46070Sstevel@tonic-gate 	zone = NULL;	/* potentially free'd */
46080Sstevel@tonic-gate 
46090Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
46100Sstevel@tonic-gate 	for (; /* ever */; ) {
46110Sstevel@tonic-gate 		boolean_t unref;
46120Sstevel@tonic-gate 
46130Sstevel@tonic-gate 		if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
46140Sstevel@tonic-gate 		    zone->zone_uniqid != uniqid) {
46150Sstevel@tonic-gate 			/*
46160Sstevel@tonic-gate 			 * The zone has gone away.  Necessary conditions
46170Sstevel@tonic-gate 			 * are met, so we return success.
46180Sstevel@tonic-gate 			 */
46190Sstevel@tonic-gate 			mutex_exit(&zonehash_lock);
46200Sstevel@tonic-gate 			return (0);
46210Sstevel@tonic-gate 		}
46220Sstevel@tonic-gate 		mutex_enter(&zone->zone_lock);
46230Sstevel@tonic-gate 		unref = ZONE_IS_UNREF(zone);
46240Sstevel@tonic-gate 		mutex_exit(&zone->zone_lock);
46250Sstevel@tonic-gate 		if (unref) {
46260Sstevel@tonic-gate 			/*
46270Sstevel@tonic-gate 			 * There is only one reference to the zone -- that
46280Sstevel@tonic-gate 			 * added when the zone was added to the hashtables --
46290Sstevel@tonic-gate 			 * and things will remain this way until we drop
46300Sstevel@tonic-gate 			 * zonehash_lock... we can go ahead and cleanup the
46310Sstevel@tonic-gate 			 * zone.
46320Sstevel@tonic-gate 			 */
46330Sstevel@tonic-gate 			break;
46340Sstevel@tonic-gate 		}
46350Sstevel@tonic-gate 
46360Sstevel@tonic-gate 		if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
46370Sstevel@tonic-gate 			/* Signaled */
46380Sstevel@tonic-gate 			mutex_exit(&zonehash_lock);
46390Sstevel@tonic-gate 			return (set_errno(EINTR));
46400Sstevel@tonic-gate 		}
46410Sstevel@tonic-gate 
46420Sstevel@tonic-gate 	}
46430Sstevel@tonic-gate 
46443792Sakolb 	/*
46453792Sakolb 	 * Remove CPU cap for this zone now since we're not going to
46463792Sakolb 	 * fail below this point.
46473792Sakolb 	 */
46483792Sakolb 	cpucaps_zone_remove(zone);
46493792Sakolb 
46503792Sakolb 	/* Get rid of the zone's kstats */
46513247Sgjelinek 	zone_kstat_delete(zone);
46523247Sgjelinek 
465312273SCasper.Dik@Sun.COM 	/* remove the pfexecd doors */
465412273SCasper.Dik@Sun.COM 	if (zone->zone_pfexecd != NULL) {
465512273SCasper.Dik@Sun.COM 		klpd_freelist(&zone->zone_pfexecd);
465612273SCasper.Dik@Sun.COM 		zone->zone_pfexecd = NULL;
465712273SCasper.Dik@Sun.COM 	}
465812273SCasper.Dik@Sun.COM 
46594888Seh208807 	/* free brand specific data */
46604888Seh208807 	if (ZONE_IS_BRANDED(zone))
46614888Seh208807 		ZBROP(zone)->b_free_brand_data(zone);
46624888Seh208807 
46633671Ssl108498 	/* Say goodbye to brand framework. */
46643671Ssl108498 	brand_unregister_zone(zone->zone_brand);
46653671Ssl108498 
46660Sstevel@tonic-gate 	/*
46670Sstevel@tonic-gate 	 * It is now safe to let the zone be recreated; remove it from the
46680Sstevel@tonic-gate 	 * lists.  The memory will not be freed until the last cred
46690Sstevel@tonic-gate 	 * reference goes away.
46700Sstevel@tonic-gate 	 */
46710Sstevel@tonic-gate 	ASSERT(zonecount > 1);	/* must be > 1; can't destroy global zone */
46720Sstevel@tonic-gate 	zonecount--;
46730Sstevel@tonic-gate 	/* remove from active list and hash tables */
46740Sstevel@tonic-gate 	list_remove(&zone_active, zone);
46750Sstevel@tonic-gate 	(void) mod_hash_destroy(zonehashbyname,
46760Sstevel@tonic-gate 	    (mod_hash_key_t)zone->zone_name);
46770Sstevel@tonic-gate 	(void) mod_hash_destroy(zonehashbyid,
46780Sstevel@tonic-gate 	    (mod_hash_key_t)(uintptr_t)zone->zone_id);
46791769Scarlsonj 	if (zone->zone_flags & ZF_HASHED_LABEL)
46801676Sjpk 		(void) mod_hash_destroy(zonehashbylabel,
46811676Sjpk 		    (mod_hash_key_t)zone->zone_slabel);
46820Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
46830Sstevel@tonic-gate 
4684766Scarlsonj 	/*
4685766Scarlsonj 	 * Release the root vnode; we're not using it anymore.  Nor should any
4686766Scarlsonj 	 * other thread that might access it exist.
4687766Scarlsonj 	 */
4688766Scarlsonj 	if (zone->zone_rootvp != NULL) {
4689766Scarlsonj 		VN_RELE(zone->zone_rootvp);
4690766Scarlsonj 		zone->zone_rootvp = NULL;
4691766Scarlsonj 	}
4692766Scarlsonj 
46930Sstevel@tonic-gate 	/* add to deathrow list */
46940Sstevel@tonic-gate 	mutex_enter(&zone_deathrow_lock);
46950Sstevel@tonic-gate 	list_insert_tail(&zone_deathrow, zone);
46960Sstevel@tonic-gate 	mutex_exit(&zone_deathrow_lock);
46970Sstevel@tonic-gate 
46980Sstevel@tonic-gate 	/*
46990Sstevel@tonic-gate 	 * Drop last reference (which was added by zsched()), this will
47000Sstevel@tonic-gate 	 * free the zone unless there are outstanding cred references.
47010Sstevel@tonic-gate 	 */
47020Sstevel@tonic-gate 	zone_rele(zone);
47030Sstevel@tonic-gate 	return (0);
47040Sstevel@tonic-gate }
47050Sstevel@tonic-gate 
47060Sstevel@tonic-gate /*
47070Sstevel@tonic-gate  * Systemcall entry point for zone_getattr(2).
47080Sstevel@tonic-gate  */
47090Sstevel@tonic-gate static ssize_t
47100Sstevel@tonic-gate zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
47110Sstevel@tonic-gate {
47120Sstevel@tonic-gate 	size_t size;
47130Sstevel@tonic-gate 	int error = 0, err;
47140Sstevel@tonic-gate 	zone_t *zone;
47150Sstevel@tonic-gate 	char *zonepath;
47162267Sdp 	char *outstr;
47170Sstevel@tonic-gate 	zone_status_t zone_status;
47180Sstevel@tonic-gate 	pid_t initpid;
47193792Sakolb 	boolean_t global = (curzone == global_zone);
47203792Sakolb 	boolean_t inzone = (curzone->zone_id == zoneid);
47213448Sdh155122 	ushort_t flags;
47220Sstevel@tonic-gate 
47230Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
47240Sstevel@tonic-gate 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
47250Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
47260Sstevel@tonic-gate 		return (set_errno(EINVAL));
47270Sstevel@tonic-gate 	}
47280Sstevel@tonic-gate 	zone_status = zone_status_get(zone);
47295880Snordmark 	if (zone_status < ZONE_IS_INITIALIZED) {
47300Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
47310Sstevel@tonic-gate 		return (set_errno(EINVAL));
47320Sstevel@tonic-gate 	}
47330Sstevel@tonic-gate 	zone_hold(zone);
47340Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
47350Sstevel@tonic-gate 
47360Sstevel@tonic-gate 	/*
47371676Sjpk 	 * If not in the global zone, don't show information about other zones,
47381676Sjpk 	 * unless the system is labeled and the local zone's label dominates
47391676Sjpk 	 * the other zone.
47400Sstevel@tonic-gate 	 */
47411676Sjpk 	if (!zone_list_access(zone)) {
47420Sstevel@tonic-gate 		zone_rele(zone);
47430Sstevel@tonic-gate 		return (set_errno(EINVAL));
47440Sstevel@tonic-gate 	}
47450Sstevel@tonic-gate 
47460Sstevel@tonic-gate 	switch (attr) {
47470Sstevel@tonic-gate 	case ZONE_ATTR_ROOT:
47480Sstevel@tonic-gate 		if (global) {
47490Sstevel@tonic-gate 			/*
47500Sstevel@tonic-gate 			 * Copy the path to trim the trailing "/" (except for
47510Sstevel@tonic-gate 			 * the global zone).
47520Sstevel@tonic-gate 			 */
47530Sstevel@tonic-gate 			if (zone != global_zone)
47540Sstevel@tonic-gate 				size = zone->zone_rootpathlen - 1;
47550Sstevel@tonic-gate 			else
47560Sstevel@tonic-gate 				size = zone->zone_rootpathlen;
47570Sstevel@tonic-gate 			zonepath = kmem_alloc(size, KM_SLEEP);
47580Sstevel@tonic-gate 			bcopy(zone->zone_rootpath, zonepath, size);
47590Sstevel@tonic-gate 			zonepath[size - 1] = '\0';
47600Sstevel@tonic-gate 		} else {
47613792Sakolb 			if (inzone || !is_system_labeled()) {
47621676Sjpk 				/*
47631676Sjpk 				 * Caller is not in the global zone.
47641676Sjpk 				 * if the query is on the current zone
47651676Sjpk 				 * or the system is not labeled,
47661676Sjpk 				 * just return faked-up path for current zone.
47671676Sjpk 				 */
47681676Sjpk 				zonepath = "/";
47691676Sjpk 				size = 2;
47701676Sjpk 			} else {
47711676Sjpk 				/*
47721676Sjpk 				 * Return related path for current zone.
47731676Sjpk 				 */
47741676Sjpk 				int prefix_len = strlen(zone_prefix);
47751676Sjpk 				int zname_len = strlen(zone->zone_name);
47761676Sjpk 
47771676Sjpk 				size = prefix_len + zname_len + 1;
47781676Sjpk 				zonepath = kmem_alloc(size, KM_SLEEP);
47791676Sjpk 				bcopy(zone_prefix, zonepath, prefix_len);
47801676Sjpk 				bcopy(zone->zone_name, zonepath +
47812267Sdp 				    prefix_len, zname_len);
47821676Sjpk 				zonepath[size - 1] = '\0';
47831676Sjpk 			}
47840Sstevel@tonic-gate 		}
47850Sstevel@tonic-gate 		if (bufsize > size)
47860Sstevel@tonic-gate 			bufsize = size;
47870Sstevel@tonic-gate 		if (buf != NULL) {
47880Sstevel@tonic-gate 			err = copyoutstr(zonepath, buf, bufsize, NULL);
47890Sstevel@tonic-gate 			if (err != 0 && err != ENAMETOOLONG)
47900Sstevel@tonic-gate 				error = EFAULT;
47910Sstevel@tonic-gate 		}
47923792Sakolb 		if (global || (is_system_labeled() && !inzone))
47930Sstevel@tonic-gate 			kmem_free(zonepath, size);
47940Sstevel@tonic-gate 		break;
47950Sstevel@tonic-gate 
47960Sstevel@tonic-gate 	case ZONE_ATTR_NAME:
47970Sstevel@tonic-gate 		size = strlen(zone->zone_name) + 1;
47980Sstevel@tonic-gate 		if (bufsize > size)
47990Sstevel@tonic-gate 			bufsize = size;
48000Sstevel@tonic-gate 		if (buf != NULL) {
48010Sstevel@tonic-gate 			err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
48020Sstevel@tonic-gate 			if (err != 0 && err != ENAMETOOLONG)
48030Sstevel@tonic-gate 				error = EFAULT;
48040Sstevel@tonic-gate 		}
48050Sstevel@tonic-gate 		break;
48060Sstevel@tonic-gate 
48070Sstevel@tonic-gate 	case ZONE_ATTR_STATUS:
48080Sstevel@tonic-gate 		/*
48090Sstevel@tonic-gate 		 * Since we're not holding zonehash_lock, the zone status
48100Sstevel@tonic-gate 		 * may be anything; leave it up to userland to sort it out.
48110Sstevel@tonic-gate 		 */
48120Sstevel@tonic-gate 		size = sizeof (zone_status);
48130Sstevel@tonic-gate 		if (bufsize > size)
48140Sstevel@tonic-gate 			bufsize = size;
48150Sstevel@tonic-gate 		zone_status = zone_status_get(zone);
48160Sstevel@tonic-gate 		if (buf != NULL &&
48170Sstevel@tonic-gate 		    copyout(&zone_status, buf, bufsize) != 0)
48180Sstevel@tonic-gate 			error = EFAULT;
48190Sstevel@tonic-gate 		break;
48203448Sdh155122 	case ZONE_ATTR_FLAGS:
48213448Sdh155122 		size = sizeof (zone->zone_flags);
48223448Sdh155122 		if (bufsize > size)
48233448Sdh155122 			bufsize = size;
48243448Sdh155122 		flags = zone->zone_flags;
48253448Sdh155122 		if (buf != NULL &&
48263448Sdh155122 		    copyout(&flags, buf, bufsize) != 0)
48273448Sdh155122 			error = EFAULT;
48283448Sdh155122 		break;
48290Sstevel@tonic-gate 	case ZONE_ATTR_PRIVSET:
48300Sstevel@tonic-gate 		size = sizeof (priv_set_t);
48310Sstevel@tonic-gate 		if (bufsize > size)
48320Sstevel@tonic-gate 			bufsize = size;
48330Sstevel@tonic-gate 		if (buf != NULL &&
48340Sstevel@tonic-gate 		    copyout(zone->zone_privset, buf, bufsize) != 0)
48350Sstevel@tonic-gate 			error = EFAULT;
48360Sstevel@tonic-gate 		break;
48370Sstevel@tonic-gate 	case ZONE_ATTR_UNIQID:
48380Sstevel@tonic-gate 		size = sizeof (zone->zone_uniqid);
48390Sstevel@tonic-gate 		if (bufsize > size)
48400Sstevel@tonic-gate 			bufsize = size;
48410Sstevel@tonic-gate 		if (buf != NULL &&
48420Sstevel@tonic-gate 		    copyout(&zone->zone_uniqid, buf, bufsize) != 0)
48430Sstevel@tonic-gate 			error = EFAULT;
48440Sstevel@tonic-gate 		break;
48450Sstevel@tonic-gate 	case ZONE_ATTR_POOLID:
48460Sstevel@tonic-gate 		{
48470Sstevel@tonic-gate 			pool_t *pool;
48480Sstevel@tonic-gate 			poolid_t poolid;
48490Sstevel@tonic-gate 
48500Sstevel@tonic-gate 			if (pool_lock_intr() != 0) {
48510Sstevel@tonic-gate 				error = EINTR;
48520Sstevel@tonic-gate 				break;
48530Sstevel@tonic-gate 			}
48540Sstevel@tonic-gate 			pool = zone_pool_get(zone);
48550Sstevel@tonic-gate 			poolid = pool->pool_id;
48560Sstevel@tonic-gate 			pool_unlock();
48570Sstevel@tonic-gate 			size = sizeof (poolid);
48580Sstevel@tonic-gate 			if (bufsize > size)
48590Sstevel@tonic-gate 				bufsize = size;
48600Sstevel@tonic-gate 			if (buf != NULL && copyout(&poolid, buf, size) != 0)
48610Sstevel@tonic-gate 				error = EFAULT;
48620Sstevel@tonic-gate 		}
48630Sstevel@tonic-gate 		break;
48641676Sjpk 	case ZONE_ATTR_SLBL:
48651676Sjpk 		size = sizeof (bslabel_t);
48661676Sjpk 		if (bufsize > size)
48671676Sjpk 			bufsize = size;
48681676Sjpk 		if (zone->zone_slabel == NULL)
48691676Sjpk 			error = EINVAL;
48701676Sjpk 		else if (buf != NULL &&
48711676Sjpk 		    copyout(label2bslabel(zone->zone_slabel), buf,
48721676Sjpk 		    bufsize) != 0)
48731676Sjpk 			error = EFAULT;
48741676Sjpk 		break;
48750Sstevel@tonic-gate 	case ZONE_ATTR_INITPID:
48760Sstevel@tonic-gate 		size = sizeof (initpid);
48770Sstevel@tonic-gate 		if (bufsize > size)
48780Sstevel@tonic-gate 			bufsize = size;
48790Sstevel@tonic-gate 		initpid = zone->zone_proc_initpid;
48800Sstevel@tonic-gate 		if (initpid == -1) {
48810Sstevel@tonic-gate 			error = ESRCH;
48820Sstevel@tonic-gate 			break;
48830Sstevel@tonic-gate 		}
48840Sstevel@tonic-gate 		if (buf != NULL &&
48850Sstevel@tonic-gate 		    copyout(&initpid, buf, bufsize) != 0)
48860Sstevel@tonic-gate 			error = EFAULT;
48870Sstevel@tonic-gate 		break;
48882712Snn35248 	case ZONE_ATTR_BRAND:
48892712Snn35248 		size = strlen(zone->zone_brand->b_name) + 1;
48902712Snn35248 
48912712Snn35248 		if (bufsize > size)
48922712Snn35248 			bufsize = size;
48932712Snn35248 		if (buf != NULL) {
48942712Snn35248 			err = copyoutstr(zone->zone_brand->b_name, buf,
48952712Snn35248 			    bufsize, NULL);
48962712Snn35248 			if (err != 0 && err != ENAMETOOLONG)
48972712Snn35248 				error = EFAULT;
48982712Snn35248 		}
48992712Snn35248 		break;
49002267Sdp 	case ZONE_ATTR_INITNAME:
49012267Sdp 		size = strlen(zone->zone_initname) + 1;
49022267Sdp 		if (bufsize > size)
49032267Sdp 			bufsize = size;
49042267Sdp 		if (buf != NULL) {
49052267Sdp 			err = copyoutstr(zone->zone_initname, buf, bufsize,
49062267Sdp 			    NULL);
49072267Sdp 			if (err != 0 && err != ENAMETOOLONG)
49082267Sdp 				error = EFAULT;
49092267Sdp 		}
49102267Sdp 		break;
49112267Sdp 	case ZONE_ATTR_BOOTARGS:
49122267Sdp 		if (zone->zone_bootargs == NULL)
49132267Sdp 			outstr = "";
49142267Sdp 		else
49152267Sdp 			outstr = zone->zone_bootargs;
49162267Sdp 		size = strlen(outstr) + 1;
49172267Sdp 		if (bufsize > size)
49182267Sdp 			bufsize = size;
49192267Sdp 		if (buf != NULL) {
49202267Sdp 			err = copyoutstr(outstr, buf, bufsize, NULL);
49212267Sdp 			if (err != 0 && err != ENAMETOOLONG)
49222267Sdp 				error = EFAULT;
49232267Sdp 		}
49242267Sdp 		break;
49253247Sgjelinek 	case ZONE_ATTR_PHYS_MCAP:
49263247Sgjelinek 		size = sizeof (zone->zone_phys_mcap);
49273247Sgjelinek 		if (bufsize > size)
49283247Sgjelinek 			bufsize = size;
49293247Sgjelinek 		if (buf != NULL &&
49303247Sgjelinek 		    copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
49313247Sgjelinek 			error = EFAULT;
49323247Sgjelinek 		break;
49333247Sgjelinek 	case ZONE_ATTR_SCHED_CLASS:
49343247Sgjelinek 		mutex_enter(&class_lock);
49353247Sgjelinek 
49363247Sgjelinek 		if (zone->zone_defaultcid >= loaded_classes)
49373247Sgjelinek 			outstr = "";
49383247Sgjelinek 		else
49393247Sgjelinek 			outstr = sclass[zone->zone_defaultcid].cl_name;
49403247Sgjelinek 		size = strlen(outstr) + 1;
49413247Sgjelinek 		if (bufsize > size)
49423247Sgjelinek 			bufsize = size;
49433247Sgjelinek 		if (buf != NULL) {
49443247Sgjelinek 			err = copyoutstr(outstr, buf, bufsize, NULL);
49453247Sgjelinek 			if (err != 0 && err != ENAMETOOLONG)
49463247Sgjelinek 				error = EFAULT;
49473247Sgjelinek 		}
49483247Sgjelinek 
49493247Sgjelinek 		mutex_exit(&class_lock);
49503247Sgjelinek 		break;
49518662SJordan.Vaughan@Sun.com 	case ZONE_ATTR_HOSTID:
49528662SJordan.Vaughan@Sun.com 		if (zone->zone_hostid != HW_INVALID_HOSTID &&
49538662SJordan.Vaughan@Sun.com 		    bufsize == sizeof (zone->zone_hostid)) {
49548662SJordan.Vaughan@Sun.com 			size = sizeof (zone->zone_hostid);
49558662SJordan.Vaughan@Sun.com 			if (buf != NULL && copyout(&zone->zone_hostid, buf,
49568662SJordan.Vaughan@Sun.com 			    bufsize) != 0)
49578662SJordan.Vaughan@Sun.com 				error = EFAULT;
49588662SJordan.Vaughan@Sun.com 		} else {
49598662SJordan.Vaughan@Sun.com 			error = EINVAL;
49608662SJordan.Vaughan@Sun.com 		}
49618662SJordan.Vaughan@Sun.com 		break;
496212633Sjohn.levon@sun.com 	case ZONE_ATTR_FS_ALLOWED:
496312633Sjohn.levon@sun.com 		if (zone->zone_fs_allowed == NULL)
496412633Sjohn.levon@sun.com 			outstr = "";
496512633Sjohn.levon@sun.com 		else
496612633Sjohn.levon@sun.com 			outstr = zone->zone_fs_allowed;
496712633Sjohn.levon@sun.com 		size = strlen(outstr) + 1;
496812633Sjohn.levon@sun.com 		if (bufsize > size)
496912633Sjohn.levon@sun.com 			bufsize = size;
497012633Sjohn.levon@sun.com 		if (buf != NULL) {
497112633Sjohn.levon@sun.com 			err = copyoutstr(outstr, buf, bufsize, NULL);
497212633Sjohn.levon@sun.com 			if (err != 0 && err != ENAMETOOLONG)
497312633Sjohn.levon@sun.com 				error = EFAULT;
497412633Sjohn.levon@sun.com 		}
497512633Sjohn.levon@sun.com 		break;
49760Sstevel@tonic-gate 	default:
49772712Snn35248 		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
49782712Snn35248 			size = bufsize;
49792712Snn35248 			error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
49802712Snn35248 		} else {
49812712Snn35248 			error = EINVAL;
49822712Snn35248 		}
49830Sstevel@tonic-gate 	}
49840Sstevel@tonic-gate 	zone_rele(zone);
49850Sstevel@tonic-gate 
49860Sstevel@tonic-gate 	if (error)
49870Sstevel@tonic-gate 		return (set_errno(error));
49880Sstevel@tonic-gate 	return ((ssize_t)size);
49890Sstevel@tonic-gate }
49900Sstevel@tonic-gate 
49910Sstevel@tonic-gate /*
49922267Sdp  * Systemcall entry point for zone_setattr(2).
49932267Sdp  */
49942267Sdp /*ARGSUSED*/
49952267Sdp static int
49962267Sdp zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
49972267Sdp {
49982267Sdp 	zone_t *zone;
49992267Sdp 	zone_status_t zone_status;
50002267Sdp 	int err;
50012267Sdp 
50022267Sdp 	if (secpolicy_zone_config(CRED()) != 0)
50032267Sdp 		return (set_errno(EPERM));
50042267Sdp 
50052267Sdp 	/*
50063247Sgjelinek 	 * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
50073247Sgjelinek 	 * global zone.
50082267Sdp 	 */
50093247Sgjelinek 	if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
50102267Sdp 		return (set_errno(EINVAL));
50112267Sdp 	}
50122267Sdp 
50132267Sdp 	mutex_enter(&zonehash_lock);
50142267Sdp 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
50152267Sdp 		mutex_exit(&zonehash_lock);
50162267Sdp 		return (set_errno(EINVAL));
50172267Sdp 	}
50182267Sdp 	zone_hold(zone);
50192267Sdp 	mutex_exit(&zonehash_lock);
50202267Sdp 
50213247Sgjelinek 	/*
50223247Sgjelinek 	 * At present most attributes can only be set on non-running,
50233247Sgjelinek 	 * non-global zones.
50243247Sgjelinek 	 */
50252267Sdp 	zone_status = zone_status_get(zone);
50263247Sgjelinek 	if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY)
50272267Sdp 		goto done;
50282267Sdp 
50292267Sdp 	switch (attr) {
50302267Sdp 	case ZONE_ATTR_INITNAME:
50312267Sdp 		err = zone_set_initname(zone, (const char *)buf);
50322267Sdp 		break;
50332267Sdp 	case ZONE_ATTR_BOOTARGS:
50342267Sdp 		err = zone_set_bootargs(zone, (const char *)buf);
50352267Sdp 		break;
50362712Snn35248 	case ZONE_ATTR_BRAND:
50374141Sedp 		err = zone_set_brand(zone, (const char *)buf);
50382712Snn35248 		break;
503912633Sjohn.levon@sun.com 	case ZONE_ATTR_FS_ALLOWED:
504012633Sjohn.levon@sun.com 		err = zone_set_fs_allowed(zone, (const char *)buf);
504112633Sjohn.levon@sun.com 		break;
50423247Sgjelinek 	case ZONE_ATTR_PHYS_MCAP:
50433247Sgjelinek 		err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
50443247Sgjelinek 		break;
50453247Sgjelinek 	case ZONE_ATTR_SCHED_CLASS:
50463247Sgjelinek 		err = zone_set_sched_class(zone, (const char *)buf);
50473247Sgjelinek 		break;
50488662SJordan.Vaughan@Sun.com 	case ZONE_ATTR_HOSTID:
50498662SJordan.Vaughan@Sun.com 		if (bufsize == sizeof (zone->zone_hostid)) {
50508662SJordan.Vaughan@Sun.com 			if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
50518662SJordan.Vaughan@Sun.com 				err = 0;
50528662SJordan.Vaughan@Sun.com 			else
50538662SJordan.Vaughan@Sun.com 				err = EFAULT;
50548662SJordan.Vaughan@Sun.com 		} else {
50558662SJordan.Vaughan@Sun.com 			err = EINVAL;
50568662SJordan.Vaughan@Sun.com 		}
50578662SJordan.Vaughan@Sun.com 		break;
50582267Sdp 	default:
50592712Snn35248 		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
50602712Snn35248 			err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
50612712Snn35248 		else
50622712Snn35248 			err = EINVAL;
50632267Sdp 	}
50642267Sdp 
50652267Sdp done:
50662267Sdp 	zone_rele(zone);
50672267Sdp 	return (err != 0 ? set_errno(err) : 0);
50682267Sdp }
50692267Sdp 
50702267Sdp /*
50710Sstevel@tonic-gate  * Return zero if the process has at least one vnode mapped in to its
50720Sstevel@tonic-gate  * address space which shouldn't be allowed to change zones.
50733247Sgjelinek  *
50743247Sgjelinek  * Also return zero if the process has any shared mappings which reserve
50753247Sgjelinek  * swap.  This is because the counting for zone.max-swap does not allow swap
50765331Samw  * reservation to be shared between zones.  zone swap reservation is counted
50773247Sgjelinek  * on zone->zone_max_swap.
50780Sstevel@tonic-gate  */
50790Sstevel@tonic-gate static int
50800Sstevel@tonic-gate as_can_change_zones(void)
50810Sstevel@tonic-gate {
50820Sstevel@tonic-gate 	proc_t *pp = curproc;
50830Sstevel@tonic-gate 	struct seg *seg;
50840Sstevel@tonic-gate 	struct as *as = pp->p_as;
50850Sstevel@tonic-gate 	vnode_t *vp;
50860Sstevel@tonic-gate 	int allow = 1;
50870Sstevel@tonic-gate 
50880Sstevel@tonic-gate 	ASSERT(pp->p_as != &kas);
50893247Sgjelinek 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
50900Sstevel@tonic-gate 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
50913247Sgjelinek 
50923247Sgjelinek 		/*
50933247Sgjelinek 		 * Cannot enter zone with shared anon memory which
50943247Sgjelinek 		 * reserves swap.  See comment above.
50953247Sgjelinek 		 */
50963247Sgjelinek 		if (seg_can_change_zones(seg) == B_FALSE) {
50973247Sgjelinek 			allow = 0;
50983247Sgjelinek 			break;
50993247Sgjelinek 		}
51000Sstevel@tonic-gate 		/*
51010Sstevel@tonic-gate 		 * if we can't get a backing vnode for this segment then skip
51020Sstevel@tonic-gate 		 * it.
51030Sstevel@tonic-gate 		 */
51040Sstevel@tonic-gate 		vp = NULL;
51050Sstevel@tonic-gate 		if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
51060Sstevel@tonic-gate 			continue;
51070Sstevel@tonic-gate 		if (!vn_can_change_zones(vp)) { /* bail on first match */
51080Sstevel@tonic-gate 			allow = 0;
51090Sstevel@tonic-gate 			break;
51100Sstevel@tonic-gate 		}
51110Sstevel@tonic-gate 	}
51123247Sgjelinek 	AS_LOCK_EXIT(as, &as->a_lock);
51130Sstevel@tonic-gate 	return (allow);
51140Sstevel@tonic-gate }
51150Sstevel@tonic-gate 
51160Sstevel@tonic-gate /*
51173247Sgjelinek  * Count swap reserved by curproc's address space
51183247Sgjelinek  */
51193247Sgjelinek static size_t
51203247Sgjelinek as_swresv(void)
51213247Sgjelinek {
51223247Sgjelinek 	proc_t *pp = curproc;
51233247Sgjelinek 	struct seg *seg;
51243247Sgjelinek 	struct as *as = pp->p_as;
51253247Sgjelinek 	size_t swap = 0;
51263247Sgjelinek 
51273247Sgjelinek 	ASSERT(pp->p_as != &kas);
51283247Sgjelinek 	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
51293247Sgjelinek 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
51303247Sgjelinek 		swap += seg_swresv(seg);
51313247Sgjelinek 
51323247Sgjelinek 	return (swap);
51333247Sgjelinek }
51343247Sgjelinek 
51353247Sgjelinek /*
51360Sstevel@tonic-gate  * Systemcall entry point for zone_enter().
51370Sstevel@tonic-gate  *
51380Sstevel@tonic-gate  * The current process is injected into said zone.  In the process
51390Sstevel@tonic-gate  * it will change its project membership, privileges, rootdir/cwd,
51400Sstevel@tonic-gate  * zone-wide rctls, and pool association to match those of the zone.
51410Sstevel@tonic-gate  *
51420Sstevel@tonic-gate  * The first zone_enter() called while the zone is in the ZONE_IS_READY
51430Sstevel@tonic-gate  * state will transition it to ZONE_IS_RUNNING.  Processes may only
51440Sstevel@tonic-gate  * enter a zone that is "ready" or "running".
51450Sstevel@tonic-gate  */
51460Sstevel@tonic-gate static int
51470Sstevel@tonic-gate zone_enter(zoneid_t zoneid)
51480Sstevel@tonic-gate {
51490Sstevel@tonic-gate 	zone_t *zone;
51500Sstevel@tonic-gate 	vnode_t *vp;
51510Sstevel@tonic-gate 	proc_t *pp = curproc;
51520Sstevel@tonic-gate 	contract_t *ct;
51530Sstevel@tonic-gate 	cont_process_t *ctp;
51540Sstevel@tonic-gate 	task_t *tk, *oldtk;
51550Sstevel@tonic-gate 	kproject_t *zone_proj0;
51560Sstevel@tonic-gate 	cred_t *cr, *newcr;
51570Sstevel@tonic-gate 	pool_t *oldpool, *newpool;
51580Sstevel@tonic-gate 	sess_t *sp;
51590Sstevel@tonic-gate 	uid_t uid;
51600Sstevel@tonic-gate 	zone_status_t status;
51610Sstevel@tonic-gate 	int err = 0;
51620Sstevel@tonic-gate 	rctl_entity_p_t e;
51633247Sgjelinek 	size_t swap;
51643792Sakolb 	kthread_id_t t;
51650Sstevel@tonic-gate 
51660Sstevel@tonic-gate 	if (secpolicy_zone_config(CRED()) != 0)
51670Sstevel@tonic-gate 		return (set_errno(EPERM));
51680Sstevel@tonic-gate 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
51690Sstevel@tonic-gate 		return (set_errno(EINVAL));
51700Sstevel@tonic-gate 
51710Sstevel@tonic-gate 	/*
51720Sstevel@tonic-gate 	 * Stop all lwps so we don't need to hold a lock to look at
51730Sstevel@tonic-gate 	 * curproc->p_zone.  This needs to happen before we grab any
51740Sstevel@tonic-gate 	 * locks to avoid deadlock (another lwp in the process could
51750Sstevel@tonic-gate 	 * be waiting for the held lock).
51760Sstevel@tonic-gate 	 */
51770Sstevel@tonic-gate 	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
51780Sstevel@tonic-gate 		return (set_errno(EINTR));
51790Sstevel@tonic-gate 
51800Sstevel@tonic-gate 	/*
51810Sstevel@tonic-gate 	 * Make sure we're not changing zones with files open or mapped in
51820Sstevel@tonic-gate 	 * to our address space which shouldn't be changing zones.
51830Sstevel@tonic-gate 	 */
51840Sstevel@tonic-gate 	if (!files_can_change_zones()) {
51850Sstevel@tonic-gate 		err = EBADF;
51860Sstevel@tonic-gate 		goto out;
51870Sstevel@tonic-gate 	}
51880Sstevel@tonic-gate 	if (!as_can_change_zones()) {
51890Sstevel@tonic-gate 		err = EFAULT;
51900Sstevel@tonic-gate 		goto out;
51910Sstevel@tonic-gate 	}
51920Sstevel@tonic-gate 
51930Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
51940Sstevel@tonic-gate 	if (pp->p_zone != global_zone) {
51950Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
51960Sstevel@tonic-gate 		err = EINVAL;
51970Sstevel@tonic-gate 		goto out;
51980Sstevel@tonic-gate 	}
51990Sstevel@tonic-gate 
52000Sstevel@tonic-gate 	zone = zone_find_all_by_id(zoneid);
52010Sstevel@tonic-gate 	if (zone == NULL) {
52020Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
52030Sstevel@tonic-gate 		err = EINVAL;
52040Sstevel@tonic-gate 		goto out;
52050Sstevel@tonic-gate 	}
52060Sstevel@tonic-gate 
52070Sstevel@tonic-gate 	/*
52080Sstevel@tonic-gate 	 * To prevent processes in a zone from holding contracts on
52090Sstevel@tonic-gate 	 * extrazonal resources, and to avoid process contract
52100Sstevel@tonic-gate 	 * memberships which span zones, contract holders and processes
52110Sstevel@tonic-gate 	 * which aren't the sole members of their encapsulating process
52120Sstevel@tonic-gate 	 * contracts are not allowed to zone_enter.
52130Sstevel@tonic-gate 	 */
52140Sstevel@tonic-gate 	ctp = pp->p_ct_process;
52150Sstevel@tonic-gate 	ct = &ctp->conp_contract;
52160Sstevel@tonic-gate 	mutex_enter(&ct->ct_lock);
52170Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
52180Sstevel@tonic-gate 	if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
52190Sstevel@tonic-gate 		mutex_exit(&pp->p_lock);
52200Sstevel@tonic-gate 		mutex_exit(&ct->ct_lock);
52210Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
52220Sstevel@tonic-gate 		err = EINVAL;
52230Sstevel@tonic-gate 		goto out;
52240Sstevel@tonic-gate 	}
52250Sstevel@tonic-gate 
52260Sstevel@tonic-gate 	/*
52270Sstevel@tonic-gate 	 * Moreover, we don't allow processes whose encapsulating
52280Sstevel@tonic-gate 	 * process contracts have inherited extrazonal contracts.
52290Sstevel@tonic-gate 	 * While it would be easier to eliminate all process contracts
52300Sstevel@tonic-gate 	 * with inherited contracts, we need to be able to give a
52310Sstevel@tonic-gate 	 * restarted init (or other zone-penetrating process) its
52320Sstevel@tonic-gate 	 * predecessor's contracts.
52330Sstevel@tonic-gate 	 */
52340Sstevel@tonic-gate 	if (ctp->conp_ninherited != 0) {
52350Sstevel@tonic-gate 		contract_t *next;
52360Sstevel@tonic-gate 		for (next = list_head(&ctp->conp_inherited); next;
52370Sstevel@tonic-gate 		    next = list_next(&ctp->conp_inherited, next)) {
52380Sstevel@tonic-gate 			if (contract_getzuniqid(next) != zone->zone_uniqid) {
52390Sstevel@tonic-gate 				mutex_exit(&pp->p_lock);
52400Sstevel@tonic-gate 				mutex_exit(&ct->ct_lock);
52410Sstevel@tonic-gate 				mutex_exit(&zonehash_lock);
52420Sstevel@tonic-gate 				err = EINVAL;
52430Sstevel@tonic-gate 				goto out;
52440Sstevel@tonic-gate 			}
52450Sstevel@tonic-gate 		}
52460Sstevel@tonic-gate 	}
52476073Sacruz 
52480Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
52490Sstevel@tonic-gate 	mutex_exit(&ct->ct_lock);
52500Sstevel@tonic-gate 
52510Sstevel@tonic-gate 	status = zone_status_get(zone);
52520Sstevel@tonic-gate 	if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
52530Sstevel@tonic-gate 		/*
52540Sstevel@tonic-gate 		 * Can't join
52550Sstevel@tonic-gate 		 */
52560Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
52570Sstevel@tonic-gate 		err = EINVAL;
52580Sstevel@tonic-gate 		goto out;
52590Sstevel@tonic-gate 	}
52600Sstevel@tonic-gate 
52610Sstevel@tonic-gate 	/*
52620Sstevel@tonic-gate 	 * Make sure new priv set is within the permitted set for caller
52630Sstevel@tonic-gate 	 */
52640Sstevel@tonic-gate 	if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
52650Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
52660Sstevel@tonic-gate 		err = EPERM;
52670Sstevel@tonic-gate 		goto out;
52680Sstevel@tonic-gate 	}
52690Sstevel@tonic-gate 	/*
52700Sstevel@tonic-gate 	 * We want to momentarily drop zonehash_lock while we optimistically
52710Sstevel@tonic-gate 	 * bind curproc to the pool it should be running in.  This is safe
52720Sstevel@tonic-gate 	 * since the zone can't disappear (we have a hold on it).
52730Sstevel@tonic-gate 	 */
52740Sstevel@tonic-gate 	zone_hold(zone);
52750Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
52760Sstevel@tonic-gate 
52770Sstevel@tonic-gate 	/*
52780Sstevel@tonic-gate 	 * Grab pool_lock to keep the pools configuration from changing
52790Sstevel@tonic-gate 	 * and to stop ourselves from getting rebound to another pool
52800Sstevel@tonic-gate 	 * until we join the zone.
52810Sstevel@tonic-gate 	 */
52820Sstevel@tonic-gate 	if (pool_lock_intr() != 0) {
52830Sstevel@tonic-gate 		zone_rele(zone);
52840Sstevel@tonic-gate 		err = EINTR;
52850Sstevel@tonic-gate 		goto out;
52860Sstevel@tonic-gate 	}
52870Sstevel@tonic-gate 	ASSERT(secpolicy_pool(CRED()) == 0);
52880Sstevel@tonic-gate 	/*
52890Sstevel@tonic-gate 	 * Bind ourselves to the pool currently associated with the zone.
52900Sstevel@tonic-gate 	 */
52910Sstevel@tonic-gate 	oldpool = curproc->p_pool;
52920Sstevel@tonic-gate 	newpool = zone_pool_get(zone);
52930Sstevel@tonic-gate 	if (pool_state == POOL_ENABLED && newpool != oldpool &&
52940Sstevel@tonic-gate 	    (err = pool_do_bind(newpool, P_PID, P_MYID,
52950Sstevel@tonic-gate 	    POOL_BIND_ALL)) != 0) {
52960Sstevel@tonic-gate 		pool_unlock();
52970Sstevel@tonic-gate 		zone_rele(zone);
52980Sstevel@tonic-gate 		goto out;
52990Sstevel@tonic-gate 	}
53000Sstevel@tonic-gate 
53010Sstevel@tonic-gate 	/*
53020Sstevel@tonic-gate 	 * Grab cpu_lock now; we'll need it later when we call
53030Sstevel@tonic-gate 	 * task_join().
53040Sstevel@tonic-gate 	 */
53050Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
53060Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
53070Sstevel@tonic-gate 	/*
53080Sstevel@tonic-gate 	 * Make sure the zone hasn't moved on since we dropped zonehash_lock.
53090Sstevel@tonic-gate 	 */
53100Sstevel@tonic-gate 	if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
53110Sstevel@tonic-gate 		/*
53120Sstevel@tonic-gate 		 * Can't join anymore.
53130Sstevel@tonic-gate 		 */
53140Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
53150Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
53160Sstevel@tonic-gate 		if (pool_state == POOL_ENABLED &&
53170Sstevel@tonic-gate 		    newpool != oldpool)
53180Sstevel@tonic-gate 			(void) pool_do_bind(oldpool, P_PID, P_MYID,
53190Sstevel@tonic-gate 			    POOL_BIND_ALL);
53200Sstevel@tonic-gate 		pool_unlock();
53210Sstevel@tonic-gate 		zone_rele(zone);
53220Sstevel@tonic-gate 		err = EINVAL;
53230Sstevel@tonic-gate 		goto out;
53240Sstevel@tonic-gate 	}
53250Sstevel@tonic-gate 
53263247Sgjelinek 	/*
53273247Sgjelinek 	 * a_lock must be held while transfering locked memory and swap
53283247Sgjelinek 	 * reservation from the global zone to the non global zone because
53293247Sgjelinek 	 * asynchronous faults on the processes' address space can lock
53303247Sgjelinek 	 * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
53313247Sgjelinek 	 * segments respectively.
53323247Sgjelinek 	 */
53333247Sgjelinek 	AS_LOCK_ENTER(pp->as, &pp->p_as->a_lock, RW_WRITER);
53343247Sgjelinek 	swap = as_swresv();
53350Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
53360Sstevel@tonic-gate 	zone_proj0 = zone->zone_zsched->p_task->tk_proj;
53370Sstevel@tonic-gate 	/* verify that we do not exceed and task or lwp limits */
53380Sstevel@tonic-gate 	mutex_enter(&zone->zone_nlwps_lock);
53390Sstevel@tonic-gate 	/* add new lwps to zone and zone's proj0 */
53400Sstevel@tonic-gate 	zone_proj0->kpj_nlwps += pp->p_lwpcnt;
53410Sstevel@tonic-gate 	zone->zone_nlwps += pp->p_lwpcnt;
53420Sstevel@tonic-gate 	/* add 1 task to zone's proj0 */
53430Sstevel@tonic-gate 	zone_proj0->kpj_ntasks += 1;
5344*12725SMenno.Lageman@Sun.COM 
5345*12725SMenno.Lageman@Sun.COM 	zone_proj0->kpj_nprocs++;
5346*12725SMenno.Lageman@Sun.COM 	zone->zone_nprocs++;
53470Sstevel@tonic-gate 	mutex_exit(&zone->zone_nlwps_lock);
53480Sstevel@tonic-gate 
53493247Sgjelinek 	mutex_enter(&zone->zone_mem_lock);
53502768Ssl108498 	zone->zone_locked_mem += pp->p_locked_mem;
53512768Ssl108498 	zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
53523247Sgjelinek 	zone->zone_max_swap += swap;
53533247Sgjelinek 	mutex_exit(&zone->zone_mem_lock);
53542768Ssl108498 
53553916Skrishna 	mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
53563916Skrishna 	zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
53573916Skrishna 	mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
53583916Skrishna 
5359*12725SMenno.Lageman@Sun.COM 	/* remove lwps and process from proc's old zone and old project */
53600Sstevel@tonic-gate 	mutex_enter(&pp->p_zone->zone_nlwps_lock);
53610Sstevel@tonic-gate 	pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
53620Sstevel@tonic-gate 	pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
5363*12725SMenno.Lageman@Sun.COM 	pp->p_task->tk_proj->kpj_nprocs--;
5364*12725SMenno.Lageman@Sun.COM 	pp->p_zone->zone_nprocs--;
53650Sstevel@tonic-gate 	mutex_exit(&pp->p_zone->zone_nlwps_lock);
53660Sstevel@tonic-gate 
53673247Sgjelinek 	mutex_enter(&pp->p_zone->zone_mem_lock);
53682768Ssl108498 	pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
53692768Ssl108498 	pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
53703247Sgjelinek 	pp->p_zone->zone_max_swap -= swap;
53713247Sgjelinek 	mutex_exit(&pp->p_zone->zone_mem_lock);
53722768Ssl108498 
53733916Skrishna 	mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
53743916Skrishna 	pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
53753916Skrishna 	mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
53763916Skrishna 
53779121SVamsi.Krishna@Sun.COM 	pp->p_flag |= SZONETOP;
53789121SVamsi.Krishna@Sun.COM 	pp->p_zone = zone;
53792768Ssl108498 	mutex_exit(&pp->p_lock);
53803247Sgjelinek 	AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock);
53812768Ssl108498 
53820Sstevel@tonic-gate 	/*
53830Sstevel@tonic-gate 	 * Joining the zone cannot fail from now on.
53840Sstevel@tonic-gate 	 *
53850Sstevel@tonic-gate 	 * This means that a lot of the following code can be commonized and
53860Sstevel@tonic-gate 	 * shared with zsched().
53870Sstevel@tonic-gate 	 */
53880Sstevel@tonic-gate 
53890Sstevel@tonic-gate 	/*
53906073Sacruz 	 * If the process contract fmri was inherited, we need to
53916073Sacruz 	 * flag this so that any contract status will not leak
53926073Sacruz 	 * extra zone information, svc_fmri in this case
53936073Sacruz 	 */
53946073Sacruz 	if (ctp->conp_svc_ctid != ct->ct_id) {
53956073Sacruz 		mutex_enter(&ct->ct_lock);
53966073Sacruz 		ctp->conp_svc_zone_enter = ct->ct_id;
53976073Sacruz 		mutex_exit(&ct->ct_lock);
53986073Sacruz 	}
53996073Sacruz 
54006073Sacruz 	/*
54010Sstevel@tonic-gate 	 * Reset the encapsulating process contract's zone.
54020Sstevel@tonic-gate 	 */
54030Sstevel@tonic-gate 	ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
54040Sstevel@tonic-gate 	contract_setzuniqid(ct, zone->zone_uniqid);
54050Sstevel@tonic-gate 
54060Sstevel@tonic-gate 	/*
54070Sstevel@tonic-gate 	 * Create a new task and associate the process with the project keyed
54080Sstevel@tonic-gate 	 * by (projid,zoneid).
54090Sstevel@tonic-gate 	 *
54100Sstevel@tonic-gate 	 * We might as well be in project 0; the global zone's projid doesn't
54110Sstevel@tonic-gate 	 * make much sense in a zone anyhow.
54120Sstevel@tonic-gate 	 *
54130Sstevel@tonic-gate 	 * This also increments zone_ntasks, and returns with p_lock held.
54140Sstevel@tonic-gate 	 */
54150Sstevel@tonic-gate 	tk = task_create(0, zone);
54160Sstevel@tonic-gate 	oldtk = task_join(tk, 0);
54170Sstevel@tonic-gate 	mutex_exit(&cpu_lock);
54180Sstevel@tonic-gate 
54190Sstevel@tonic-gate 	/*
54200Sstevel@tonic-gate 	 * call RCTLOP_SET functions on this proc
54210Sstevel@tonic-gate 	 */
54220Sstevel@tonic-gate 	e.rcep_p.zone = zone;
54230Sstevel@tonic-gate 	e.rcep_t = RCENTITY_ZONE;
54240Sstevel@tonic-gate 	(void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
54250Sstevel@tonic-gate 	    RCD_CALLBACK);
54260Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
54270Sstevel@tonic-gate 
54280Sstevel@tonic-gate 	/*
54290Sstevel@tonic-gate 	 * We don't need to hold any of zsched's locks here; not only do we know
54300Sstevel@tonic-gate 	 * the process and zone aren't going away, we know its session isn't
54310Sstevel@tonic-gate 	 * changing either.
54320Sstevel@tonic-gate 	 *
54330Sstevel@tonic-gate 	 * By joining zsched's session here, we mimic the behavior in the
54340Sstevel@tonic-gate 	 * global zone of init's sid being the pid of sched.  We extend this
54350Sstevel@tonic-gate 	 * to all zlogin-like zone_enter()'ing processes as well.
54360Sstevel@tonic-gate 	 */
54370Sstevel@tonic-gate 	mutex_enter(&pidlock);
54380Sstevel@tonic-gate 	sp = zone->zone_zsched->p_sessp;
54392712Snn35248 	sess_hold(zone->zone_zsched);
54400Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
54410Sstevel@tonic-gate 	pgexit(pp);
54422712Snn35248 	sess_rele(pp->p_sessp, B_TRUE);
54430Sstevel@tonic-gate 	pp->p_sessp = sp;
54440Sstevel@tonic-gate 	pgjoin(pp, zone->zone_zsched->p_pidp);
54453247Sgjelinek 
54463247Sgjelinek 	/*
54473792Sakolb 	 * If any threads are scheduled to be placed on zone wait queue they
54483792Sakolb 	 * should abandon the idea since the wait queue is changing.
54493792Sakolb 	 * We need to be holding pidlock & p_lock to do this.
54503792Sakolb 	 */
54513792Sakolb 	if ((t = pp->p_tlist) != NULL) {
54523792Sakolb 		do {
54533792Sakolb 			thread_lock(t);
54543792Sakolb 			/*
54553792Sakolb 			 * Kick this thread so that he doesn't sit
54563792Sakolb 			 * on a wrong wait queue.
54573792Sakolb 			 */
54583792Sakolb 			if (ISWAITING(t))
54593792Sakolb 				setrun_locked(t);
54603792Sakolb 
54613792Sakolb 			if (t->t_schedflag & TS_ANYWAITQ)
54623792Sakolb 				t->t_schedflag &= ~ TS_ANYWAITQ;
54633792Sakolb 
54643792Sakolb 			thread_unlock(t);
54653792Sakolb 		} while ((t = t->t_forw) != pp->p_tlist);
54663792Sakolb 	}
54673792Sakolb 
54683792Sakolb 	/*
54693247Sgjelinek 	 * If there is a default scheduling class for the zone and it is not
54703247Sgjelinek 	 * the class we are currently in, change all of the threads in the
54713247Sgjelinek 	 * process to the new class.  We need to be holding pidlock & p_lock
54723247Sgjelinek 	 * when we call parmsset so this is a good place to do it.
54733247Sgjelinek 	 */
54743247Sgjelinek 	if (zone->zone_defaultcid > 0 &&
54753247Sgjelinek 	    zone->zone_defaultcid != curthread->t_cid) {
54763247Sgjelinek 		pcparms_t pcparms;
54773247Sgjelinek 
54783247Sgjelinek 		pcparms.pc_cid = zone->zone_defaultcid;
54793247Sgjelinek 		pcparms.pc_clparms[0] = 0;
54803247Sgjelinek 
54813247Sgjelinek 		/*
54823247Sgjelinek 		 * If setting the class fails, we still want to enter the zone.
54833247Sgjelinek 		 */
54843247Sgjelinek 		if ((t = pp->p_tlist) != NULL) {
54853247Sgjelinek 			do {
54863247Sgjelinek 				(void) parmsset(&pcparms, t);
54873247Sgjelinek 			} while ((t = t->t_forw) != pp->p_tlist);
54883247Sgjelinek 		}
54893247Sgjelinek 	}
54903247Sgjelinek 
54910Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
54920Sstevel@tonic-gate 	mutex_exit(&pidlock);
54930Sstevel@tonic-gate 
54940Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
54950Sstevel@tonic-gate 	/*
54960Sstevel@tonic-gate 	 * We're firmly in the zone; let pools progress.
54970Sstevel@tonic-gate 	 */
54980Sstevel@tonic-gate 	pool_unlock();
54990Sstevel@tonic-gate 	task_rele(oldtk);
55000Sstevel@tonic-gate 	/*
55010Sstevel@tonic-gate 	 * We don't need to retain a hold on the zone since we already
55020Sstevel@tonic-gate 	 * incremented zone_ntasks, so the zone isn't going anywhere.
55030Sstevel@tonic-gate 	 */
55040Sstevel@tonic-gate 	zone_rele(zone);
55050Sstevel@tonic-gate 
55060Sstevel@tonic-gate 	/*
55070Sstevel@tonic-gate 	 * Chroot
55080Sstevel@tonic-gate 	 */
55090Sstevel@tonic-gate 	vp = zone->zone_rootvp;
55100Sstevel@tonic-gate 	zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
55110Sstevel@tonic-gate 	zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
55120Sstevel@tonic-gate 
55130Sstevel@tonic-gate 	/*
55140Sstevel@tonic-gate 	 * Change process credentials
55150Sstevel@tonic-gate 	 */
55160Sstevel@tonic-gate 	newcr = cralloc();
55170Sstevel@tonic-gate 	mutex_enter(&pp->p_crlock);
55180Sstevel@tonic-gate 	cr = pp->p_cred;
55190Sstevel@tonic-gate 	crcopy_to(cr, newcr);
55200Sstevel@tonic-gate 	crsetzone(newcr, zone);
55210Sstevel@tonic-gate 	pp->p_cred = newcr;
55220Sstevel@tonic-gate 
55230Sstevel@tonic-gate 	/*
55240Sstevel@tonic-gate 	 * Restrict all process privilege sets to zone limit
55250Sstevel@tonic-gate 	 */
55260Sstevel@tonic-gate 	priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
55270Sstevel@tonic-gate 	priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
55280Sstevel@tonic-gate 	priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
55290Sstevel@tonic-gate 	priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
55300Sstevel@tonic-gate 	mutex_exit(&pp->p_crlock);
55310Sstevel@tonic-gate 	crset(pp, newcr);
55320Sstevel@tonic-gate 
55330Sstevel@tonic-gate 	/*
55340Sstevel@tonic-gate 	 * Adjust upcount to reflect zone entry.
55350Sstevel@tonic-gate 	 */
55360Sstevel@tonic-gate 	uid = crgetruid(newcr);
55370Sstevel@tonic-gate 	mutex_enter(&pidlock);
55380Sstevel@tonic-gate 	upcount_dec(uid, GLOBAL_ZONEID);
55390Sstevel@tonic-gate 	upcount_inc(uid, zoneid);
55400Sstevel@tonic-gate 	mutex_exit(&pidlock);
55410Sstevel@tonic-gate 
55420Sstevel@tonic-gate 	/*
55430Sstevel@tonic-gate 	 * Set up core file path and content.
55440Sstevel@tonic-gate 	 */
55450Sstevel@tonic-gate 	set_core_defaults();
55460Sstevel@tonic-gate 
55470Sstevel@tonic-gate out:
55480Sstevel@tonic-gate 	/*
55490Sstevel@tonic-gate 	 * Let the other lwps continue.
55500Sstevel@tonic-gate 	 */
55510Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
55520Sstevel@tonic-gate 	if (curthread != pp->p_agenttp)
55530Sstevel@tonic-gate 		continuelwps(pp);
55540Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
55550Sstevel@tonic-gate 
55560Sstevel@tonic-gate 	return (err != 0 ? set_errno(err) : 0);
55570Sstevel@tonic-gate }
55580Sstevel@tonic-gate 
55590Sstevel@tonic-gate /*
55600Sstevel@tonic-gate  * Systemcall entry point for zone_list(2).
55610Sstevel@tonic-gate  *
55620Sstevel@tonic-gate  * Processes running in a (non-global) zone only see themselves.
55631676Sjpk  * On labeled systems, they see all zones whose label they dominate.
55640Sstevel@tonic-gate  */
55650Sstevel@tonic-gate static int
55660Sstevel@tonic-gate zone_list(zoneid_t *zoneidlist, uint_t *numzones)
55670Sstevel@tonic-gate {
55680Sstevel@tonic-gate 	zoneid_t *zoneids;
55691769Scarlsonj 	zone_t *zone, *myzone;
55700Sstevel@tonic-gate 	uint_t user_nzones, real_nzones;
55711676Sjpk 	uint_t domi_nzones;
55721676Sjpk 	int error;
55730Sstevel@tonic-gate 
55740Sstevel@tonic-gate 	if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
55750Sstevel@tonic-gate 		return (set_errno(EFAULT));
55760Sstevel@tonic-gate 
55771769Scarlsonj 	myzone = curproc->p_zone;
55781769Scarlsonj 	if (myzone != global_zone) {
55791676Sjpk 		bslabel_t *mybslab;
55801676Sjpk 
55811676Sjpk 		if (!is_system_labeled()) {
55821676Sjpk 			/* just return current zone */
55831676Sjpk 			real_nzones = domi_nzones = 1;
55841676Sjpk 			zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
55851769Scarlsonj 			zoneids[0] = myzone->zone_id;
55861676Sjpk 		} else {
55871676Sjpk 			/* return all zones that are dominated */
55881676Sjpk 			mutex_enter(&zonehash_lock);
55891676Sjpk 			real_nzones = zonecount;
55901676Sjpk 			domi_nzones = 0;
55911676Sjpk 			if (real_nzones > 0) {
55921676Sjpk 				zoneids = kmem_alloc(real_nzones *
55931676Sjpk 				    sizeof (zoneid_t), KM_SLEEP);
55941769Scarlsonj 				mybslab = label2bslabel(myzone->zone_slabel);
55951676Sjpk 				for (zone = list_head(&zone_active);
55961676Sjpk 				    zone != NULL;
55971676Sjpk 				    zone = list_next(&zone_active, zone)) {
55981676Sjpk 					if (zone->zone_id == GLOBAL_ZONEID)
55991676Sjpk 						continue;
56001769Scarlsonj 					if (zone != myzone &&
56011769Scarlsonj 					    (zone->zone_flags & ZF_IS_SCRATCH))
56021769Scarlsonj 						continue;
56031769Scarlsonj 					/*
56041769Scarlsonj 					 * Note that a label always dominates
56051769Scarlsonj 					 * itself, so myzone is always included
56061769Scarlsonj 					 * in the list.
56071769Scarlsonj 					 */
56081676Sjpk 					if (bldominates(mybslab,
56091676Sjpk 					    label2bslabel(zone->zone_slabel))) {
56101676Sjpk 						zoneids[domi_nzones++] =
56111676Sjpk 						    zone->zone_id;
56121676Sjpk 					}
56131676Sjpk 				}
56141676Sjpk 			}
56151676Sjpk 			mutex_exit(&zonehash_lock);
56161676Sjpk 		}
56170Sstevel@tonic-gate 	} else {
56180Sstevel@tonic-gate 		mutex_enter(&zonehash_lock);
56190Sstevel@tonic-gate 		real_nzones = zonecount;
56201676Sjpk 		domi_nzones = 0;
56211676Sjpk 		if (real_nzones > 0) {
56220Sstevel@tonic-gate 			zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
56230Sstevel@tonic-gate 			    KM_SLEEP);
56240Sstevel@tonic-gate 			for (zone = list_head(&zone_active); zone != NULL;
56250Sstevel@tonic-gate 			    zone = list_next(&zone_active, zone))
56261676Sjpk 				zoneids[domi_nzones++] = zone->zone_id;
56271676Sjpk 			ASSERT(domi_nzones == real_nzones);
56280Sstevel@tonic-gate 		}
56290Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
56300Sstevel@tonic-gate 	}
56310Sstevel@tonic-gate 
56321676Sjpk 	/*
56331676Sjpk 	 * If user has allocated space for fewer entries than we found, then
56341676Sjpk 	 * return only up to his limit.  Either way, tell him exactly how many
56351676Sjpk 	 * we found.
56361676Sjpk 	 */
56371676Sjpk 	if (domi_nzones < user_nzones)
56381676Sjpk 		user_nzones = domi_nzones;
56391676Sjpk 	error = 0;
56401676Sjpk 	if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
56410Sstevel@tonic-gate 		error = EFAULT;
56421676Sjpk 	} else if (zoneidlist != NULL && user_nzones != 0) {
56430Sstevel@tonic-gate 		if (copyout(zoneids, zoneidlist,
56440Sstevel@tonic-gate 		    user_nzones * sizeof (zoneid_t)) != 0)
56450Sstevel@tonic-gate 			error = EFAULT;
56460Sstevel@tonic-gate 	}
56470Sstevel@tonic-gate 
56481676Sjpk 	if (real_nzones > 0)
56490Sstevel@tonic-gate 		kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
56500Sstevel@tonic-gate 
56511676Sjpk 	if (error != 0)
56520Sstevel@tonic-gate 		return (set_errno(error));
56530Sstevel@tonic-gate 	else
56540Sstevel@tonic-gate 		return (0);
56550Sstevel@tonic-gate }
56560Sstevel@tonic-gate 
56570Sstevel@tonic-gate /*
56580Sstevel@tonic-gate  * Systemcall entry point for zone_lookup(2).
56590Sstevel@tonic-gate  *
56601676Sjpk  * Non-global zones are only able to see themselves and (on labeled systems)
56611676Sjpk  * the zones they dominate.
56620Sstevel@tonic-gate  */
56630Sstevel@tonic-gate static zoneid_t
56640Sstevel@tonic-gate zone_lookup(const char *zone_name)
56650Sstevel@tonic-gate {
56660Sstevel@tonic-gate 	char *kname;
56670Sstevel@tonic-gate 	zone_t *zone;
56680Sstevel@tonic-gate 	zoneid_t zoneid;
56690Sstevel@tonic-gate 	int err;
56700Sstevel@tonic-gate 
56710Sstevel@tonic-gate 	if (zone_name == NULL) {
56720Sstevel@tonic-gate 		/* return caller's zone id */
56730Sstevel@tonic-gate 		return (getzoneid());
56740Sstevel@tonic-gate 	}
56750Sstevel@tonic-gate 
56760Sstevel@tonic-gate 	kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
56770Sstevel@tonic-gate 	if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
56780Sstevel@tonic-gate 		kmem_free(kname, ZONENAME_MAX);
56790Sstevel@tonic-gate 		return (set_errno(err));
56800Sstevel@tonic-gate 	}
56810Sstevel@tonic-gate 
56820Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
56830Sstevel@tonic-gate 	zone = zone_find_all_by_name(kname);
56840Sstevel@tonic-gate 	kmem_free(kname, ZONENAME_MAX);
56851676Sjpk 	/*
56861676Sjpk 	 * In a non-global zone, can only lookup global and own name.
56871676Sjpk 	 * In Trusted Extensions zone label dominance rules apply.
56881676Sjpk 	 */
56891676Sjpk 	if (zone == NULL ||
56901676Sjpk 	    zone_status_get(zone) < ZONE_IS_READY ||
56911676Sjpk 	    !zone_list_access(zone)) {
56920Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
56930Sstevel@tonic-gate 		return (set_errno(EINVAL));
56941676Sjpk 	} else {
56951676Sjpk 		zoneid = zone->zone_id;
56961676Sjpk 		mutex_exit(&zonehash_lock);
56971676Sjpk 		return (zoneid);
56980Sstevel@tonic-gate 	}
56990Sstevel@tonic-gate }
57000Sstevel@tonic-gate 
5701813Sdp static int
5702813Sdp zone_version(int *version_arg)
5703813Sdp {
5704813Sdp 	int version = ZONE_SYSCALL_API_VERSION;
5705813Sdp 
5706813Sdp 	if (copyout(&version, version_arg, sizeof (int)) != 0)
5707813Sdp 		return (set_errno(EFAULT));
5708813Sdp 	return (0);
5709813Sdp }
5710813Sdp 
57110Sstevel@tonic-gate /* ARGSUSED */
57120Sstevel@tonic-gate long
5713789Sahrens zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
57140Sstevel@tonic-gate {
57150Sstevel@tonic-gate 	zone_def zs;
571610616SSebastien.Roy@Sun.COM 	int err;
57170Sstevel@tonic-gate 
57180Sstevel@tonic-gate 	switch (cmd) {
57190Sstevel@tonic-gate 	case ZONE_CREATE:
57200Sstevel@tonic-gate 		if (get_udatamodel() == DATAMODEL_NATIVE) {
57210Sstevel@tonic-gate 			if (copyin(arg1, &zs, sizeof (zone_def))) {
57220Sstevel@tonic-gate 				return (set_errno(EFAULT));
57230Sstevel@tonic-gate 			}
57240Sstevel@tonic-gate 		} else {
57250Sstevel@tonic-gate #ifdef _SYSCALL32_IMPL
57260Sstevel@tonic-gate 			zone_def32 zs32;
57270Sstevel@tonic-gate 
57280Sstevel@tonic-gate 			if (copyin(arg1, &zs32, sizeof (zone_def32))) {
57290Sstevel@tonic-gate 				return (set_errno(EFAULT));
57300Sstevel@tonic-gate 			}
57310Sstevel@tonic-gate 			zs.zone_name =
57320Sstevel@tonic-gate 			    (const char *)(unsigned long)zs32.zone_name;
57330Sstevel@tonic-gate 			zs.zone_root =
57340Sstevel@tonic-gate 			    (const char *)(unsigned long)zs32.zone_root;
57350Sstevel@tonic-gate 			zs.zone_privs =
57360Sstevel@tonic-gate 			    (const struct priv_set *)
57370Sstevel@tonic-gate 			    (unsigned long)zs32.zone_privs;
57381409Sdp 			zs.zone_privssz = zs32.zone_privssz;
57390Sstevel@tonic-gate 			zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
57400Sstevel@tonic-gate 			zs.rctlbufsz = zs32.rctlbufsz;
5741789Sahrens 			zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
5742789Sahrens 			zs.zfsbufsz = zs32.zfsbufsz;
57430Sstevel@tonic-gate 			zs.extended_error =
57440Sstevel@tonic-gate 			    (int *)(unsigned long)zs32.extended_error;
57451676Sjpk 			zs.match = zs32.match;
57461676Sjpk 			zs.doi = zs32.doi;
57471676Sjpk 			zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
57483448Sdh155122 			zs.flags = zs32.flags;
57490Sstevel@tonic-gate #else
57500Sstevel@tonic-gate 			panic("get_udatamodel() returned bogus result\n");
57510Sstevel@tonic-gate #endif
57520Sstevel@tonic-gate 		}
57530Sstevel@tonic-gate 
57540Sstevel@tonic-gate 		return (zone_create(zs.zone_name, zs.zone_root,
5755813Sdp 		    zs.zone_privs, zs.zone_privssz,
5756813Sdp 		    (caddr_t)zs.rctlbuf, zs.rctlbufsz,
5757813Sdp 		    (caddr_t)zs.zfsbuf, zs.zfsbufsz,
57581676Sjpk 		    zs.extended_error, zs.match, zs.doi,
57593448Sdh155122 		    zs.label, zs.flags));
57600Sstevel@tonic-gate 	case ZONE_BOOT:
57612267Sdp 		return (zone_boot((zoneid_t)(uintptr_t)arg1));
57620Sstevel@tonic-gate 	case ZONE_DESTROY:
57630Sstevel@tonic-gate 		return (zone_destroy((zoneid_t)(uintptr_t)arg1));
57640Sstevel@tonic-gate 	case ZONE_GETATTR:
57650Sstevel@tonic-gate 		return (zone_getattr((zoneid_t)(uintptr_t)arg1,
57660Sstevel@tonic-gate 		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
57672267Sdp 	case ZONE_SETATTR:
57682267Sdp 		return (zone_setattr((zoneid_t)(uintptr_t)arg1,
57692267Sdp 		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
57700Sstevel@tonic-gate 	case ZONE_ENTER:
57710Sstevel@tonic-gate 		return (zone_enter((zoneid_t)(uintptr_t)arg1));
57720Sstevel@tonic-gate 	case ZONE_LIST:
57730Sstevel@tonic-gate 		return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
57740Sstevel@tonic-gate 	case ZONE_SHUTDOWN:
57750Sstevel@tonic-gate 		return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
57760Sstevel@tonic-gate 	case ZONE_LOOKUP:
57770Sstevel@tonic-gate 		return (zone_lookup((const char *)arg1));
5778813Sdp 	case ZONE_VERSION:
5779813Sdp 		return (zone_version((int *)arg1));
57803448Sdh155122 	case ZONE_ADD_DATALINK:
57813448Sdh155122 		return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
578210616SSebastien.Roy@Sun.COM 		    (datalink_id_t)(uintptr_t)arg2));
57833448Sdh155122 	case ZONE_DEL_DATALINK:
57843448Sdh155122 		return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
578510616SSebastien.Roy@Sun.COM 		    (datalink_id_t)(uintptr_t)arg2));
578610616SSebastien.Roy@Sun.COM 	case ZONE_CHECK_DATALINK: {
578710616SSebastien.Roy@Sun.COM 		zoneid_t	zoneid;
578810616SSebastien.Roy@Sun.COM 		boolean_t	need_copyout;
578910616SSebastien.Roy@Sun.COM 
579010616SSebastien.Roy@Sun.COM 		if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
579110616SSebastien.Roy@Sun.COM 			return (EFAULT);
579210616SSebastien.Roy@Sun.COM 		need_copyout = (zoneid == ALL_ZONES);
579310616SSebastien.Roy@Sun.COM 		err = zone_check_datalink(&zoneid,
579410616SSebastien.Roy@Sun.COM 		    (datalink_id_t)(uintptr_t)arg2);
579510616SSebastien.Roy@Sun.COM 		if (err == 0 && need_copyout) {
579610616SSebastien.Roy@Sun.COM 			if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
579710616SSebastien.Roy@Sun.COM 				err = EFAULT;
579810616SSebastien.Roy@Sun.COM 		}
579910616SSebastien.Roy@Sun.COM 		return (err == 0 ? 0 : set_errno(err));
580010616SSebastien.Roy@Sun.COM 	}
58013448Sdh155122 	case ZONE_LIST_DATALINK:
58023448Sdh155122 		return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
580310616SSebastien.Roy@Sun.COM 		    (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
58040Sstevel@tonic-gate 	default:
58050Sstevel@tonic-gate 		return (set_errno(EINVAL));
58060Sstevel@tonic-gate 	}
58070Sstevel@tonic-gate }
58080Sstevel@tonic-gate 
58090Sstevel@tonic-gate struct zarg {
58100Sstevel@tonic-gate 	zone_t *zone;
58110Sstevel@tonic-gate 	zone_cmd_arg_t arg;
58120Sstevel@tonic-gate };
58130Sstevel@tonic-gate 
58140Sstevel@tonic-gate static int
58150Sstevel@tonic-gate zone_lookup_door(const char *zone_name, door_handle_t *doorp)
58160Sstevel@tonic-gate {
58170Sstevel@tonic-gate 	char *buf;
58180Sstevel@tonic-gate 	size_t buflen;
58190Sstevel@tonic-gate 	int error;
58200Sstevel@tonic-gate 
58210Sstevel@tonic-gate 	buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
58220Sstevel@tonic-gate 	buf = kmem_alloc(buflen, KM_SLEEP);
58230Sstevel@tonic-gate 	(void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
58240Sstevel@tonic-gate 	error = door_ki_open(buf, doorp);
58250Sstevel@tonic-gate 	kmem_free(buf, buflen);
58260Sstevel@tonic-gate 	return (error);
58270Sstevel@tonic-gate }
58280Sstevel@tonic-gate 
58290Sstevel@tonic-gate static void
58300Sstevel@tonic-gate zone_release_door(door_handle_t *doorp)
58310Sstevel@tonic-gate {
58320Sstevel@tonic-gate 	door_ki_rele(*doorp);
58330Sstevel@tonic-gate 	*doorp = NULL;
58340Sstevel@tonic-gate }
58350Sstevel@tonic-gate 
58360Sstevel@tonic-gate static void
58370Sstevel@tonic-gate zone_ki_call_zoneadmd(struct zarg *zargp)
58380Sstevel@tonic-gate {
58390Sstevel@tonic-gate 	door_handle_t door = NULL;
58400Sstevel@tonic-gate 	door_arg_t darg, save_arg;
58410Sstevel@tonic-gate 	char *zone_name;
58420Sstevel@tonic-gate 	size_t zone_namelen;
58430Sstevel@tonic-gate 	zoneid_t zoneid;
58440Sstevel@tonic-gate 	zone_t *zone;
58450Sstevel@tonic-gate 	zone_cmd_arg_t arg;
58460Sstevel@tonic-gate 	uint64_t uniqid;
58470Sstevel@tonic-gate 	size_t size;
58480Sstevel@tonic-gate 	int error;
58490Sstevel@tonic-gate 	int retry;
58500Sstevel@tonic-gate 
58510Sstevel@tonic-gate 	zone = zargp->zone;
58520Sstevel@tonic-gate 	arg = zargp->arg;
58530Sstevel@tonic-gate 	kmem_free(zargp, sizeof (*zargp));
58540Sstevel@tonic-gate 
58550Sstevel@tonic-gate 	zone_namelen = strlen(zone->zone_name) + 1;
58560Sstevel@tonic-gate 	zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
58570Sstevel@tonic-gate 	bcopy(zone->zone_name, zone_name, zone_namelen);
58580Sstevel@tonic-gate 	zoneid = zone->zone_id;
58590Sstevel@tonic-gate 	uniqid = zone->zone_uniqid;
58600Sstevel@tonic-gate 	/*
58610Sstevel@tonic-gate 	 * zoneadmd may be down, but at least we can empty out the zone.
58620Sstevel@tonic-gate 	 * We can ignore the return value of zone_empty() since we're called
58630Sstevel@tonic-gate 	 * from a kernel thread and know we won't be delivered any signals.
58640Sstevel@tonic-gate 	 */
58650Sstevel@tonic-gate 	ASSERT(curproc == &p0);
58660Sstevel@tonic-gate 	(void) zone_empty(zone);
58670Sstevel@tonic-gate 	ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
58680Sstevel@tonic-gate 	zone_rele(zone);
58690Sstevel@tonic-gate 
58700Sstevel@tonic-gate 	size = sizeof (arg);
58710Sstevel@tonic-gate 	darg.rbuf = (char *)&arg;
58720Sstevel@tonic-gate 	darg.data_ptr = (char *)&arg;
58730Sstevel@tonic-gate 	darg.rsize = size;
58740Sstevel@tonic-gate 	darg.data_size = size;
58750Sstevel@tonic-gate 	darg.desc_ptr = NULL;
58760Sstevel@tonic-gate 	darg.desc_num = 0;
58770Sstevel@tonic-gate 
58780Sstevel@tonic-gate 	save_arg = darg;
58790Sstevel@tonic-gate 	/*
58800Sstevel@tonic-gate 	 * Since we're not holding a reference to the zone, any number of
58810Sstevel@tonic-gate 	 * things can go wrong, including the zone disappearing before we get a
58820Sstevel@tonic-gate 	 * chance to talk to zoneadmd.
58830Sstevel@tonic-gate 	 */
58840Sstevel@tonic-gate 	for (retry = 0; /* forever */; retry++) {
58850Sstevel@tonic-gate 		if (door == NULL &&
58860Sstevel@tonic-gate 		    (error = zone_lookup_door(zone_name, &door)) != 0) {
58870Sstevel@tonic-gate 			goto next;
58880Sstevel@tonic-gate 		}
58890Sstevel@tonic-gate 		ASSERT(door != NULL);
58900Sstevel@tonic-gate 
58916997Sjwadams 		if ((error = door_ki_upcall_limited(door, &darg, NULL,
58926997Sjwadams 		    SIZE_MAX, 0)) == 0) {
58930Sstevel@tonic-gate 			break;
58940Sstevel@tonic-gate 		}
58950Sstevel@tonic-gate 		switch (error) {
58960Sstevel@tonic-gate 		case EINTR:
58970Sstevel@tonic-gate 			/* FALLTHROUGH */
58980Sstevel@tonic-gate 		case EAGAIN:	/* process may be forking */
58990Sstevel@tonic-gate 			/*
59000Sstevel@tonic-gate 			 * Back off for a bit
59010Sstevel@tonic-gate 			 */
59020Sstevel@tonic-gate 			break;
59030Sstevel@tonic-gate 		case EBADF:
59040Sstevel@tonic-gate 			zone_release_door(&door);
59050Sstevel@tonic-gate 			if (zone_lookup_door(zone_name, &door) != 0) {
59060Sstevel@tonic-gate 				/*
59070Sstevel@tonic-gate 				 * zoneadmd may be dead, but it may come back to
59080Sstevel@tonic-gate 				 * life later.
59090Sstevel@tonic-gate 				 */
59100Sstevel@tonic-gate 				break;
59110Sstevel@tonic-gate 			}
59120Sstevel@tonic-gate 			break;
59130Sstevel@tonic-gate 		default:
59140Sstevel@tonic-gate 			cmn_err(CE_WARN,
59150Sstevel@tonic-gate 			    "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
59160Sstevel@tonic-gate 			    error);
59170Sstevel@tonic-gate 			goto out;
59180Sstevel@tonic-gate 		}
59190Sstevel@tonic-gate next:
59200Sstevel@tonic-gate 		/*
59210Sstevel@tonic-gate 		 * If this isn't the same zone_t that we originally had in mind,
59220Sstevel@tonic-gate 		 * then this is the same as if two kadmin requests come in at
59230Sstevel@tonic-gate 		 * the same time: the first one wins.  This means we lose, so we
59240Sstevel@tonic-gate 		 * bail.
59250Sstevel@tonic-gate 		 */
59260Sstevel@tonic-gate 		if ((zone = zone_find_by_id(zoneid)) == NULL) {
59270Sstevel@tonic-gate 			/*
59280Sstevel@tonic-gate 			 * Problem is solved.
59290Sstevel@tonic-gate 			 */
59300Sstevel@tonic-gate 			break;
59310Sstevel@tonic-gate 		}
59320Sstevel@tonic-gate 		if (zone->zone_uniqid != uniqid) {
59330Sstevel@tonic-gate 			/*
59340Sstevel@tonic-gate 			 * zoneid recycled
59350Sstevel@tonic-gate 			 */
59360Sstevel@tonic-gate 			zone_rele(zone);
59370Sstevel@tonic-gate 			break;
59380Sstevel@tonic-gate 		}
59390Sstevel@tonic-gate 		/*
59400Sstevel@tonic-gate 		 * We could zone_status_timedwait(), but there doesn't seem to
59410Sstevel@tonic-gate 		 * be much point in doing that (plus, it would mean that
59420Sstevel@tonic-gate 		 * zone_free() isn't called until this thread exits).
59430Sstevel@tonic-gate 		 */
59440Sstevel@tonic-gate 		zone_rele(zone);
59450Sstevel@tonic-gate 		delay(hz);
59460Sstevel@tonic-gate 		darg = save_arg;
59470Sstevel@tonic-gate 	}
59480Sstevel@tonic-gate out:
59490Sstevel@tonic-gate 	if (door != NULL) {
59500Sstevel@tonic-gate 		zone_release_door(&door);
59510Sstevel@tonic-gate 	}
59520Sstevel@tonic-gate 	kmem_free(zone_name, zone_namelen);
59530Sstevel@tonic-gate 	thread_exit();
59540Sstevel@tonic-gate }
59550Sstevel@tonic-gate 
/*
 * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
 * kadmin().  The caller is a process in the zone.
 *
 * In order to shut down the zone, we will hand off control to zoneadmd
 * (running in the global zone) via a door.  We do a half-hearted job at
 * killing all processes in the zone, create a kernel thread to contact
 * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
 * a form of generation number used to let zoneadmd (as well as
 * zone_destroy()) know exactly which zone they're talking about.
 */
59670Sstevel@tonic-gate int
59682267Sdp zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
59690Sstevel@tonic-gate {
59700Sstevel@tonic-gate 	struct zarg *zargp;
59710Sstevel@tonic-gate 	zone_cmd_t zcmd;
59720Sstevel@tonic-gate 	zone_t *zone;
59730Sstevel@tonic-gate 
59740Sstevel@tonic-gate 	zone = curproc->p_zone;
59750Sstevel@tonic-gate 	ASSERT(getzoneid() != GLOBAL_ZONEID);
59760Sstevel@tonic-gate 
59770Sstevel@tonic-gate 	switch (cmd) {
59780Sstevel@tonic-gate 	case A_SHUTDOWN:
59790Sstevel@tonic-gate 		switch (fcn) {
59800Sstevel@tonic-gate 		case AD_HALT:
59810Sstevel@tonic-gate 		case AD_POWEROFF:
59820Sstevel@tonic-gate 			zcmd = Z_HALT;
59830Sstevel@tonic-gate 			break;
59840Sstevel@tonic-gate 		case AD_BOOT:
59850Sstevel@tonic-gate 			zcmd = Z_REBOOT;
59860Sstevel@tonic-gate 			break;
59870Sstevel@tonic-gate 		case AD_IBOOT:
59880Sstevel@tonic-gate 		case AD_SBOOT:
59890Sstevel@tonic-gate 		case AD_SIBOOT:
59900Sstevel@tonic-gate 		case AD_NOSYNC:
59910Sstevel@tonic-gate 			return (ENOTSUP);
59920Sstevel@tonic-gate 		default:
59930Sstevel@tonic-gate 			return (EINVAL);
59940Sstevel@tonic-gate 		}
59950Sstevel@tonic-gate 		break;
59960Sstevel@tonic-gate 	case A_REBOOT:
59970Sstevel@tonic-gate 		zcmd = Z_REBOOT;
59980Sstevel@tonic-gate 		break;
59990Sstevel@tonic-gate 	case A_FTRACE:
60000Sstevel@tonic-gate 	case A_REMOUNT:
60010Sstevel@tonic-gate 	case A_FREEZE:
60020Sstevel@tonic-gate 	case A_DUMP:
60039160SSherry.Moore@Sun.COM 	case A_CONFIG:
60040Sstevel@tonic-gate 		return (ENOTSUP);
60050Sstevel@tonic-gate 	default:
60060Sstevel@tonic-gate 		ASSERT(cmd != A_SWAPCTL);	/* handled by uadmin() */
60070Sstevel@tonic-gate 		return (EINVAL);
60080Sstevel@tonic-gate 	}
60090Sstevel@tonic-gate 
60100Sstevel@tonic-gate 	if (secpolicy_zone_admin(credp, B_FALSE))
60110Sstevel@tonic-gate 		return (EPERM);
60120Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
60132267Sdp 
60140Sstevel@tonic-gate 	/*
60150Sstevel@tonic-gate 	 * zone_status can't be ZONE_IS_EMPTY or higher since curproc
60160Sstevel@tonic-gate 	 * is in the zone.
60170Sstevel@tonic-gate 	 */
60180Sstevel@tonic-gate 	ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
60190Sstevel@tonic-gate 	if (zone_status_get(zone) > ZONE_IS_RUNNING) {
60200Sstevel@tonic-gate 		/*
60210Sstevel@tonic-gate 		 * This zone is already on its way down.
60220Sstevel@tonic-gate 		 */
60230Sstevel@tonic-gate 		mutex_exit(&zone_status_lock);
60240Sstevel@tonic-gate 		return (0);
60250Sstevel@tonic-gate 	}
60260Sstevel@tonic-gate 	/*
60270Sstevel@tonic-gate 	 * Prevent future zone_enter()s
60280Sstevel@tonic-gate 	 */
60290Sstevel@tonic-gate 	zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
60300Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
60310Sstevel@tonic-gate 
60320Sstevel@tonic-gate 	/*
60330Sstevel@tonic-gate 	 * Kill everyone now and call zoneadmd later.
60340Sstevel@tonic-gate 	 * zone_ki_call_zoneadmd() will do a more thorough job of this
60350Sstevel@tonic-gate 	 * later.
60360Sstevel@tonic-gate 	 */
60370Sstevel@tonic-gate 	killall(zone->zone_id);
60380Sstevel@tonic-gate 	/*
60390Sstevel@tonic-gate 	 * Now, create the thread to contact zoneadmd and do the rest of the
60400Sstevel@tonic-gate 	 * work.  This thread can't be created in our zone otherwise
60410Sstevel@tonic-gate 	 * zone_destroy() would deadlock.
60420Sstevel@tonic-gate 	 */
60432267Sdp 	zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
60440Sstevel@tonic-gate 	zargp->arg.cmd = zcmd;
60450Sstevel@tonic-gate 	zargp->arg.uniqid = zone->zone_uniqid;
60462267Sdp 	zargp->zone = zone;
60470Sstevel@tonic-gate 	(void) strcpy(zargp->arg.locale, "C");
60482267Sdp 	/* mdep was already copied in for us by uadmin */
60492267Sdp 	if (mdep != NULL)
60502267Sdp 		(void) strlcpy(zargp->arg.bootbuf, mdep,
60512267Sdp 		    sizeof (zargp->arg.bootbuf));
60522267Sdp 	zone_hold(zone);
60530Sstevel@tonic-gate 
60540Sstevel@tonic-gate 	(void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
60550Sstevel@tonic-gate 	    TS_RUN, minclsyspri);
60560Sstevel@tonic-gate 	exit(CLD_EXITED, 0);
60570Sstevel@tonic-gate 
60580Sstevel@tonic-gate 	return (EINVAL);
60590Sstevel@tonic-gate }
60600Sstevel@tonic-gate 
60610Sstevel@tonic-gate /*
60620Sstevel@tonic-gate  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
60630Sstevel@tonic-gate  * status to ZONE_IS_SHUTTING_DOWN.
60648364SJordan.Vaughan@Sun.com  *
60658364SJordan.Vaughan@Sun.com  * This function also shuts down all running zones to ensure that they won't
60668364SJordan.Vaughan@Sun.com  * fork new processes.
60670Sstevel@tonic-gate  */
60680Sstevel@tonic-gate void
60690Sstevel@tonic-gate zone_shutdown_global(void)
60700Sstevel@tonic-gate {
60718364SJordan.Vaughan@Sun.com 	zone_t *current_zonep;
60728364SJordan.Vaughan@Sun.com 
60738364SJordan.Vaughan@Sun.com 	ASSERT(INGLOBALZONE(curproc));
60748364SJordan.Vaughan@Sun.com 	mutex_enter(&zonehash_lock);
60750Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
60768364SJordan.Vaughan@Sun.com 
60778364SJordan.Vaughan@Sun.com 	/* Modify the global zone's status first. */
60780Sstevel@tonic-gate 	ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
60790Sstevel@tonic-gate 	zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
60808364SJordan.Vaughan@Sun.com 
60818364SJordan.Vaughan@Sun.com 	/*
60828364SJordan.Vaughan@Sun.com 	 * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
60838364SJordan.Vaughan@Sun.com 	 * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
60848364SJordan.Vaughan@Sun.com 	 * could cause assertions to fail (e.g., assertions about a zone's
60858364SJordan.Vaughan@Sun.com 	 * state during initialization, readying, or booting) or produce races.
60868364SJordan.Vaughan@Sun.com 	 * We'll let threads continue to initialize and ready new zones: they'll
60878364SJordan.Vaughan@Sun.com 	 * fail to boot the new zones when they see that the global zone is
60888364SJordan.Vaughan@Sun.com 	 * shutting down.
60898364SJordan.Vaughan@Sun.com 	 */
60908364SJordan.Vaughan@Sun.com 	for (current_zonep = list_head(&zone_active); current_zonep != NULL;
60918364SJordan.Vaughan@Sun.com 	    current_zonep = list_next(&zone_active, current_zonep)) {
60928364SJordan.Vaughan@Sun.com 		if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
60938364SJordan.Vaughan@Sun.com 			zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
60948364SJordan.Vaughan@Sun.com 	}
60950Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
60968364SJordan.Vaughan@Sun.com 	mutex_exit(&zonehash_lock);
60970Sstevel@tonic-gate }
6098789Sahrens 
6099789Sahrens /*
6100789Sahrens  * Returns true if the named dataset is visible in the current zone.
6101789Sahrens  * The 'write' parameter is set to 1 if the dataset is also writable.
6102789Sahrens  */
6103789Sahrens int
6104789Sahrens zone_dataset_visible(const char *dataset, int *write)
6105789Sahrens {
610611850SSanjeev.Bagewadi@Sun.COM 	static int zfstype = -1;
6107789Sahrens 	zone_dataset_t *zd;
6108789Sahrens 	size_t len;
6109789Sahrens 	zone_t *zone = curproc->p_zone;
611011850SSanjeev.Bagewadi@Sun.COM 	const char *name = NULL;
611111850SSanjeev.Bagewadi@Sun.COM 	vfs_t *vfsp = NULL;
6112789Sahrens 
6113789Sahrens 	if (dataset[0] == '\0')
6114789Sahrens 		return (0);
6115789Sahrens 
6116789Sahrens 	/*
6117789Sahrens 	 * Walk the list once, looking for datasets which match exactly, or
6118789Sahrens 	 * specify a dataset underneath an exported dataset.  If found, return
6119789Sahrens 	 * true and note that it is writable.
6120789Sahrens 	 */
6121789Sahrens 	for (zd = list_head(&zone->zone_datasets); zd != NULL;
6122789Sahrens 	    zd = list_next(&zone->zone_datasets, zd)) {
6123789Sahrens 
6124789Sahrens 		len = strlen(zd->zd_dataset);
6125789Sahrens 		if (strlen(dataset) >= len &&
6126789Sahrens 		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
6127816Smaybee 		    (dataset[len] == '\0' || dataset[len] == '/' ||
6128816Smaybee 		    dataset[len] == '@')) {
6129789Sahrens 			if (write)
6130789Sahrens 				*write = 1;
6131789Sahrens 			return (1);
6132789Sahrens 		}
6133789Sahrens 	}
6134789Sahrens 
6135789Sahrens 	/*
6136789Sahrens 	 * Walk the list a second time, searching for datasets which are parents
6137789Sahrens 	 * of exported datasets.  These should be visible, but read-only.
6138789Sahrens 	 *
6139789Sahrens 	 * Note that we also have to support forms such as 'pool/dataset/', with
6140789Sahrens 	 * a trailing slash.
6141789Sahrens 	 */
6142789Sahrens 	for (zd = list_head(&zone->zone_datasets); zd != NULL;
6143789Sahrens 	    zd = list_next(&zone->zone_datasets, zd)) {
6144789Sahrens 
6145789Sahrens 		len = strlen(dataset);
6146789Sahrens 		if (dataset[len - 1] == '/')
6147789Sahrens 			len--;	/* Ignore trailing slash */
6148789Sahrens 		if (len < strlen(zd->zd_dataset) &&
6149789Sahrens 		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
6150789Sahrens 		    zd->zd_dataset[len] == '/') {
6151789Sahrens 			if (write)
6152789Sahrens 				*write = 0;
6153789Sahrens 			return (1);
6154789Sahrens 		}
6155789Sahrens 	}
6156789Sahrens 
615711850SSanjeev.Bagewadi@Sun.COM 	/*
615811850SSanjeev.Bagewadi@Sun.COM 	 * We reach here if the given dataset is not found in the zone_dataset
615911850SSanjeev.Bagewadi@Sun.COM 	 * list. Check if this dataset was added as a filesystem (ie. "add fs")
616011850SSanjeev.Bagewadi@Sun.COM 	 * instead of delegation. For this we search for the dataset in the
616111850SSanjeev.Bagewadi@Sun.COM 	 * zone_vfslist of this zone. If found, return true and note that it is
616211850SSanjeev.Bagewadi@Sun.COM 	 * not writable.
616311850SSanjeev.Bagewadi@Sun.COM 	 */
616411850SSanjeev.Bagewadi@Sun.COM 
616511850SSanjeev.Bagewadi@Sun.COM 	/*
616611850SSanjeev.Bagewadi@Sun.COM 	 * Initialize zfstype if it is not initialized yet.
616711850SSanjeev.Bagewadi@Sun.COM 	 */
616811850SSanjeev.Bagewadi@Sun.COM 	if (zfstype == -1) {
616911850SSanjeev.Bagewadi@Sun.COM 		struct vfssw *vswp = vfs_getvfssw("zfs");
617011850SSanjeev.Bagewadi@Sun.COM 		zfstype = vswp - vfssw;
617111850SSanjeev.Bagewadi@Sun.COM 		vfs_unrefvfssw(vswp);
617211850SSanjeev.Bagewadi@Sun.COM 	}
617311850SSanjeev.Bagewadi@Sun.COM 
617411850SSanjeev.Bagewadi@Sun.COM 	vfs_list_read_lock();
617511850SSanjeev.Bagewadi@Sun.COM 	vfsp = zone->zone_vfslist;
617611850SSanjeev.Bagewadi@Sun.COM 	do {
617711850SSanjeev.Bagewadi@Sun.COM 		ASSERT(vfsp);
617811850SSanjeev.Bagewadi@Sun.COM 		if (vfsp->vfs_fstype == zfstype) {
617911850SSanjeev.Bagewadi@Sun.COM 			name = refstr_value(vfsp->vfs_resource);
618011850SSanjeev.Bagewadi@Sun.COM 
618111850SSanjeev.Bagewadi@Sun.COM 			/*
618211850SSanjeev.Bagewadi@Sun.COM 			 * Check if we have an exact match.
618311850SSanjeev.Bagewadi@Sun.COM 			 */
618411850SSanjeev.Bagewadi@Sun.COM 			if (strcmp(dataset, name) == 0) {
618511850SSanjeev.Bagewadi@Sun.COM 				vfs_list_unlock();
618611850SSanjeev.Bagewadi@Sun.COM 				if (write)
618711850SSanjeev.Bagewadi@Sun.COM 					*write = 0;
618811850SSanjeev.Bagewadi@Sun.COM 				return (1);
618911850SSanjeev.Bagewadi@Sun.COM 			}
619011850SSanjeev.Bagewadi@Sun.COM 			/*
619111850SSanjeev.Bagewadi@Sun.COM 			 * We need to check if we are looking for parents of
619211850SSanjeev.Bagewadi@Sun.COM 			 * a dataset. These should be visible, but read-only.
619311850SSanjeev.Bagewadi@Sun.COM 			 */
619411850SSanjeev.Bagewadi@Sun.COM 			len = strlen(dataset);
619511850SSanjeev.Bagewadi@Sun.COM 			if (dataset[len - 1] == '/')
619611850SSanjeev.Bagewadi@Sun.COM 				len--;
619711850SSanjeev.Bagewadi@Sun.COM 
619811850SSanjeev.Bagewadi@Sun.COM 			if (len < strlen(name) &&
619911850SSanjeev.Bagewadi@Sun.COM 			    bcmp(dataset, name, len) == 0 && name[len] == '/') {
620011850SSanjeev.Bagewadi@Sun.COM 				vfs_list_unlock();
620111850SSanjeev.Bagewadi@Sun.COM 				if (write)
620211850SSanjeev.Bagewadi@Sun.COM 					*write = 0;
620311850SSanjeev.Bagewadi@Sun.COM 				return (1);
620411850SSanjeev.Bagewadi@Sun.COM 			}
620511850SSanjeev.Bagewadi@Sun.COM 		}
620611850SSanjeev.Bagewadi@Sun.COM 		vfsp = vfsp->vfs_zone_next;
620711850SSanjeev.Bagewadi@Sun.COM 	} while (vfsp != zone->zone_vfslist);
620811850SSanjeev.Bagewadi@Sun.COM 
620911850SSanjeev.Bagewadi@Sun.COM 	vfs_list_unlock();
6210789Sahrens 	return (0);
6211789Sahrens }
62121676Sjpk 
62131676Sjpk /*
62141676Sjpk  * zone_find_by_any_path() -
62151676Sjpk  *
62161676Sjpk  * kernel-private routine similar to zone_find_by_path(), but which
62171676Sjpk  * effectively compares against zone paths rather than zonerootpath
62181676Sjpk  * (i.e., the last component of zonerootpaths, which should be "root/",
62191676Sjpk  * are not compared.)  This is done in order to accurately identify all
62201676Sjpk  * paths, whether zone-visible or not, including those which are parallel
62211676Sjpk  * to /root/, such as /dev/, /home/, etc...
62221676Sjpk  *
62231676Sjpk  * If the specified path does not fall under any zone path then global
62241676Sjpk  * zone is returned.
62251676Sjpk  *
62261676Sjpk  * The treat_abs parameter indicates whether the path should be treated as
62271676Sjpk  * an absolute path although it does not begin with "/".  (This supports
62281676Sjpk  * nfs mount syntax such as host:any/path.)
62291676Sjpk  *
62301676Sjpk  * The caller is responsible for zone_rele of the returned zone.
62311676Sjpk  */
62321676Sjpk zone_t *
62331676Sjpk zone_find_by_any_path(const char *path, boolean_t treat_abs)
62341676Sjpk {
62351676Sjpk 	zone_t *zone;
62361676Sjpk 	int path_offset = 0;
62371676Sjpk 
62381676Sjpk 	if (path == NULL) {
62391676Sjpk 		zone_hold(global_zone);
62401676Sjpk 		return (global_zone);
62411676Sjpk 	}
62421676Sjpk 
62431676Sjpk 	if (*path != '/') {
62441676Sjpk 		ASSERT(treat_abs);
62451676Sjpk 		path_offset = 1;
62461676Sjpk 	}
62471676Sjpk 
62481676Sjpk 	mutex_enter(&zonehash_lock);
62491676Sjpk 	for (zone = list_head(&zone_active); zone != NULL;
62501676Sjpk 	    zone = list_next(&zone_active, zone)) {
62511676Sjpk 		char	*c;
62521676Sjpk 		size_t	pathlen;
62531876Smp46848 		char *rootpath_start;
62541676Sjpk 
62551676Sjpk 		if (zone == global_zone)	/* skip global zone */
62561676Sjpk 			continue;
62571676Sjpk 
62581676Sjpk 		/* scan backwards to find start of last component */
62591676Sjpk 		c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
62601676Sjpk 		do {
62611676Sjpk 			c--;
62621676Sjpk 		} while (*c != '/');
62631676Sjpk 
62641876Smp46848 		pathlen = c - zone->zone_rootpath + 1 - path_offset;
62651876Smp46848 		rootpath_start = (zone->zone_rootpath + path_offset);
62661876Smp46848 		if (strncmp(path, rootpath_start, pathlen) == 0)
62671676Sjpk 			break;
62681676Sjpk 	}
62691676Sjpk 	if (zone == NULL)
62701676Sjpk 		zone = global_zone;
62711676Sjpk 	zone_hold(zone);
62721676Sjpk 	mutex_exit(&zonehash_lock);
62731676Sjpk 	return (zone);
62741676Sjpk }
62753448Sdh155122 
62763448Sdh155122 /*
627710616SSebastien.Roy@Sun.COM  * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
627810616SSebastien.Roy@Sun.COM  * zone_dl_t pointer if found, and NULL otherwise.
62793448Sdh155122  */
628010616SSebastien.Roy@Sun.COM static zone_dl_t *
628110616SSebastien.Roy@Sun.COM zone_find_dl(zone_t *zone, datalink_id_t linkid)
628210616SSebastien.Roy@Sun.COM {
628310616SSebastien.Roy@Sun.COM 	zone_dl_t *zdl;
628410616SSebastien.Roy@Sun.COM 
628510616SSebastien.Roy@Sun.COM 	ASSERT(mutex_owned(&zone->zone_lock));
628610616SSebastien.Roy@Sun.COM 	for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
628710616SSebastien.Roy@Sun.COM 	    zdl = list_next(&zone->zone_dl_list, zdl)) {
628810616SSebastien.Roy@Sun.COM 		if (zdl->zdl_id == linkid)
628910616SSebastien.Roy@Sun.COM 			break;
629010616SSebastien.Roy@Sun.COM 	}
629110616SSebastien.Roy@Sun.COM 	return (zdl);
629210616SSebastien.Roy@Sun.COM }
629310616SSebastien.Roy@Sun.COM 
62943448Sdh155122 static boolean_t
629510616SSebastien.Roy@Sun.COM zone_dl_exists(zone_t *zone, datalink_id_t linkid)
62963448Sdh155122 {
629710616SSebastien.Roy@Sun.COM 	boolean_t exists;
62983448Sdh155122 
62993448Sdh155122 	mutex_enter(&zone->zone_lock);
630010616SSebastien.Roy@Sun.COM 	exists = (zone_find_dl(zone, linkid) != NULL);
63013448Sdh155122 	mutex_exit(&zone->zone_lock);
630210616SSebastien.Roy@Sun.COM 	return (exists);
63033448Sdh155122 }
63043448Sdh155122 
63053448Sdh155122 /*
630610616SSebastien.Roy@Sun.COM  * Add an data link name for the zone.
63073448Sdh155122  */
63083448Sdh155122 static int
630910616SSebastien.Roy@Sun.COM zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
63103448Sdh155122 {
631110616SSebastien.Roy@Sun.COM 	zone_dl_t *zdl;
63123448Sdh155122 	zone_t *zone;
63133448Sdh155122 	zone_t *thiszone;
631410616SSebastien.Roy@Sun.COM 
631510616SSebastien.Roy@Sun.COM 	if ((thiszone = zone_find_by_id(zoneid)) == NULL)
63163448Sdh155122 		return (set_errno(ENXIO));
631710616SSebastien.Roy@Sun.COM 
631810616SSebastien.Roy@Sun.COM 	/* Verify that the datalink ID doesn't already belong to a zone. */
63193448Sdh155122 	mutex_enter(&zonehash_lock);
63203448Sdh155122 	for (zone = list_head(&zone_active); zone != NULL;
63213448Sdh155122 	    zone = list_next(&zone_active, zone)) {
632210616SSebastien.Roy@Sun.COM 		if (zone_dl_exists(zone, linkid)) {
63233448Sdh155122 			mutex_exit(&zonehash_lock);
63243448Sdh155122 			zone_rele(thiszone);
632510616SSebastien.Roy@Sun.COM 			return (set_errno((zone == thiszone) ? EEXIST : EPERM));
63263448Sdh155122 		}
63273448Sdh155122 	}
632810616SSebastien.Roy@Sun.COM 
632910616SSebastien.Roy@Sun.COM 	zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
633010616SSebastien.Roy@Sun.COM 	zdl->zdl_id = linkid;
63313448Sdh155122 	mutex_enter(&thiszone->zone_lock);
633210616SSebastien.Roy@Sun.COM 	list_insert_head(&thiszone->zone_dl_list, zdl);
63333448Sdh155122 	mutex_exit(&thiszone->zone_lock);
63343448Sdh155122 	mutex_exit(&zonehash_lock);
63353448Sdh155122 	zone_rele(thiszone);
63363448Sdh155122 	return (0);
63373448Sdh155122 }
63383448Sdh155122 
63393448Sdh155122 static int
634010616SSebastien.Roy@Sun.COM zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
63413448Sdh155122 {
634210616SSebastien.Roy@Sun.COM 	zone_dl_t *zdl;
63433448Sdh155122 	zone_t *zone;
634410616SSebastien.Roy@Sun.COM 	int err = 0;
634510616SSebastien.Roy@Sun.COM 
634610616SSebastien.Roy@Sun.COM 	if ((zone = zone_find_by_id(zoneid)) == NULL)
63473448Sdh155122 		return (set_errno(EINVAL));
63483448Sdh155122 
63493448Sdh155122 	mutex_enter(&zone->zone_lock);
635010616SSebastien.Roy@Sun.COM 	if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
635110616SSebastien.Roy@Sun.COM 		err = ENXIO;
635210616SSebastien.Roy@Sun.COM 	} else {
635310616SSebastien.Roy@Sun.COM 		list_remove(&zone->zone_dl_list, zdl);
635410616SSebastien.Roy@Sun.COM 		kmem_free(zdl, sizeof (zone_dl_t));
63553448Sdh155122 	}
63563448Sdh155122 	mutex_exit(&zone->zone_lock);
63573448Sdh155122 	zone_rele(zone);
635810616SSebastien.Roy@Sun.COM 	return (err == 0 ? 0 : set_errno(err));
63593448Sdh155122 }
63603448Sdh155122 
63613448Sdh155122 /*
636210616SSebastien.Roy@Sun.COM  * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
636310616SSebastien.Roy@Sun.COM  * the linkid.  Otherwise we just check if the specified zoneidp has been
636410616SSebastien.Roy@Sun.COM  * assigned the supplied linkid.
63653448Sdh155122  */
636610616SSebastien.Roy@Sun.COM int
636710616SSebastien.Roy@Sun.COM zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
63683448Sdh155122 {
63693448Sdh155122 	zone_t *zone;
637010616SSebastien.Roy@Sun.COM 	int err = ENXIO;
637110616SSebastien.Roy@Sun.COM 
637210616SSebastien.Roy@Sun.COM 	if (*zoneidp != ALL_ZONES) {
637310616SSebastien.Roy@Sun.COM 		if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
637410616SSebastien.Roy@Sun.COM 			if (zone_dl_exists(zone, linkid))
637510616SSebastien.Roy@Sun.COM 				err = 0;
637610616SSebastien.Roy@Sun.COM 			zone_rele(zone);
637710616SSebastien.Roy@Sun.COM 		}
637810616SSebastien.Roy@Sun.COM 		return (err);
637910616SSebastien.Roy@Sun.COM 	}
638010616SSebastien.Roy@Sun.COM 
63813448Sdh155122 	mutex_enter(&zonehash_lock);
63823448Sdh155122 	for (zone = list_head(&zone_active); zone != NULL;
63833448Sdh155122 	    zone = list_next(&zone_active, zone)) {
638410616SSebastien.Roy@Sun.COM 		if (zone_dl_exists(zone, linkid)) {
638510616SSebastien.Roy@Sun.COM 			*zoneidp = zone->zone_id;
638610616SSebastien.Roy@Sun.COM 			err = 0;
638710616SSebastien.Roy@Sun.COM 			break;
63883448Sdh155122 		}
63893448Sdh155122 	}
63903448Sdh155122 	mutex_exit(&zonehash_lock);
639110616SSebastien.Roy@Sun.COM 	return (err);
63923448Sdh155122 }
63933448Sdh155122 
63943448Sdh155122 /*
639510616SSebastien.Roy@Sun.COM  * Get the list of datalink IDs assigned to a zone.
639610616SSebastien.Roy@Sun.COM  *
639710616SSebastien.Roy@Sun.COM  * On input, *nump is the number of datalink IDs that can fit in the supplied
639810616SSebastien.Roy@Sun.COM  * idarray.  Upon return, *nump is either set to the number of datalink IDs
639910616SSebastien.Roy@Sun.COM  * that were placed in the array if the array was large enough, or to the
640010616SSebastien.Roy@Sun.COM  * number of datalink IDs that the function needs to place in the array if the
640110616SSebastien.Roy@Sun.COM  * array is too small.
64023448Sdh155122  */
64033448Sdh155122 static int
640410616SSebastien.Roy@Sun.COM zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
64053448Sdh155122 {
640610616SSebastien.Roy@Sun.COM 	uint_t num, dlcount;
64073448Sdh155122 	zone_t *zone;
640810616SSebastien.Roy@Sun.COM 	zone_dl_t *zdl;
640910616SSebastien.Roy@Sun.COM 	datalink_id_t *idptr = idarray;
64103448Sdh155122 
64113448Sdh155122 	if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
64123448Sdh155122 		return (set_errno(EFAULT));
641310616SSebastien.Roy@Sun.COM 	if ((zone = zone_find_by_id(zoneid)) == NULL)
64143448Sdh155122 		return (set_errno(ENXIO));
64153448Sdh155122 
64163448Sdh155122 	num = 0;
64173448Sdh155122 	mutex_enter(&zone->zone_lock);
641810616SSebastien.Roy@Sun.COM 	for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
641910616SSebastien.Roy@Sun.COM 	    zdl = list_next(&zone->zone_dl_list, zdl)) {
64203448Sdh155122 		/*
642110616SSebastien.Roy@Sun.COM 		 * If the list is bigger than what the caller supplied, just
642210616SSebastien.Roy@Sun.COM 		 * count, don't do copyout.
64233448Sdh155122 		 */
64243448Sdh155122 		if (++num > dlcount)
64253448Sdh155122 			continue;
642610616SSebastien.Roy@Sun.COM 		if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
64273448Sdh155122 			mutex_exit(&zone->zone_lock);
64283448Sdh155122 			zone_rele(zone);
64293448Sdh155122 			return (set_errno(EFAULT));
64303448Sdh155122 		}
643110616SSebastien.Roy@Sun.COM 		idptr++;
64323448Sdh155122 	}
64333448Sdh155122 	mutex_exit(&zone->zone_lock);
64343448Sdh155122 	zone_rele(zone);
64353448Sdh155122 
64363448Sdh155122 	/* Increased or decreased, caller should be notified. */
64373448Sdh155122 	if (num != dlcount) {
643810616SSebastien.Roy@Sun.COM 		if (copyout(&num, nump, sizeof (num)) != 0)
64393448Sdh155122 			return (set_errno(EFAULT));
64403448Sdh155122 	}
64413448Sdh155122 	return (0);
64423448Sdh155122 }
64433448Sdh155122 
64443448Sdh155122 /*
64453448Sdh155122  * Public interface for looking up a zone by zoneid. It's a customized version
64465880Snordmark  * for netstack_zone_create(). It can only be called from the zsd create
64475880Snordmark  * callbacks, since it doesn't have reference on the zone structure hence if
64485880Snordmark  * it is called elsewhere the zone could disappear after the zonehash_lock
64495880Snordmark  * is dropped.
64505880Snordmark  *
64515880Snordmark  * Furthermore it
64525880Snordmark  * 1. Doesn't check the status of the zone.
64535880Snordmark  * 2. It will be called even before zone_init is called, in that case the
64543448Sdh155122  *    address of zone0 is returned directly, and netstack_zone_create()
64553448Sdh155122  *    will only assign a value to zone0.zone_netstack, won't break anything.
64565880Snordmark  * 3. Returns without the zone being held.
64573448Sdh155122  */
64583448Sdh155122 zone_t *
64593448Sdh155122 zone_find_by_id_nolock(zoneid_t zoneid)
64603448Sdh155122 {
64615880Snordmark 	zone_t *zone;
64625880Snordmark 
64635880Snordmark 	mutex_enter(&zonehash_lock);
64643448Sdh155122 	if (zonehashbyid == NULL)
64655880Snordmark 		zone = &zone0;
64663448Sdh155122 	else
64675880Snordmark 		zone = zone_find_all_by_id(zoneid);
64685880Snordmark 	mutex_exit(&zonehash_lock);
64695880Snordmark 	return (zone);
64703448Sdh155122 }
64715895Syz147064 
64725895Syz147064 /*
64735895Syz147064  * Walk the datalinks for a given zone
64745895Syz147064  */
64755895Syz147064 int
647610616SSebastien.Roy@Sun.COM zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
647710616SSebastien.Roy@Sun.COM     void *data)
64785895Syz147064 {
647910616SSebastien.Roy@Sun.COM 	zone_t		*zone;
648010616SSebastien.Roy@Sun.COM 	zone_dl_t	*zdl;
648110616SSebastien.Roy@Sun.COM 	datalink_id_t	*idarray;
648210616SSebastien.Roy@Sun.COM 	uint_t		idcount = 0;
648310616SSebastien.Roy@Sun.COM 	int		i, ret = 0;
64845895Syz147064 
64855895Syz147064 	if ((zone = zone_find_by_id(zoneid)) == NULL)
64865895Syz147064 		return (ENOENT);
64875895Syz147064 
648810616SSebastien.Roy@Sun.COM 	/*
648910616SSebastien.Roy@Sun.COM 	 * We first build an array of linkid's so that we can walk these and
649010616SSebastien.Roy@Sun.COM 	 * execute the callback with the zone_lock dropped.
649110616SSebastien.Roy@Sun.COM 	 */
64925895Syz147064 	mutex_enter(&zone->zone_lock);
649310616SSebastien.Roy@Sun.COM 	for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
649410616SSebastien.Roy@Sun.COM 	    zdl = list_next(&zone->zone_dl_list, zdl)) {
649510616SSebastien.Roy@Sun.COM 		idcount++;
649610616SSebastien.Roy@Sun.COM 	}
649710616SSebastien.Roy@Sun.COM 
649810616SSebastien.Roy@Sun.COM 	if (idcount == 0) {
649910616SSebastien.Roy@Sun.COM 		mutex_exit(&zone->zone_lock);
650010616SSebastien.Roy@Sun.COM 		zone_rele(zone);
650110616SSebastien.Roy@Sun.COM 		return (0);
650210616SSebastien.Roy@Sun.COM 	}
650310616SSebastien.Roy@Sun.COM 
650410616SSebastien.Roy@Sun.COM 	idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
650510616SSebastien.Roy@Sun.COM 	if (idarray == NULL) {
650610616SSebastien.Roy@Sun.COM 		mutex_exit(&zone->zone_lock);
650710616SSebastien.Roy@Sun.COM 		zone_rele(zone);
650810616SSebastien.Roy@Sun.COM 		return (ENOMEM);
650910616SSebastien.Roy@Sun.COM 	}
651010616SSebastien.Roy@Sun.COM 
651110616SSebastien.Roy@Sun.COM 	for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
651210616SSebastien.Roy@Sun.COM 	    i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
651310616SSebastien.Roy@Sun.COM 		idarray[i] = zdl->zdl_id;
651410616SSebastien.Roy@Sun.COM 	}
651510616SSebastien.Roy@Sun.COM 
651610616SSebastien.Roy@Sun.COM 	mutex_exit(&zone->zone_lock);
651710616SSebastien.Roy@Sun.COM 
651810616SSebastien.Roy@Sun.COM 	for (i = 0; i < idcount && ret == 0; i++) {
651910616SSebastien.Roy@Sun.COM 		if ((ret = (*cb)(idarray[i], data)) != 0)
65205895Syz147064 			break;
65215895Syz147064 	}
652210616SSebastien.Roy@Sun.COM 
65235895Syz147064 	zone_rele(zone);
652410616SSebastien.Roy@Sun.COM 	kmem_free(idarray, sizeof (datalink_id_t) * idcount);
65255895Syz147064 	return (ret);
65265895Syz147064 }
6527