xref: /onnv-gate/usr/src/uts/common/os/zone.c (revision 13096:b02331b7b26d)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
51676Sjpk  * Common Development and Distribution License (the "License").
61676Sjpk  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
21390Sraf 
220Sstevel@tonic-gate /*
2312273SCasper.Dik@Sun.COM  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate /*
270Sstevel@tonic-gate  * Zones
280Sstevel@tonic-gate  *
290Sstevel@tonic-gate  *   A zone is a named collection of processes, namespace constraints,
300Sstevel@tonic-gate  *   and other system resources which comprise a secure and manageable
310Sstevel@tonic-gate  *   application containment facility.
320Sstevel@tonic-gate  *
330Sstevel@tonic-gate  *   Zones (represented by the reference counted zone_t) are tracked in
340Sstevel@tonic-gate  *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
350Sstevel@tonic-gate  *   (zoneid_t) are used to track zone association.  Zone IDs are
360Sstevel@tonic-gate  *   dynamically generated when the zone is created; if a persistent
370Sstevel@tonic-gate  *   identifier is needed (core files, accounting logs, audit trail,
380Sstevel@tonic-gate  *   etc.), the zone name should be used.
390Sstevel@tonic-gate  *
400Sstevel@tonic-gate  *
410Sstevel@tonic-gate  *   Global Zone:
420Sstevel@tonic-gate  *
430Sstevel@tonic-gate  *   The global zone (zoneid 0) is automatically associated with all
440Sstevel@tonic-gate  *   system resources that have not been bound to a user-created zone.
450Sstevel@tonic-gate  *   This means that even systems where zones are not in active use
460Sstevel@tonic-gate  *   have a global zone, and all processes, mounts, etc. are
470Sstevel@tonic-gate  *   associated with that zone.  The global zone is generally
480Sstevel@tonic-gate  *   unconstrained in terms of privileges and access, though the usual
490Sstevel@tonic-gate  *   credential and privilege based restrictions apply.
500Sstevel@tonic-gate  *
510Sstevel@tonic-gate  *
520Sstevel@tonic-gate  *   Zone States:
530Sstevel@tonic-gate  *
 *   The states a zone may be in, and the transitions between them, are as
 *   follows:
560Sstevel@tonic-gate  *
570Sstevel@tonic-gate  *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
580Sstevel@tonic-gate  *   initialized zone is added to the list of active zones on the system but
590Sstevel@tonic-gate  *   isn't accessible.
600Sstevel@tonic-gate  *
615880Snordmark  *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
625880Snordmark  *   not yet completed. Not possible to enter the zone, but attributes can
635880Snordmark  *   be retrieved.
645880Snordmark  *
650Sstevel@tonic-gate  *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
660Sstevel@tonic-gate  *   ready.  The zone is made visible after the ZSD constructor callbacks are
670Sstevel@tonic-gate  *   executed.  A zone remains in this state until it transitions into
680Sstevel@tonic-gate  *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
690Sstevel@tonic-gate  *
 *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
710Sstevel@tonic-gate  *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
720Sstevel@tonic-gate  *   state.
730Sstevel@tonic-gate  *
740Sstevel@tonic-gate  *   ZONE_IS_RUNNING: The zone is open for business: zsched has
750Sstevel@tonic-gate  *   successfully started init.   A zone remains in this state until
760Sstevel@tonic-gate  *   zone_shutdown() is called.
770Sstevel@tonic-gate  *
780Sstevel@tonic-gate  *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
790Sstevel@tonic-gate  *   killing all processes running in the zone. The zone remains
800Sstevel@tonic-gate  *   in this state until there are no more user processes running in the zone.
810Sstevel@tonic-gate  *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
820Sstevel@tonic-gate  *   Since zone_shutdown() is restartable, it may be called successfully
830Sstevel@tonic-gate  *   multiple times for the same zone_t.  Setting of the zone's state to
840Sstevel@tonic-gate  *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
850Sstevel@tonic-gate  *   the zone's status without worrying about it being a moving target.
860Sstevel@tonic-gate  *
870Sstevel@tonic-gate  *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
880Sstevel@tonic-gate  *   are no more user processes in the zone.  The zone remains in this
890Sstevel@tonic-gate  *   state until there are no more kernel threads associated with the
900Sstevel@tonic-gate  *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
910Sstevel@tonic-gate  *   fail.
920Sstevel@tonic-gate  *
930Sstevel@tonic-gate  *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
940Sstevel@tonic-gate  *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
950Sstevel@tonic-gate  *   join the zone or create kernel threads therein.
960Sstevel@tonic-gate  *
970Sstevel@tonic-gate  *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
980Sstevel@tonic-gate  *   remains in this state until zsched exits.  Calls to zone_find_by_*()
990Sstevel@tonic-gate  *   return NULL from now on.
1000Sstevel@tonic-gate  *
1010Sstevel@tonic-gate  *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
1020Sstevel@tonic-gate  *   processes or threads doing work on behalf of the zone.  The zone is
1030Sstevel@tonic-gate  *   removed from the list of active zones.  zone_destroy() returns, and
1040Sstevel@tonic-gate  *   the zone can be recreated.
1050Sstevel@tonic-gate  *
1060Sstevel@tonic-gate  *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
1070Sstevel@tonic-gate  *   callbacks are executed, and all memory associated with the zone is
1080Sstevel@tonic-gate  *   freed.
1090Sstevel@tonic-gate  *
1100Sstevel@tonic-gate  *   Threads can wait for the zone to enter a requested state by using
1110Sstevel@tonic-gate  *   zone_status_wait() or zone_status_timedwait() with the desired
1120Sstevel@tonic-gate  *   state passed in as an argument.  Zone state transitions are
1130Sstevel@tonic-gate  *   uni-directional; it is not possible to move back to an earlier state.
1140Sstevel@tonic-gate  *
1150Sstevel@tonic-gate  *
1160Sstevel@tonic-gate  *   Zone-Specific Data:
1170Sstevel@tonic-gate  *
1180Sstevel@tonic-gate  *   Subsystems needing to maintain zone-specific data can store that
1190Sstevel@tonic-gate  *   data using the ZSD mechanism.  This provides a zone-specific data
1200Sstevel@tonic-gate  *   store, similar to thread-specific data (see pthread_getspecific(3C)
 *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
1220Sstevel@tonic-gate  *   to register callbacks to be invoked when a zone is created, shut
1230Sstevel@tonic-gate  *   down, or destroyed.  This can be used to initialize zone-specific
1240Sstevel@tonic-gate  *   data for new zones and to clean up when zones go away.
1250Sstevel@tonic-gate  *
1260Sstevel@tonic-gate  *
1270Sstevel@tonic-gate  *   Data Structures:
1280Sstevel@tonic-gate  *
1290Sstevel@tonic-gate  *   The per-zone structure (zone_t) is reference counted, and freed
1300Sstevel@tonic-gate  *   when all references are released.  zone_hold and zone_rele can be
1310Sstevel@tonic-gate  *   used to adjust the reference count.  In addition, reference counts
1320Sstevel@tonic-gate  *   associated with the cred_t structure are tracked separately using
1330Sstevel@tonic-gate  *   zone_cred_hold and zone_cred_rele.
1340Sstevel@tonic-gate  *
1350Sstevel@tonic-gate  *   Pointers to active zone_t's are stored in two hash tables; one
1360Sstevel@tonic-gate  *   for searching by id, the other for searching by name.  Lookups
1370Sstevel@tonic-gate  *   can be performed on either basis, using zone_find_by_id and
1380Sstevel@tonic-gate  *   zone_find_by_name.  Both return zone_t pointers with the zone
1390Sstevel@tonic-gate  *   held, so zone_rele should be called when the pointer is no longer
1400Sstevel@tonic-gate  *   needed.  Zones can also be searched by path; zone_find_by_path
1410Sstevel@tonic-gate  *   returns the zone with which a path name is associated (global
1420Sstevel@tonic-gate  *   zone if the path is not within some other zone's file system
1430Sstevel@tonic-gate  *   hierarchy).  This currently requires iterating through each zone,
1440Sstevel@tonic-gate  *   so it is slower than an id or name search via a hash table.
1450Sstevel@tonic-gate  *
1460Sstevel@tonic-gate  *
1470Sstevel@tonic-gate  *   Locking:
1480Sstevel@tonic-gate  *
1490Sstevel@tonic-gate  *   zonehash_lock: This is a top-level global lock used to protect the
1500Sstevel@tonic-gate  *       zone hash tables and lists.  Zones cannot be created or destroyed
1510Sstevel@tonic-gate  *       while this lock is held.
1520Sstevel@tonic-gate  *   zone_status_lock: This is a global lock protecting zone state.
1530Sstevel@tonic-gate  *       Zones cannot change state while this lock is held.  It also
1540Sstevel@tonic-gate  *       protects the list of kernel threads associated with a zone.
1550Sstevel@tonic-gate  *   zone_lock: This is a per-zone lock used to protect several fields of
1560Sstevel@tonic-gate  *       the zone_t (see <sys/zone.h> for details).  In addition, holding
1570Sstevel@tonic-gate  *       this lock means that the zone cannot go away.
1583247Sgjelinek  *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
1593247Sgjelinek  *	 related to the zone.max-lwps rctl.
1603247Sgjelinek  *   zone_mem_lock: This is a per-zone lock used to protect the fields
1613247Sgjelinek  *	 related to the zone.max-locked-memory and zone.max-swap rctls.
16212633Sjohn.levon@sun.com  *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
16312633Sjohn.levon@sun.com  *       currently just max_lofi
1640Sstevel@tonic-gate  *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
1650Sstevel@tonic-gate  *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
1660Sstevel@tonic-gate  *       list (a list of zones in the ZONE_IS_DEAD state).
1670Sstevel@tonic-gate  *
1680Sstevel@tonic-gate  *   Ordering requirements:
1690Sstevel@tonic-gate  *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
1700Sstevel@tonic-gate  *       	zone_lock --> zsd_key_lock --> pidlock --> p_lock
1710Sstevel@tonic-gate  *
1723247Sgjelinek  *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
1733247Sgjelinek  *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
17412725SMenno.Lageman@Sun.COM  *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
1753247Sgjelinek  *
1760Sstevel@tonic-gate  *   Blocking memory allocations are permitted while holding any of the
1770Sstevel@tonic-gate  *   zone locks.
1780Sstevel@tonic-gate  *
1790Sstevel@tonic-gate  *
1800Sstevel@tonic-gate  *   System Call Interface:
1810Sstevel@tonic-gate  *
1820Sstevel@tonic-gate  *   The zone subsystem can be managed and queried from user level with
1830Sstevel@tonic-gate  *   the following system calls (all subcodes of the primary "zone"
1840Sstevel@tonic-gate  *   system call):
1850Sstevel@tonic-gate  *   - zone_create: creates a zone with selected attributes (name,
186789Sahrens  *     root path, privileges, resource controls, ZFS datasets)
1870Sstevel@tonic-gate  *   - zone_enter: allows the current process to enter a zone
1880Sstevel@tonic-gate  *   - zone_getattr: reports attributes of a zone
1892267Sdp  *   - zone_setattr: set attributes of a zone
1902267Sdp  *   - zone_boot: set 'init' running for the zone
1910Sstevel@tonic-gate  *   - zone_list: lists all zones active in the system
1920Sstevel@tonic-gate  *   - zone_lookup: looks up zone id based on name
1930Sstevel@tonic-gate  *   - zone_shutdown: initiates shutdown process (see states above)
1940Sstevel@tonic-gate  *   - zone_destroy: completes shutdown process (see states above)
1950Sstevel@tonic-gate  *
1960Sstevel@tonic-gate  */
1970Sstevel@tonic-gate 
1980Sstevel@tonic-gate #include <sys/priv_impl.h>
1990Sstevel@tonic-gate #include <sys/cred.h>
2000Sstevel@tonic-gate #include <c2/audit.h>
2010Sstevel@tonic-gate #include <sys/debug.h>
2020Sstevel@tonic-gate #include <sys/file.h>
2030Sstevel@tonic-gate #include <sys/kmem.h>
2043247Sgjelinek #include <sys/kstat.h>
2050Sstevel@tonic-gate #include <sys/mutex.h>
2061676Sjpk #include <sys/note.h>
2070Sstevel@tonic-gate #include <sys/pathname.h>
2080Sstevel@tonic-gate #include <sys/proc.h>
2090Sstevel@tonic-gate #include <sys/project.h>
2101166Sdstaff #include <sys/sysevent.h>
2110Sstevel@tonic-gate #include <sys/task.h>
2120Sstevel@tonic-gate #include <sys/systm.h>
2130Sstevel@tonic-gate #include <sys/types.h>
2140Sstevel@tonic-gate #include <sys/utsname.h>
2150Sstevel@tonic-gate #include <sys/vnode.h>
2160Sstevel@tonic-gate #include <sys/vfs.h>
2170Sstevel@tonic-gate #include <sys/systeminfo.h>
2180Sstevel@tonic-gate #include <sys/policy.h>
2190Sstevel@tonic-gate #include <sys/cred_impl.h>
2200Sstevel@tonic-gate #include <sys/contract_impl.h>
2210Sstevel@tonic-gate #include <sys/contract/process_impl.h>
2220Sstevel@tonic-gate #include <sys/class.h>
2230Sstevel@tonic-gate #include <sys/pool.h>
2240Sstevel@tonic-gate #include <sys/pool_pset.h>
2250Sstevel@tonic-gate #include <sys/pset.h>
226*13096SJordan.Vaughan@Sun.com #include <sys/strlog.h>
2270Sstevel@tonic-gate #include <sys/sysmacros.h>
2280Sstevel@tonic-gate #include <sys/callb.h>
2290Sstevel@tonic-gate #include <sys/vmparam.h>
2300Sstevel@tonic-gate #include <sys/corectl.h>
2312677Sml93401 #include <sys/ipc_impl.h>
23212273SCasper.Dik@Sun.COM #include <sys/klpd.h>
2330Sstevel@tonic-gate 
2340Sstevel@tonic-gate #include <sys/door.h>
2350Sstevel@tonic-gate #include <sys/cpuvar.h>
2365880Snordmark #include <sys/sdt.h>
2370Sstevel@tonic-gate 
2380Sstevel@tonic-gate #include <sys/uadmin.h>
2390Sstevel@tonic-gate #include <sys/session.h>
2400Sstevel@tonic-gate #include <sys/cmn_err.h>
2410Sstevel@tonic-gate #include <sys/modhash.h>
2422267Sdp #include <sys/sunddi.h>
2430Sstevel@tonic-gate #include <sys/nvpair.h>
2440Sstevel@tonic-gate #include <sys/rctl.h>
2450Sstevel@tonic-gate #include <sys/fss.h>
2462712Snn35248 #include <sys/brand.h>
2470Sstevel@tonic-gate #include <sys/zone.h>
2483448Sdh155122 #include <net/if.h>
2493792Sakolb #include <sys/cpucaps.h>
2503247Sgjelinek #include <vm/seg.h>
25110616SSebastien.Roy@Sun.COM #include <sys/mac.h>
25210616SSebastien.Roy@Sun.COM 
/*
 * Number of seconds that a thread waiting for subsystems to release a zone's
 * general-purpose references will wait before it logs the zone's reference
 * counts.
 *
 * The value is a trade-off:
 *  - too small, and reference counts are reported needlessly for zones whose
 *    references are merely being released slowly;
 *  - too large, and users reboot their systems out of frustration over hung
 *    zones before the system has logged the zones' reference counts.
 */
#define	ZONE_DESTROY_TIMEOUT_SECS	60
263*13096SJordan.Vaughan@Sun.com 
26410616SSebastien.Roy@Sun.COM /* List of data link IDs which are accessible from the zone */
26510616SSebastien.Roy@Sun.COM typedef struct zone_dl {
26610616SSebastien.Roy@Sun.COM 	datalink_id_t	zdl_id;
26712748SSowmini.Varadhan@oracle.COM 	nvlist_t	*zdl_net;
26810616SSebastien.Roy@Sun.COM 	list_node_t	zdl_linkage;
26910616SSebastien.Roy@Sun.COM } zone_dl_t;
2703247Sgjelinek 
2710Sstevel@tonic-gate /*
2720Sstevel@tonic-gate  * cv used to signal that all references to the zone have been released.  This
2730Sstevel@tonic-gate  * needs to be global since there may be multiple waiters, and the first to
2740Sstevel@tonic-gate  * wake up will free the zone_t, hence we cannot use zone->zone_cv.
2750Sstevel@tonic-gate  */
2760Sstevel@tonic-gate static kcondvar_t zone_destroy_cv;
2770Sstevel@tonic-gate /*
2780Sstevel@tonic-gate  * Lock used to serialize access to zone_cv.  This could have been per-zone,
2790Sstevel@tonic-gate  * but then we'd need another lock for zone_destroy_cv, and why bother?
2800Sstevel@tonic-gate  */
2810Sstevel@tonic-gate static kmutex_t zone_status_lock;
2820Sstevel@tonic-gate 
2830Sstevel@tonic-gate /*
2840Sstevel@tonic-gate  * ZSD-related global variables.
2850Sstevel@tonic-gate  */
2860Sstevel@tonic-gate static kmutex_t zsd_key_lock;	/* protects the following two */
2870Sstevel@tonic-gate /*
2880Sstevel@tonic-gate  * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
2890Sstevel@tonic-gate  */
2900Sstevel@tonic-gate static zone_key_t zsd_keyval = 0;
2910Sstevel@tonic-gate /*
2920Sstevel@tonic-gate  * Global list of registered keys.  We use this when a new zone is created.
2930Sstevel@tonic-gate  */
2940Sstevel@tonic-gate static list_t zsd_registered_keys;
2950Sstevel@tonic-gate 
2960Sstevel@tonic-gate int zone_hash_size = 256;
2971676Sjpk static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
2980Sstevel@tonic-gate static kmutex_t zonehash_lock;
2990Sstevel@tonic-gate static uint_t zonecount;
3000Sstevel@tonic-gate static id_space_t *zoneid_space;
3010Sstevel@tonic-gate 
3020Sstevel@tonic-gate /*
3030Sstevel@tonic-gate  * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
3040Sstevel@tonic-gate  * kernel proper runs, and which manages all other zones.
3050Sstevel@tonic-gate  *
3060Sstevel@tonic-gate  * Although not declared as static, the variable "zone0" should not be used
3070Sstevel@tonic-gate  * except for by code that needs to reference the global zone early on in boot,
3080Sstevel@tonic-gate  * before it is fully initialized.  All other consumers should use
3090Sstevel@tonic-gate  * 'global_zone'.
3100Sstevel@tonic-gate  */
3110Sstevel@tonic-gate zone_t zone0;
3120Sstevel@tonic-gate zone_t *global_zone = NULL;	/* Set when the global zone is initialized */
3130Sstevel@tonic-gate 
3140Sstevel@tonic-gate /*
3150Sstevel@tonic-gate  * List of active zones, protected by zonehash_lock.
3160Sstevel@tonic-gate  */
3170Sstevel@tonic-gate static list_t zone_active;
3180Sstevel@tonic-gate 
3190Sstevel@tonic-gate /*
3200Sstevel@tonic-gate  * List of destroyed zones that still have outstanding cred references.
3210Sstevel@tonic-gate  * Used for debugging.  Uses a separate lock to avoid lock ordering
3220Sstevel@tonic-gate  * problems in zone_free.
3230Sstevel@tonic-gate  */
3240Sstevel@tonic-gate static list_t zone_deathrow;
3250Sstevel@tonic-gate static kmutex_t zone_deathrow_lock;
3260Sstevel@tonic-gate 
3270Sstevel@tonic-gate /* number of zones is limited by virtual interface limit in IP */
3280Sstevel@tonic-gate uint_t maxzones = 8192;
3290Sstevel@tonic-gate 
3301166Sdstaff /* Event channel to sent zone state change notifications */
3311166Sdstaff evchan_t *zone_event_chan;
3321166Sdstaff 
3331166Sdstaff /*
3341166Sdstaff  * This table holds the mapping from kernel zone states to
3351166Sdstaff  * states visible in the state notification API.
3361166Sdstaff  * The idea is that we only expose "obvious" states and
3371166Sdstaff  * do not expose states which are just implementation details.
3381166Sdstaff  */
3391166Sdstaff const char  *zone_status_table[] = {
3401166Sdstaff 	ZONE_EVENT_UNINITIALIZED,	/* uninitialized */
3415880Snordmark 	ZONE_EVENT_INITIALIZED,		/* initialized */
3421166Sdstaff 	ZONE_EVENT_READY,		/* ready */
3431166Sdstaff 	ZONE_EVENT_READY,		/* booting */
3441166Sdstaff 	ZONE_EVENT_RUNNING,		/* running */
3451166Sdstaff 	ZONE_EVENT_SHUTTING_DOWN,	/* shutting_down */
3461166Sdstaff 	ZONE_EVENT_SHUTTING_DOWN,	/* empty */
3471166Sdstaff 	ZONE_EVENT_SHUTTING_DOWN,	/* down */
3481166Sdstaff 	ZONE_EVENT_SHUTTING_DOWN,	/* dying */
3491166Sdstaff 	ZONE_EVENT_UNINITIALIZED,	/* dead */
3501166Sdstaff };
3511166Sdstaff 
/*
 * This array contains the names of the subsystems listed in zone_ref_subsys_t
 * (see sys/zone.h).  The trailing comment on each entry names the
 * zone_ref_subsys_t value it corresponds to; keep the two in step when a
 * subsystem is added.
 */
static char *zone_ref_subsys_names[] = {
	"NFS",		/* ZONE_REF_NFS */
	"NFSv4",	/* ZONE_REF_NFSV4 */
	"SMBFS",	/* ZONE_REF_SMBFS */
	"MNTFS",	/* ZONE_REF_MNTFS */
	"LOFI",		/* ZONE_REF_LOFI */
	"VFS",		/* ZONE_REF_VFS */
	"IPC"		/* ZONE_REF_IPC */
};
365*13096SJordan.Vaughan@Sun.com 
366*13096SJordan.Vaughan@Sun.com /*
3670Sstevel@tonic-gate  * This isn't static so lint doesn't complain.
3680Sstevel@tonic-gate  */
3690Sstevel@tonic-gate rctl_hndl_t rc_zone_cpu_shares;
3702768Ssl108498 rctl_hndl_t rc_zone_locked_mem;
3713247Sgjelinek rctl_hndl_t rc_zone_max_swap;
37212633Sjohn.levon@sun.com rctl_hndl_t rc_zone_max_lofi;
3733792Sakolb rctl_hndl_t rc_zone_cpu_cap;
3740Sstevel@tonic-gate rctl_hndl_t rc_zone_nlwps;
37512725SMenno.Lageman@Sun.COM rctl_hndl_t rc_zone_nprocs;
3762677Sml93401 rctl_hndl_t rc_zone_shmmax;
3772677Sml93401 rctl_hndl_t rc_zone_shmmni;
3782677Sml93401 rctl_hndl_t rc_zone_semmni;
3792677Sml93401 rctl_hndl_t rc_zone_msgmni;
3800Sstevel@tonic-gate /*
3810Sstevel@tonic-gate  * Synchronization primitives used to synchronize between mounts and zone
3820Sstevel@tonic-gate  * creation/destruction.
3830Sstevel@tonic-gate  */
3840Sstevel@tonic-gate static int mounts_in_progress;
3850Sstevel@tonic-gate static kcondvar_t mount_cv;
3860Sstevel@tonic-gate static kmutex_t mount_lock;
3870Sstevel@tonic-gate 
/* Default path of the init binary for a zone. */
const char * const zone_default_initname = "/sbin/init";
/*
 * Path prefix string.
 * NOTE(review): the purpose is not visible in this part of the file;
 * confirm against its users.
 */
static char * const zone_prefix = "/zone/";
3900Sstevel@tonic-gate static int zone_shutdown(zoneid_t zoneid);
39110616SSebastien.Roy@Sun.COM static int zone_add_datalink(zoneid_t, datalink_id_t);
39210616SSebastien.Roy@Sun.COM static int zone_remove_datalink(zoneid_t, datalink_id_t);
39310616SSebastien.Roy@Sun.COM static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
39412748SSowmini.Varadhan@oracle.COM static int zone_set_network(zoneid_t, zone_net_data_t *);
39512748SSowmini.Varadhan@oracle.COM static int zone_get_network(zoneid_t, zone_net_data_t *);
3960Sstevel@tonic-gate 
3975880Snordmark typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
3985880Snordmark 
3995880Snordmark static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
4005880Snordmark static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
4015880Snordmark static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
4025880Snordmark static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
4035880Snordmark     zone_key_t);
4045880Snordmark static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
4055880Snordmark static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
4065880Snordmark     kmutex_t *);
4075880Snordmark static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
4085880Snordmark     kmutex_t *);
4095880Snordmark 
/*
 * Bump this number when you alter the zone syscall interfaces; this is
 * because we need to have support for previous API versions in libc
 * to support patching; libc calls into the kernel to determine this number.
 *
 * Version 1 of the API is the version originally shipped with Solaris 10.
 * Version 2 alters the zone_create system call in order to support more
 *     arguments by moving the args into a structure; and to do better
 *     error reporting when zone_create() fails.
 * Version 3 alters the zone_create system call in order to support the
 *     import of ZFS datasets to zones.
 * Version 4 alters the zone_create system call in order to support
 *     Trusted Extensions.
 * Version 5 alters the zone_boot system call, and converts its old
 *     bootargs parameter to be set by the zone_setattr API instead.
 * Version 6 adds the flag argument to zone_create.
 */
static const int ZONE_SYSCALL_API_VERSION = 6;
428813Sdp 
429813Sdp /*
4300Sstevel@tonic-gate  * Certain filesystems (such as NFS and autofs) need to know which zone
4310Sstevel@tonic-gate  * the mount is being placed in.  Because of this, we need to be able to
4320Sstevel@tonic-gate  * ensure that a zone isn't in the process of being created such that
4330Sstevel@tonic-gate  * nfs_mount() thinks it is in the global zone, while by the time it
 * gets added to the list of mounted zones, it ends up on zoneA's mount
4350Sstevel@tonic-gate  * list.
4360Sstevel@tonic-gate  *
4370Sstevel@tonic-gate  * The following functions: block_mounts()/resume_mounts() and
4380Sstevel@tonic-gate  * mount_in_progress()/mount_completed() are used by zones and the VFS
4390Sstevel@tonic-gate  * layer (respectively) to synchronize zone creation and new mounts.
4400Sstevel@tonic-gate  *
4410Sstevel@tonic-gate  * The semantics are like a reader-reader lock such that there may
4420Sstevel@tonic-gate  * either be multiple mounts (or zone creations, if that weren't
4430Sstevel@tonic-gate  * serialized by zonehash_lock) in progress at the same time, but not
4440Sstevel@tonic-gate  * both.
4450Sstevel@tonic-gate  *
4460Sstevel@tonic-gate  * We use cv's so the user can ctrl-C out of the operation if it's
4470Sstevel@tonic-gate  * taking too long.
4480Sstevel@tonic-gate  *
4490Sstevel@tonic-gate  * The semantics are such that there is unfair bias towards the
4500Sstevel@tonic-gate  * "current" operation.  This means that zone creations may starve if
4510Sstevel@tonic-gate  * there is a rapid succession of new mounts coming in to the system, or
4520Sstevel@tonic-gate  * there is a remote possibility that zones will be created at such a
4530Sstevel@tonic-gate  * rate that new mounts will not be able to proceed.
4540Sstevel@tonic-gate  */
4550Sstevel@tonic-gate /*
4560Sstevel@tonic-gate  * Prevent new mounts from progressing to the point of calling
4570Sstevel@tonic-gate  * VFS_MOUNT().  If there are already mounts in this "region", wait for
4580Sstevel@tonic-gate  * them to complete.
4590Sstevel@tonic-gate  */
4600Sstevel@tonic-gate static int
block_mounts(void)4610Sstevel@tonic-gate block_mounts(void)
4620Sstevel@tonic-gate {
4630Sstevel@tonic-gate 	int retval = 0;
4640Sstevel@tonic-gate 
4650Sstevel@tonic-gate 	/*
4660Sstevel@tonic-gate 	 * Since it may block for a long time, block_mounts() shouldn't be
4670Sstevel@tonic-gate 	 * called with zonehash_lock held.
4680Sstevel@tonic-gate 	 */
4690Sstevel@tonic-gate 	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4700Sstevel@tonic-gate 	mutex_enter(&mount_lock);
4710Sstevel@tonic-gate 	while (mounts_in_progress > 0) {
4720Sstevel@tonic-gate 		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
4730Sstevel@tonic-gate 			goto signaled;
4740Sstevel@tonic-gate 	}
4750Sstevel@tonic-gate 	/*
4760Sstevel@tonic-gate 	 * A negative value of mounts_in_progress indicates that mounts
4770Sstevel@tonic-gate 	 * have been blocked by (-mounts_in_progress) different callers.
4780Sstevel@tonic-gate 	 */
4790Sstevel@tonic-gate 	mounts_in_progress--;
4800Sstevel@tonic-gate 	retval = 1;
4810Sstevel@tonic-gate signaled:
4820Sstevel@tonic-gate 	mutex_exit(&mount_lock);
4830Sstevel@tonic-gate 	return (retval);
4840Sstevel@tonic-gate }
4850Sstevel@tonic-gate 
4860Sstevel@tonic-gate /*
4870Sstevel@tonic-gate  * The VFS layer may progress with new mounts as far as we're concerned.
4880Sstevel@tonic-gate  * Allow them to progress if we were the last obstacle.
4890Sstevel@tonic-gate  */
4900Sstevel@tonic-gate static void
resume_mounts(void)4910Sstevel@tonic-gate resume_mounts(void)
4920Sstevel@tonic-gate {
4930Sstevel@tonic-gate 	mutex_enter(&mount_lock);
4940Sstevel@tonic-gate 	if (++mounts_in_progress == 0)
4950Sstevel@tonic-gate 		cv_broadcast(&mount_cv);
4960Sstevel@tonic-gate 	mutex_exit(&mount_lock);
4970Sstevel@tonic-gate }
4980Sstevel@tonic-gate 
4990Sstevel@tonic-gate /*
5000Sstevel@tonic-gate  * The VFS layer is busy with a mount; zones should wait until all
5010Sstevel@tonic-gate  * mounts are completed to progress.
5020Sstevel@tonic-gate  */
5030Sstevel@tonic-gate void
mount_in_progress(void)5040Sstevel@tonic-gate mount_in_progress(void)
5050Sstevel@tonic-gate {
5060Sstevel@tonic-gate 	mutex_enter(&mount_lock);
5070Sstevel@tonic-gate 	while (mounts_in_progress < 0)
5080Sstevel@tonic-gate 		cv_wait(&mount_cv, &mount_lock);
5090Sstevel@tonic-gate 	mounts_in_progress++;
5100Sstevel@tonic-gate 	mutex_exit(&mount_lock);
5110Sstevel@tonic-gate }
5120Sstevel@tonic-gate 
5130Sstevel@tonic-gate /*
5140Sstevel@tonic-gate  * VFS is done with one mount; wake up any waiting block_mounts()
5150Sstevel@tonic-gate  * callers if this is the last mount.
5160Sstevel@tonic-gate  */
5170Sstevel@tonic-gate void
mount_completed(void)5180Sstevel@tonic-gate mount_completed(void)
5190Sstevel@tonic-gate {
5200Sstevel@tonic-gate 	mutex_enter(&mount_lock);
5210Sstevel@tonic-gate 	if (--mounts_in_progress == 0)
5220Sstevel@tonic-gate 		cv_broadcast(&mount_cv);
5230Sstevel@tonic-gate 	mutex_exit(&mount_lock);
5240Sstevel@tonic-gate }
5250Sstevel@tonic-gate 
/*
 * ZSD routines.
 *
 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 * defined by the pthread_key_create() and related interfaces.
 *
 * Kernel subsystems may register one or more data items and/or
 * callbacks to be executed when a zone is created, shutdown, or
 * destroyed.
 *
 * Unlike the thread counterpart, destructor callbacks will be executed
 * even if the data pointer is NULL and/or there are no constructor
 * callbacks, so it is the responsibility of such callbacks to check for
 * NULL data values if necessary.
 *
 * The locking strategy and overall picture is as follows:
 *
 * When someone calls zone_key_create(), a template ZSD entry is added to the
 * global list "zsd_registered_keys", protected by zsd_key_lock.  The global
 * list is updated first (under zsd_key_lock) to make sure that newly created
 * zones use the most recent list of keys.  Then, under zonehash_lock, we
 * walk the existing zones: a copy of the ZSD entry is added to each
 * per-zone zone_zsd list (protected by zone_lock) and, when a create
 * callback was supplied, marked as ZSD_CREATE_NEEDED.  Similar locking is
 * used in zone_key_delete().
 *
 * The actual create, shutdown, and destroy callbacks are done without
 * holding any lock.  The zsd_flags are used to track which operations have
 * completed, so that when zone_key_create() (and zone_create()) is done, as
 * well as when zone_key_delete() (and zone_destroy()) is done, all the
 * necessary callbacks have completed.
 *
 * When new zones are created constructor callbacks for all registered ZSD
 * entries will be called. That also uses the above two phases of marking
 * what needs to be done, and then running the callbacks without holding
 * any locks.
 *
 * The framework does not provide any locking around zone_getspecific() and
 * zone_setspecific() apart from that needed for internal consistency, so
 * callers interested in atomic "test-and-set" semantics will need to provide
 * their own locking.
 */
5680Sstevel@tonic-gate 
5690Sstevel@tonic-gate /*
5700Sstevel@tonic-gate  * Helper function to find the zsd_entry associated with the key in the
5710Sstevel@tonic-gate  * given list.
5720Sstevel@tonic-gate  */
5730Sstevel@tonic-gate static struct zsd_entry *
zsd_find(list_t * l,zone_key_t key)5740Sstevel@tonic-gate zsd_find(list_t *l, zone_key_t key)
5750Sstevel@tonic-gate {
5760Sstevel@tonic-gate 	struct zsd_entry *zsd;
5770Sstevel@tonic-gate 
5780Sstevel@tonic-gate 	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
5790Sstevel@tonic-gate 		if (zsd->zsd_key == key) {
5805880Snordmark 			return (zsd);
5815880Snordmark 		}
5825880Snordmark 	}
5835880Snordmark 	return (NULL);
5845880Snordmark }
5855880Snordmark 
5865880Snordmark /*
5875880Snordmark  * Helper function to find the zsd_entry associated with the key in the
5885880Snordmark  * given list. Move it to the front of the list.
5895880Snordmark  */
5905880Snordmark static struct zsd_entry *
zsd_find_mru(list_t * l,zone_key_t key)5915880Snordmark zsd_find_mru(list_t *l, zone_key_t key)
5925880Snordmark {
5935880Snordmark 	struct zsd_entry *zsd;
5945880Snordmark 
5955880Snordmark 	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
5965880Snordmark 		if (zsd->zsd_key == key) {
5970Sstevel@tonic-gate 			/*
5980Sstevel@tonic-gate 			 * Move to head of list to keep list in MRU order.
5990Sstevel@tonic-gate 			 */
6000Sstevel@tonic-gate 			if (zsd != list_head(l)) {
6010Sstevel@tonic-gate 				list_remove(l, zsd);
6020Sstevel@tonic-gate 				list_insert_head(l, zsd);
6030Sstevel@tonic-gate 			}
6040Sstevel@tonic-gate 			return (zsd);
6050Sstevel@tonic-gate 		}
6060Sstevel@tonic-gate 	}
6070Sstevel@tonic-gate 	return (NULL);
6080Sstevel@tonic-gate }
6090Sstevel@tonic-gate 
6105880Snordmark void
zone_key_create(zone_key_t * keyp,void * (* create)(zoneid_t),void (* shutdown)(zoneid_t,void *),void (* destroy)(zoneid_t,void *))6115880Snordmark zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
6125880Snordmark     void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
6135880Snordmark {
6145880Snordmark 	struct zsd_entry *zsdp;
6155880Snordmark 	struct zsd_entry *t;
6165880Snordmark 	struct zone *zone;
6175880Snordmark 	zone_key_t  key;
6185880Snordmark 
6195880Snordmark 	zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
6205880Snordmark 	zsdp->zsd_data = NULL;
6215880Snordmark 	zsdp->zsd_create = create;
6225880Snordmark 	zsdp->zsd_shutdown = shutdown;
6235880Snordmark 	zsdp->zsd_destroy = destroy;
6245880Snordmark 
6255880Snordmark 	/*
6265880Snordmark 	 * Insert in global list of callbacks. Makes future zone creations
6275880Snordmark 	 * see it.
6285880Snordmark 	 */
6295880Snordmark 	mutex_enter(&zsd_key_lock);
63010865SPramod.Batni@Sun.COM 	key = zsdp->zsd_key = ++zsd_keyval;
6315880Snordmark 	ASSERT(zsd_keyval != 0);
6325880Snordmark 	list_insert_tail(&zsd_registered_keys, zsdp);
6335880Snordmark 	mutex_exit(&zsd_key_lock);
6345880Snordmark 
6355880Snordmark 	/*
6365880Snordmark 	 * Insert for all existing zones and mark them as needing
6375880Snordmark 	 * a create callback.
6385880Snordmark 	 */
6395880Snordmark 	mutex_enter(&zonehash_lock);	/* stop the world */
6405880Snordmark 	for (zone = list_head(&zone_active); zone != NULL;
6415880Snordmark 	    zone = list_next(&zone_active, zone)) {
6425880Snordmark 		zone_status_t status;
6435880Snordmark 
6445880Snordmark 		mutex_enter(&zone->zone_lock);
6455880Snordmark 
6465880Snordmark 		/* Skip zones that are on the way down or not yet up */
6475880Snordmark 		status = zone_status_get(zone);
6485880Snordmark 		if (status >= ZONE_IS_DOWN ||
6495880Snordmark 		    status == ZONE_IS_UNINITIALIZED) {
6505880Snordmark 			mutex_exit(&zone->zone_lock);
6515880Snordmark 			continue;
6525880Snordmark 		}
6535880Snordmark 
6545880Snordmark 		t = zsd_find_mru(&zone->zone_zsd, key);
6555880Snordmark 		if (t != NULL) {
6565880Snordmark 			/*
6575880Snordmark 			 * A zsd_configure already inserted it after
6585880Snordmark 			 * we dropped zsd_key_lock above.
6595880Snordmark 			 */
6605880Snordmark 			mutex_exit(&zone->zone_lock);
6615880Snordmark 			continue;
6625880Snordmark 		}
6635880Snordmark 		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
6645880Snordmark 		t->zsd_key = key;
6655880Snordmark 		t->zsd_create = create;
6665880Snordmark 		t->zsd_shutdown = shutdown;
6675880Snordmark 		t->zsd_destroy = destroy;
6685880Snordmark 		if (create != NULL) {
6695880Snordmark 			t->zsd_flags = ZSD_CREATE_NEEDED;
6705880Snordmark 			DTRACE_PROBE2(zsd__create__needed,
6715880Snordmark 			    zone_t *, zone, zone_key_t, key);
6725880Snordmark 		}
6735880Snordmark 		list_insert_tail(&zone->zone_zsd, t);
6745880Snordmark 		mutex_exit(&zone->zone_lock);
6755880Snordmark 	}
6765880Snordmark 	mutex_exit(&zonehash_lock);
6775880Snordmark 
6785880Snordmark 	if (create != NULL) {
6795880Snordmark 		/* Now call the create callback for this key */
6805880Snordmark 		zsd_apply_all_zones(zsd_apply_create, key);
6815880Snordmark 	}
68210865SPramod.Batni@Sun.COM 	/*
68310910SRobert.Harris@Sun.COM 	 * It is safe for consumers to use the key now, make it
68410910SRobert.Harris@Sun.COM 	 * globally visible. Specifically zone_getspecific() will
68510910SRobert.Harris@Sun.COM 	 * always successfully return the zone specific data associated
68610910SRobert.Harris@Sun.COM 	 * with the key.
68710910SRobert.Harris@Sun.COM 	 */
68810865SPramod.Batni@Sun.COM 	*keyp = key;
68910865SPramod.Batni@Sun.COM 
6905880Snordmark }
6915880Snordmark 
6920Sstevel@tonic-gate /*
6930Sstevel@tonic-gate  * Function called when a module is being unloaded, or otherwise wishes
6940Sstevel@tonic-gate  * to unregister its ZSD key and callbacks.
6955880Snordmark  *
6965880Snordmark  * Remove from the global list and determine the functions that need to
6975880Snordmark  * be called under a global lock. Then call the functions without
6985880Snordmark  * holding any locks. Finally free up the zone_zsd entries. (The apply
6995880Snordmark  * functions need to access the zone_zsd entries to find zsd_data etc.)
7000Sstevel@tonic-gate  */
7010Sstevel@tonic-gate int
zone_key_delete(zone_key_t key)7020Sstevel@tonic-gate zone_key_delete(zone_key_t key)
7030Sstevel@tonic-gate {
7040Sstevel@tonic-gate 	struct zsd_entry *zsdp = NULL;
7050Sstevel@tonic-gate 	zone_t *zone;
7060Sstevel@tonic-gate 
7070Sstevel@tonic-gate 	mutex_enter(&zsd_key_lock);
7085880Snordmark 	zsdp = zsd_find_mru(&zsd_registered_keys, key);
7095880Snordmark 	if (zsdp == NULL) {
7105880Snordmark 		mutex_exit(&zsd_key_lock);
7115880Snordmark 		return (-1);
7125880Snordmark 	}
7130Sstevel@tonic-gate 	list_remove(&zsd_registered_keys, zsdp);
7140Sstevel@tonic-gate 	mutex_exit(&zsd_key_lock);
7150Sstevel@tonic-gate 
7165880Snordmark 	mutex_enter(&zonehash_lock);
7170Sstevel@tonic-gate 	for (zone = list_head(&zone_active); zone != NULL;
7180Sstevel@tonic-gate 	    zone = list_next(&zone_active, zone)) {
7190Sstevel@tonic-gate 		struct zsd_entry *del;
7205880Snordmark 
7215880Snordmark 		mutex_enter(&zone->zone_lock);
7225880Snordmark 		del = zsd_find_mru(&zone->zone_zsd, key);
7235880Snordmark 		if (del == NULL) {
7245880Snordmark 			/*
7255880Snordmark 			 * Somebody else got here first e.g the zone going
7265880Snordmark 			 * away.
7275880Snordmark 			 */
7285880Snordmark 			mutex_exit(&zone->zone_lock);
7295880Snordmark 			continue;
7305880Snordmark 		}
7315880Snordmark 		ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
7325880Snordmark 		ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
7335880Snordmark 		if (del->zsd_shutdown != NULL &&
7345880Snordmark 		    (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
7355880Snordmark 			del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
7365880Snordmark 			DTRACE_PROBE2(zsd__shutdown__needed,
7375880Snordmark 			    zone_t *, zone, zone_key_t, key);
7385880Snordmark 		}
7395880Snordmark 		if (del->zsd_destroy != NULL &&
7405880Snordmark 		    (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
7415880Snordmark 			del->zsd_flags |= ZSD_DESTROY_NEEDED;
7425880Snordmark 			DTRACE_PROBE2(zsd__destroy__needed,
7435880Snordmark 			    zone_t *, zone, zone_key_t, key);
7440Sstevel@tonic-gate 		}
7450Sstevel@tonic-gate 		mutex_exit(&zone->zone_lock);
7460Sstevel@tonic-gate 	}
7470Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
7480Sstevel@tonic-gate 	kmem_free(zsdp, sizeof (*zsdp));
7495880Snordmark 
7505880Snordmark 	/* Now call the shutdown and destroy callback for this key */
7515880Snordmark 	zsd_apply_all_zones(zsd_apply_shutdown, key);
7525880Snordmark 	zsd_apply_all_zones(zsd_apply_destroy, key);
7535880Snordmark 
7545880Snordmark 	/* Now we can free up the zsdp structures in each zone */
7555880Snordmark 	mutex_enter(&zonehash_lock);
7560Sstevel@tonic-gate 	for (zone = list_head(&zone_active); zone != NULL;
7575880Snordmark 	    zone = list_next(&zone_active, zone)) {
7585880Snordmark 		struct zsd_entry *del;
7595880Snordmark 
7605880Snordmark 		mutex_enter(&zone->zone_lock);
7615880Snordmark 		del = zsd_find(&zone->zone_zsd, key);
7625880Snordmark 		if (del != NULL) {
7635880Snordmark 			list_remove(&zone->zone_zsd, del);
7645880Snordmark 			ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
7655880Snordmark 			kmem_free(del, sizeof (*del));
7665880Snordmark 		}
7670Sstevel@tonic-gate 		mutex_exit(&zone->zone_lock);
7685880Snordmark 	}
7690Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
7705880Snordmark 
7715880Snordmark 	return (0);
7720Sstevel@tonic-gate }
7730Sstevel@tonic-gate 
/*
 * ZSD counterpart of pthread_setspecific().
 *
 * Since all zsd callbacks, including those with no create function,
 * have an entry in zone_zsd, if the key is registered it is part of
 * the zone_zsd list.
 * Return an error if the key wasn't registered.
 */
7820Sstevel@tonic-gate int
zone_setspecific(zone_key_t key,zone_t * zone,const void * data)7830Sstevel@tonic-gate zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
7840Sstevel@tonic-gate {
7850Sstevel@tonic-gate 	struct zsd_entry *t;
7860Sstevel@tonic-gate 
7870Sstevel@tonic-gate 	mutex_enter(&zone->zone_lock);
7885880Snordmark 	t = zsd_find_mru(&zone->zone_zsd, key);
7890Sstevel@tonic-gate 	if (t != NULL) {
7900Sstevel@tonic-gate 		/*
7910Sstevel@tonic-gate 		 * Replace old value with new
7920Sstevel@tonic-gate 		 */
7930Sstevel@tonic-gate 		t->zsd_data = (void *)data;
7940Sstevel@tonic-gate 		mutex_exit(&zone->zone_lock);
7950Sstevel@tonic-gate 		return (0);
7960Sstevel@tonic-gate 	}
7970Sstevel@tonic-gate 	mutex_exit(&zone->zone_lock);
7985880Snordmark 	return (-1);
7990Sstevel@tonic-gate }
8000Sstevel@tonic-gate 
8010Sstevel@tonic-gate /*
8020Sstevel@tonic-gate  * ZSD counterpart of pthread_getspecific().
8030Sstevel@tonic-gate  */
8040Sstevel@tonic-gate void *
zone_getspecific(zone_key_t key,zone_t * zone)8050Sstevel@tonic-gate zone_getspecific(zone_key_t key, zone_t *zone)
8060Sstevel@tonic-gate {
8070Sstevel@tonic-gate 	struct zsd_entry *t;
8080Sstevel@tonic-gate 	void *data;
8090Sstevel@tonic-gate 
8100Sstevel@tonic-gate 	mutex_enter(&zone->zone_lock);
8115880Snordmark 	t = zsd_find_mru(&zone->zone_zsd, key);
8120Sstevel@tonic-gate 	data = (t == NULL ? NULL : t->zsd_data);
8130Sstevel@tonic-gate 	mutex_exit(&zone->zone_lock);
8140Sstevel@tonic-gate 	return (data);
8150Sstevel@tonic-gate }
8160Sstevel@tonic-gate 
8170Sstevel@tonic-gate /*
8180Sstevel@tonic-gate  * Function used to initialize a zone's list of ZSD callbacks and data
8190Sstevel@tonic-gate  * when the zone is being created.  The callbacks are initialized from
8205880Snordmark  * the template list (zsd_registered_keys). The constructor callback is
8215880Snordmark  * executed later (once the zone exists and with locks dropped).
8220Sstevel@tonic-gate  */
8230Sstevel@tonic-gate static void
zone_zsd_configure(zone_t * zone)8240Sstevel@tonic-gate zone_zsd_configure(zone_t *zone)
8250Sstevel@tonic-gate {
8260Sstevel@tonic-gate 	struct zsd_entry *zsdp;
8270Sstevel@tonic-gate 	struct zsd_entry *t;
8280Sstevel@tonic-gate 
8290Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&zonehash_lock));
8300Sstevel@tonic-gate 	ASSERT(list_head(&zone->zone_zsd) == NULL);
8315880Snordmark 	mutex_enter(&zone->zone_lock);
8320Sstevel@tonic-gate 	mutex_enter(&zsd_key_lock);
8330Sstevel@tonic-gate 	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
8340Sstevel@tonic-gate 	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
8355880Snordmark 		/*
8365880Snordmark 		 * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
8375880Snordmark 		 * should not have added anything to it.
8385880Snordmark 		 */
8395880Snordmark 		ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
8405880Snordmark 
8415880Snordmark 		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
8425880Snordmark 		t->zsd_key = zsdp->zsd_key;
8435880Snordmark 		t->zsd_create = zsdp->zsd_create;
8445880Snordmark 		t->zsd_shutdown = zsdp->zsd_shutdown;
8455880Snordmark 		t->zsd_destroy = zsdp->zsd_destroy;
8460Sstevel@tonic-gate 		if (zsdp->zsd_create != NULL) {
8475880Snordmark 			t->zsd_flags = ZSD_CREATE_NEEDED;
8485880Snordmark 			DTRACE_PROBE2(zsd__create__needed,
8495880Snordmark 			    zone_t *, zone, zone_key_t, zsdp->zsd_key);
8500Sstevel@tonic-gate 		}
8515880Snordmark 		list_insert_tail(&zone->zone_zsd, t);
8520Sstevel@tonic-gate 	}
8530Sstevel@tonic-gate 	mutex_exit(&zsd_key_lock);
8545880Snordmark 	mutex_exit(&zone->zone_lock);
8550Sstevel@tonic-gate }
8560Sstevel@tonic-gate 
/* Selects which kind of ZSD callback a helper should deal with. */
enum zsd_callback_type {
	ZSD_CREATE,
	ZSD_SHUTDOWN,
	ZSD_DESTROY
};
8580Sstevel@tonic-gate 
8590Sstevel@tonic-gate /*
8600Sstevel@tonic-gate  * Helper function to execute shutdown or destructor callbacks.
8610Sstevel@tonic-gate  */
8620Sstevel@tonic-gate static void
zone_zsd_callbacks(zone_t * zone,enum zsd_callback_type ct)8630Sstevel@tonic-gate zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
8640Sstevel@tonic-gate {
8650Sstevel@tonic-gate 	struct zsd_entry *t;
8660Sstevel@tonic-gate 
8670Sstevel@tonic-gate 	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
8680Sstevel@tonic-gate 	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
8690Sstevel@tonic-gate 	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
8700Sstevel@tonic-gate 
8715880Snordmark 	/*
8725880Snordmark 	 * Run the callback solely based on what is registered for the zone
8735880Snordmark 	 * in zone_zsd. The global list can change independently of this
8745880Snordmark 	 * as keys are registered and unregistered and we don't register new
8755880Snordmark 	 * callbacks for a zone that is in the process of going away.
8765880Snordmark 	 */
8770Sstevel@tonic-gate 	mutex_enter(&zone->zone_lock);
8785880Snordmark 	for (t = list_head(&zone->zone_zsd); t != NULL;
8795880Snordmark 	    t = list_next(&zone->zone_zsd, t)) {
8805880Snordmark 		zone_key_t key = t->zsd_key;
8810Sstevel@tonic-gate 
8820Sstevel@tonic-gate 		/* Skip if no callbacks registered */
8835880Snordmark 
8845880Snordmark 		if (ct == ZSD_SHUTDOWN) {
8855880Snordmark 			if (t->zsd_shutdown != NULL &&
8865880Snordmark 			    (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
8875880Snordmark 				t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
8885880Snordmark 				DTRACE_PROBE2(zsd__shutdown__needed,
8895880Snordmark 				    zone_t *, zone, zone_key_t, key);
8900Sstevel@tonic-gate 			}
8910Sstevel@tonic-gate 		} else {
8925880Snordmark 			if (t->zsd_destroy != NULL &&
8935880Snordmark 			    (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
8945880Snordmark 				t->zsd_flags |= ZSD_DESTROY_NEEDED;
8955880Snordmark 				DTRACE_PROBE2(zsd__destroy__needed,
8965880Snordmark 				    zone_t *, zone, zone_key_t, key);
8970Sstevel@tonic-gate 			}
8980Sstevel@tonic-gate 		}
8990Sstevel@tonic-gate 	}
9005880Snordmark 	mutex_exit(&zone->zone_lock);
9015880Snordmark 
9025880Snordmark 	/* Now call the shutdown and destroy callback for this key */
9035880Snordmark 	zsd_apply_all_keys(zsd_apply_shutdown, zone);
9045880Snordmark 	zsd_apply_all_keys(zsd_apply_destroy, zone);
9055880Snordmark 
9060Sstevel@tonic-gate }
9070Sstevel@tonic-gate 
9080Sstevel@tonic-gate /*
9090Sstevel@tonic-gate  * Called when the zone is going away; free ZSD-related memory, and
9100Sstevel@tonic-gate  * destroy the zone_zsd list.
9110Sstevel@tonic-gate  */
9120Sstevel@tonic-gate static void
zone_free_zsd(zone_t * zone)9130Sstevel@tonic-gate zone_free_zsd(zone_t *zone)
9140Sstevel@tonic-gate {
9150Sstevel@tonic-gate 	struct zsd_entry *t, *next;
9160Sstevel@tonic-gate 
9170Sstevel@tonic-gate 	/*
9180Sstevel@tonic-gate 	 * Free all the zsd_entry's we had on this zone.
9190Sstevel@tonic-gate 	 */
9205880Snordmark 	mutex_enter(&zone->zone_lock);
9210Sstevel@tonic-gate 	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
9220Sstevel@tonic-gate 		next = list_next(&zone->zone_zsd, t);
9230Sstevel@tonic-gate 		list_remove(&zone->zone_zsd, t);
9245880Snordmark 		ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
9250Sstevel@tonic-gate 		kmem_free(t, sizeof (*t));
9260Sstevel@tonic-gate 	}
9270Sstevel@tonic-gate 	list_destroy(&zone->zone_zsd);
9285880Snordmark 	mutex_exit(&zone->zone_lock);
9295880Snordmark 
9305880Snordmark }
9315880Snordmark 
9325880Snordmark /*
9335880Snordmark  * Apply a function to all zones for particular key value.
9345880Snordmark  *
9355880Snordmark  * The applyfn has to drop zonehash_lock if it does some work, and
9365880Snordmark  * then reacquire it before it returns.
9375880Snordmark  * When the lock is dropped we don't follow list_next even
9385880Snordmark  * if it is possible to do so without any hazards. This is
9395880Snordmark  * because we want the design to allow for the list of zones
9405880Snordmark  * to change in any arbitrary way during the time the
9415880Snordmark  * lock was dropped.
9425880Snordmark  *
9435880Snordmark  * It is safe to restart the loop at list_head since the applyfn
9445880Snordmark  * changes the zsd_flags as it does work, so a subsequent
9455880Snordmark  * pass through will have no effect in applyfn, hence the loop will terminate
9465880Snordmark  * in at worst O(N^2).
9475880Snordmark  */
9485880Snordmark static void
zsd_apply_all_zones(zsd_applyfn_t * applyfn,zone_key_t key)9495880Snordmark zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
9505880Snordmark {
9515880Snordmark 	zone_t *zone;
9525880Snordmark 
9535880Snordmark 	mutex_enter(&zonehash_lock);
9545880Snordmark 	zone = list_head(&zone_active);
9555880Snordmark 	while (zone != NULL) {
9565880Snordmark 		if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
9575880Snordmark 			/* Lock dropped - restart at head */
9585880Snordmark 			zone = list_head(&zone_active);
9595880Snordmark 		} else {
9605880Snordmark 			zone = list_next(&zone_active, zone);
9615880Snordmark 		}
9625880Snordmark 	}
9635880Snordmark 	mutex_exit(&zonehash_lock);
9645880Snordmark }
9655880Snordmark 
/*
 * Apply a function to all keys for a particular zone.
 *
 * The applyfn has to drop zone_lock if it does some work, and
 * then reacquire it before it returns.
 * When the lock is dropped we don't follow list_next even
 * if it is possible to do so without any hazards. This is
 * because we want the design to allow for the list of zsd callbacks
 * to change in any arbitrary way during the time the
 * lock was dropped.
 *
 * It is safe to restart the loop at list_head since the applyfn
 * changes the zsd_flags as it does work, so a subsequent
 * pass through will have no effect in applyfn, hence the loop will terminate
 * in at worst O(N^2).
 */
9825880Snordmark static void
zsd_apply_all_keys(zsd_applyfn_t * applyfn,zone_t * zone)9835880Snordmark zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
9845880Snordmark {
9855880Snordmark 	struct zsd_entry *t;
9865880Snordmark 
9875880Snordmark 	mutex_enter(&zone->zone_lock);
9885880Snordmark 	t = list_head(&zone->zone_zsd);
9895880Snordmark 	while (t != NULL) {
9905880Snordmark 		if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
9915880Snordmark 			/* Lock dropped - restart at head */
9925880Snordmark 			t = list_head(&zone->zone_zsd);
9935880Snordmark 		} else {
9945880Snordmark 			t = list_next(&zone->zone_zsd, t);
9955880Snordmark 		}
9965880Snordmark 	}
9975880Snordmark 	mutex_exit(&zone->zone_lock);
9985880Snordmark }
9995880Snordmark 
/*
 * Call the create function for the zone and key if CREATE_NEEDED
 * is set.
 * If some other thread gets here first and sets CREATE_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the create function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
10135880Snordmark static boolean_t
zsd_apply_create(kmutex_t * lockp,boolean_t zone_lock_held,zone_t * zone,zone_key_t key)10145880Snordmark zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
10155880Snordmark     zone_t *zone, zone_key_t key)
10165880Snordmark {
10175880Snordmark 	void *result;
10185880Snordmark 	struct zsd_entry *t;
10195880Snordmark 	boolean_t dropped;
10205880Snordmark 
10215880Snordmark 	if (lockp != NULL) {
10225880Snordmark 		ASSERT(MUTEX_HELD(lockp));
10235880Snordmark 	}
10245880Snordmark 	if (zone_lock_held) {
10255880Snordmark 		ASSERT(MUTEX_HELD(&zone->zone_lock));
10265880Snordmark 	} else {
10275880Snordmark 		mutex_enter(&zone->zone_lock);
10285880Snordmark 	}
10295880Snordmark 
10305880Snordmark 	t = zsd_find(&zone->zone_zsd, key);
10315880Snordmark 	if (t == NULL) {
10325880Snordmark 		/*
10335880Snordmark 		 * Somebody else got here first e.g the zone going
10345880Snordmark 		 * away.
10355880Snordmark 		 */
10365880Snordmark 		if (!zone_lock_held)
10375880Snordmark 			mutex_exit(&zone->zone_lock);
10385880Snordmark 		return (B_FALSE);
10395880Snordmark 	}
10405880Snordmark 	dropped = B_FALSE;
10415880Snordmark 	if (zsd_wait_for_inprogress(zone, t, lockp))
10425880Snordmark 		dropped = B_TRUE;
10435880Snordmark 
10445880Snordmark 	if (t->zsd_flags & ZSD_CREATE_NEEDED) {
10455880Snordmark 		t->zsd_flags &= ~ZSD_CREATE_NEEDED;
10465880Snordmark 		t->zsd_flags |= ZSD_CREATE_INPROGRESS;
10475880Snordmark 		DTRACE_PROBE2(zsd__create__inprogress,
10485880Snordmark 		    zone_t *, zone, zone_key_t, key);
10495880Snordmark 		mutex_exit(&zone->zone_lock);
10505880Snordmark 		if (lockp != NULL)
10515880Snordmark 			mutex_exit(lockp);
10525880Snordmark 
10535880Snordmark 		dropped = B_TRUE;
10545880Snordmark 		ASSERT(t->zsd_create != NULL);
10555880Snordmark 		DTRACE_PROBE2(zsd__create__start,
10565880Snordmark 		    zone_t *, zone, zone_key_t, key);
10575880Snordmark 
10585880Snordmark 		result = (*t->zsd_create)(zone->zone_id);
10595880Snordmark 
10605880Snordmark 		DTRACE_PROBE2(zsd__create__end,
10615880Snordmark 		    zone_t *, zone, voidn *, result);
10625880Snordmark 
10635880Snordmark 		ASSERT(result != NULL);
10645880Snordmark 		if (lockp != NULL)
10655880Snordmark 			mutex_enter(lockp);
10665880Snordmark 		mutex_enter(&zone->zone_lock);
10675880Snordmark 		t->zsd_data = result;
10685880Snordmark 		t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
10695880Snordmark 		t->zsd_flags |= ZSD_CREATE_COMPLETED;
10705880Snordmark 		cv_broadcast(&t->zsd_cv);
10715880Snordmark 		DTRACE_PROBE2(zsd__create__completed,
10725880Snordmark 		    zone_t *, zone, zone_key_t, key);
10735880Snordmark 	}
10745880Snordmark 	if (!zone_lock_held)
10755880Snordmark 		mutex_exit(&zone->zone_lock);
10765880Snordmark 	return (dropped);
10775880Snordmark }
10785880Snordmark 
/*
 * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the shutdown function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
10925880Snordmark static boolean_t
zsd_apply_shutdown(kmutex_t * lockp,boolean_t zone_lock_held,zone_t * zone,zone_key_t key)10935880Snordmark zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
10945880Snordmark     zone_t *zone, zone_key_t key)
10955880Snordmark {
10965880Snordmark 	struct zsd_entry *t;
10975880Snordmark 	void *data;
10985880Snordmark 	boolean_t dropped;
10995880Snordmark 
11005880Snordmark 	if (lockp != NULL) {
11015880Snordmark 		ASSERT(MUTEX_HELD(lockp));
11025880Snordmark 	}
11035880Snordmark 	if (zone_lock_held) {
11045880Snordmark 		ASSERT(MUTEX_HELD(&zone->zone_lock));
11055880Snordmark 	} else {
11065880Snordmark 		mutex_enter(&zone->zone_lock);
11075880Snordmark 	}
11085880Snordmark 
11095880Snordmark 	t = zsd_find(&zone->zone_zsd, key);
11105880Snordmark 	if (t == NULL) {
11115880Snordmark 		/*
11125880Snordmark 		 * Somebody else got here first e.g the zone going
11135880Snordmark 		 * away.
11145880Snordmark 		 */
11155880Snordmark 		if (!zone_lock_held)
11165880Snordmark 			mutex_exit(&zone->zone_lock);
11175880Snordmark 		return (B_FALSE);
11185880Snordmark 	}
11195880Snordmark 	dropped = B_FALSE;
11205880Snordmark 	if (zsd_wait_for_creator(zone, t, lockp))
11215880Snordmark 		dropped = B_TRUE;
11225880Snordmark 
11235880Snordmark 	if (zsd_wait_for_inprogress(zone, t, lockp))
11245880Snordmark 		dropped = B_TRUE;
11255880Snordmark 
11265880Snordmark 	if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
11275880Snordmark 		t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
11285880Snordmark 		t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
11295880Snordmark 		DTRACE_PROBE2(zsd__shutdown__inprogress,
11305880Snordmark 		    zone_t *, zone, zone_key_t, key);
11315880Snordmark 		mutex_exit(&zone->zone_lock);
11325880Snordmark 		if (lockp != NULL)
11335880Snordmark 			mutex_exit(lockp);
11345880Snordmark 		dropped = B_TRUE;
11355880Snordmark 
11365880Snordmark 		ASSERT(t->zsd_shutdown != NULL);
11375880Snordmark 		data = t->zsd_data;
11385880Snordmark 
11395880Snordmark 		DTRACE_PROBE2(zsd__shutdown__start,
11405880Snordmark 		    zone_t *, zone, zone_key_t, key);
11415880Snordmark 
11425880Snordmark 		(t->zsd_shutdown)(zone->zone_id, data);
11435880Snordmark 		DTRACE_PROBE2(zsd__shutdown__end,
11445880Snordmark 		    zone_t *, zone, zone_key_t, key);
11455880Snordmark 
11465880Snordmark 		if (lockp != NULL)
11475880Snordmark 			mutex_enter(lockp);
11485880Snordmark 		mutex_enter(&zone->zone_lock);
11495880Snordmark 		t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
11505880Snordmark 		t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
11515880Snordmark 		cv_broadcast(&t->zsd_cv);
11525880Snordmark 		DTRACE_PROBE2(zsd__shutdown__completed,
11535880Snordmark 		    zone_t *, zone, zone_key_t, key);
11545880Snordmark 	}
11555880Snordmark 	if (!zone_lock_held)
11565880Snordmark 		mutex_exit(&zone->zone_lock);
11575880Snordmark 	return (dropped);
11585880Snordmark }
11595880Snordmark 
/*
 * Call the destroy function for the zone and key if DESTROY_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the destroy function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
11735880Snordmark static boolean_t
zsd_apply_destroy(kmutex_t * lockp,boolean_t zone_lock_held,zone_t * zone,zone_key_t key)11745880Snordmark zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
11755880Snordmark     zone_t *zone, zone_key_t key)
11765880Snordmark {
11775880Snordmark 	struct zsd_entry *t;
11785880Snordmark 	void *data;
11795880Snordmark 	boolean_t dropped;
11805880Snordmark 
11815880Snordmark 	if (lockp != NULL) {
11825880Snordmark 		ASSERT(MUTEX_HELD(lockp));
11835880Snordmark 	}
11845880Snordmark 	if (zone_lock_held) {
11855880Snordmark 		ASSERT(MUTEX_HELD(&zone->zone_lock));
11865880Snordmark 	} else {
11875880Snordmark 		mutex_enter(&zone->zone_lock);
11885880Snordmark 	}
11895880Snordmark 
11905880Snordmark 	t = zsd_find(&zone->zone_zsd, key);
11915880Snordmark 	if (t == NULL) {
11925880Snordmark 		/*
11935880Snordmark 		 * Somebody else got here first e.g the zone going
11945880Snordmark 		 * away.
11955880Snordmark 		 */
11965880Snordmark 		if (!zone_lock_held)
11975880Snordmark 			mutex_exit(&zone->zone_lock);
11985880Snordmark 		return (B_FALSE);
11995880Snordmark 	}
12005880Snordmark 	dropped = B_FALSE;
12015880Snordmark 	if (zsd_wait_for_creator(zone, t, lockp))
12025880Snordmark 		dropped = B_TRUE;
12035880Snordmark 
12045880Snordmark 	if (zsd_wait_for_inprogress(zone, t, lockp))
12055880Snordmark 		dropped = B_TRUE;
12065880Snordmark 
12075880Snordmark 	if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
12085880Snordmark 		t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
12095880Snordmark 		t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
12105880Snordmark 		DTRACE_PROBE2(zsd__destroy__inprogress,
12115880Snordmark 		    zone_t *, zone, zone_key_t, key);
12125880Snordmark 		mutex_exit(&zone->zone_lock);
12135880Snordmark 		if (lockp != NULL)
12145880Snordmark 			mutex_exit(lockp);
12155880Snordmark 		dropped = B_TRUE;
12165880Snordmark 
12175880Snordmark 		ASSERT(t->zsd_destroy != NULL);
12185880Snordmark 		data = t->zsd_data;
12195880Snordmark 		DTRACE_PROBE2(zsd__destroy__start,
12205880Snordmark 		    zone_t *, zone, zone_key_t, key);
12215880Snordmark 
12225880Snordmark 		(t->zsd_destroy)(zone->zone_id, data);
12235880Snordmark 		DTRACE_PROBE2(zsd__destroy__end,
12245880Snordmark 		    zone_t *, zone, zone_key_t, key);
12255880Snordmark 
12265880Snordmark 		if (lockp != NULL)
12275880Snordmark 			mutex_enter(lockp);
12285880Snordmark 		mutex_enter(&zone->zone_lock);
12295880Snordmark 		t->zsd_data = NULL;
12305880Snordmark 		t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
12315880Snordmark 		t->zsd_flags |= ZSD_DESTROY_COMPLETED;
12325880Snordmark 		cv_broadcast(&t->zsd_cv);
12335880Snordmark 		DTRACE_PROBE2(zsd__destroy__completed,
12345880Snordmark 		    zone_t *, zone, zone_key_t, key);
12355880Snordmark 	}
12365880Snordmark 	if (!zone_lock_held)
12375880Snordmark 		mutex_exit(&zone->zone_lock);
12385880Snordmark 	return (dropped);
12395880Snordmark }
12405880Snordmark 
12415880Snordmark /*
12425880Snordmark  * Wait for any CREATE_NEEDED flag to be cleared.
12435880Snordmark  * Returns true if lockp was temporarily dropped while waiting.
12445880Snordmark  */
12455880Snordmark static boolean_t
zsd_wait_for_creator(zone_t * zone,struct zsd_entry * t,kmutex_t * lockp)12465880Snordmark zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
12475880Snordmark {
12485880Snordmark 	boolean_t dropped = B_FALSE;
12495880Snordmark 
12505880Snordmark 	while (t->zsd_flags & ZSD_CREATE_NEEDED) {
12515880Snordmark 		DTRACE_PROBE2(zsd__wait__for__creator,
12525880Snordmark 		    zone_t *, zone, struct zsd_entry *, t);
12535880Snordmark 		if (lockp != NULL) {
12545880Snordmark 			dropped = B_TRUE;
12555880Snordmark 			mutex_exit(lockp);
12565880Snordmark 		}
12575880Snordmark 		cv_wait(&t->zsd_cv, &zone->zone_lock);
12585880Snordmark 		if (lockp != NULL) {
12595880Snordmark 			/* First drop zone_lock to preserve order */
12605880Snordmark 			mutex_exit(&zone->zone_lock);
12615880Snordmark 			mutex_enter(lockp);
12625880Snordmark 			mutex_enter(&zone->zone_lock);
12635880Snordmark 		}
12645880Snordmark 	}
12655880Snordmark 	return (dropped);
12665880Snordmark }
12675880Snordmark 
12685880Snordmark /*
12695880Snordmark  * Wait for any INPROGRESS flag to be cleared.
12705880Snordmark  * Returns true if lockp was temporarily dropped while waiting.
12715880Snordmark  */
12725880Snordmark static boolean_t
zsd_wait_for_inprogress(zone_t * zone,struct zsd_entry * t,kmutex_t * lockp)12735880Snordmark zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
12745880Snordmark {
12755880Snordmark 	boolean_t dropped = B_FALSE;
12765880Snordmark 
12775880Snordmark 	while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
12785880Snordmark 		DTRACE_PROBE2(zsd__wait__for__inprogress,
12795880Snordmark 		    zone_t *, zone, struct zsd_entry *, t);
12805880Snordmark 		if (lockp != NULL) {
12815880Snordmark 			dropped = B_TRUE;
12825880Snordmark 			mutex_exit(lockp);
12835880Snordmark 		}
12845880Snordmark 		cv_wait(&t->zsd_cv, &zone->zone_lock);
12855880Snordmark 		if (lockp != NULL) {
12865880Snordmark 			/* First drop zone_lock to preserve order */
12875880Snordmark 			mutex_exit(&zone->zone_lock);
12885880Snordmark 			mutex_enter(lockp);
12895880Snordmark 			mutex_enter(&zone->zone_lock);
12905880Snordmark 		}
12915880Snordmark 	}
12925880Snordmark 	return (dropped);
12930Sstevel@tonic-gate }
12940Sstevel@tonic-gate 
12950Sstevel@tonic-gate /*
1296789Sahrens  * Frees memory associated with the zone dataset list.
1297789Sahrens  */
1298789Sahrens static void
zone_free_datasets(zone_t * zone)1299789Sahrens zone_free_datasets(zone_t *zone)
1300789Sahrens {
1301789Sahrens 	zone_dataset_t *t, *next;
1302789Sahrens 
1303789Sahrens 	for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1304789Sahrens 		next = list_next(&zone->zone_datasets, t);
1305789Sahrens 		list_remove(&zone->zone_datasets, t);
1306789Sahrens 		kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1307789Sahrens 		kmem_free(t, sizeof (*t));
1308789Sahrens 	}
1309789Sahrens 	list_destroy(&zone->zone_datasets);
1310789Sahrens }
1311789Sahrens 
1312789Sahrens /*
13130Sstevel@tonic-gate  * zone.cpu-shares resource control support.
13140Sstevel@tonic-gate  */
13150Sstevel@tonic-gate /*ARGSUSED*/
13160Sstevel@tonic-gate static rctl_qty_t
zone_cpu_shares_usage(rctl_t * rctl,struct proc * p)13170Sstevel@tonic-gate zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
13180Sstevel@tonic-gate {
13190Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&p->p_lock));
13200Sstevel@tonic-gate 	return (p->p_zone->zone_shares);
13210Sstevel@tonic-gate }
13220Sstevel@tonic-gate 
13230Sstevel@tonic-gate /*ARGSUSED*/
13240Sstevel@tonic-gate static int
zone_cpu_shares_set(rctl_t * rctl,struct proc * p,rctl_entity_p_t * e,rctl_qty_t nv)13250Sstevel@tonic-gate zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
13260Sstevel@tonic-gate     rctl_qty_t nv)
13270Sstevel@tonic-gate {
13280Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&p->p_lock));
13290Sstevel@tonic-gate 	ASSERT(e->rcep_t == RCENTITY_ZONE);
13300Sstevel@tonic-gate 	if (e->rcep_p.zone == NULL)
13310Sstevel@tonic-gate 		return (0);
13320Sstevel@tonic-gate 
13330Sstevel@tonic-gate 	e->rcep_p.zone->zone_shares = nv;
13340Sstevel@tonic-gate 	return (0);
13350Sstevel@tonic-gate }
13360Sstevel@tonic-gate 
13370Sstevel@tonic-gate static rctl_ops_t zone_cpu_shares_ops = {
13380Sstevel@tonic-gate 	rcop_no_action,
13390Sstevel@tonic-gate 	zone_cpu_shares_usage,
13400Sstevel@tonic-gate 	zone_cpu_shares_set,
13410Sstevel@tonic-gate 	rcop_no_test
13420Sstevel@tonic-gate };
13430Sstevel@tonic-gate 
13443792Sakolb /*
13453792Sakolb  * zone.cpu-cap resource control support.
13463792Sakolb  */
13473792Sakolb /*ARGSUSED*/
13483792Sakolb static rctl_qty_t
zone_cpu_cap_get(rctl_t * rctl,struct proc * p)13493792Sakolb zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
13503792Sakolb {
13513792Sakolb 	ASSERT(MUTEX_HELD(&p->p_lock));
13523792Sakolb 	return (cpucaps_zone_get(p->p_zone));
13533792Sakolb }
13543792Sakolb 
13553792Sakolb /*ARGSUSED*/
13563792Sakolb static int
zone_cpu_cap_set(rctl_t * rctl,struct proc * p,rctl_entity_p_t * e,rctl_qty_t nv)13573792Sakolb zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
13583792Sakolb     rctl_qty_t nv)
13593792Sakolb {
13603792Sakolb 	zone_t *zone = e->rcep_p.zone;
13613792Sakolb 
13623792Sakolb 	ASSERT(MUTEX_HELD(&p->p_lock));
13633792Sakolb 	ASSERT(e->rcep_t == RCENTITY_ZONE);
13643792Sakolb 
13653792Sakolb 	if (zone == NULL)
13663792Sakolb 		return (0);
13673792Sakolb 
13683792Sakolb 	/*
13693792Sakolb 	 * set cap to the new value.
13703792Sakolb 	 */
13713792Sakolb 	return (cpucaps_zone_set(zone, nv));
13723792Sakolb }
13733792Sakolb 
13743792Sakolb static rctl_ops_t zone_cpu_cap_ops = {
13753792Sakolb 	rcop_no_action,
13763792Sakolb 	zone_cpu_cap_get,
13773792Sakolb 	zone_cpu_cap_set,
13783792Sakolb 	rcop_no_test
13793792Sakolb };
13803792Sakolb 
13810Sstevel@tonic-gate /*ARGSUSED*/
13820Sstevel@tonic-gate static rctl_qty_t
zone_lwps_usage(rctl_t * r,proc_t * p)13830Sstevel@tonic-gate zone_lwps_usage(rctl_t *r, proc_t *p)
13840Sstevel@tonic-gate {
13850Sstevel@tonic-gate 	rctl_qty_t nlwps;
13860Sstevel@tonic-gate 	zone_t *zone = p->p_zone;
13870Sstevel@tonic-gate 
13880Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&p->p_lock));
13890Sstevel@tonic-gate 
13900Sstevel@tonic-gate 	mutex_enter(&zone->zone_nlwps_lock);
13910Sstevel@tonic-gate 	nlwps = zone->zone_nlwps;
13920Sstevel@tonic-gate 	mutex_exit(&zone->zone_nlwps_lock);
13930Sstevel@tonic-gate 
13940Sstevel@tonic-gate 	return (nlwps);
13950Sstevel@tonic-gate }
13960Sstevel@tonic-gate 
13970Sstevel@tonic-gate /*ARGSUSED*/
13980Sstevel@tonic-gate static int
zone_lwps_test(rctl_t * r,proc_t * p,rctl_entity_p_t * e,rctl_val_t * rcntl,rctl_qty_t incr,uint_t flags)13990Sstevel@tonic-gate zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
14000Sstevel@tonic-gate     rctl_qty_t incr, uint_t flags)
14010Sstevel@tonic-gate {
14020Sstevel@tonic-gate 	rctl_qty_t nlwps;
14030Sstevel@tonic-gate 
14040Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&p->p_lock));
14050Sstevel@tonic-gate 	ASSERT(e->rcep_t == RCENTITY_ZONE);
14060Sstevel@tonic-gate 	if (e->rcep_p.zone == NULL)
14070Sstevel@tonic-gate 		return (0);
14080Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
14090Sstevel@tonic-gate 	nlwps = e->rcep_p.zone->zone_nlwps;
14100Sstevel@tonic-gate 
14110Sstevel@tonic-gate 	if (nlwps + incr > rcntl->rcv_value)
14120Sstevel@tonic-gate 		return (1);
14130Sstevel@tonic-gate 
14140Sstevel@tonic-gate 	return (0);
14150Sstevel@tonic-gate }
14160Sstevel@tonic-gate 
14170Sstevel@tonic-gate /*ARGSUSED*/
14180Sstevel@tonic-gate static int
zone_lwps_set(rctl_t * rctl,struct proc * p,rctl_entity_p_t * e,rctl_qty_t nv)14192768Ssl108498 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
14202768Ssl108498 {
14210Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&p->p_lock));
14220Sstevel@tonic-gate 	ASSERT(e->rcep_t == RCENTITY_ZONE);
14230Sstevel@tonic-gate 	if (e->rcep_p.zone == NULL)
14240Sstevel@tonic-gate 		return (0);
14250Sstevel@tonic-gate 	e->rcep_p.zone->zone_nlwps_ctl = nv;
14260Sstevel@tonic-gate 	return (0);
14270Sstevel@tonic-gate }
14280Sstevel@tonic-gate 
14290Sstevel@tonic-gate static rctl_ops_t zone_lwps_ops = {
14300Sstevel@tonic-gate 	rcop_no_action,
14310Sstevel@tonic-gate 	zone_lwps_usage,
14320Sstevel@tonic-gate 	zone_lwps_set,
14330Sstevel@tonic-gate 	zone_lwps_test,
14340Sstevel@tonic-gate };
14350Sstevel@tonic-gate 
14362677Sml93401 /*ARGSUSED*/
143712725SMenno.Lageman@Sun.COM static rctl_qty_t
zone_procs_usage(rctl_t * r,proc_t * p)143812725SMenno.Lageman@Sun.COM zone_procs_usage(rctl_t *r, proc_t *p)
143912725SMenno.Lageman@Sun.COM {
144012725SMenno.Lageman@Sun.COM 	rctl_qty_t nprocs;
144112725SMenno.Lageman@Sun.COM 	zone_t *zone = p->p_zone;
144212725SMenno.Lageman@Sun.COM 
144312725SMenno.Lageman@Sun.COM 	ASSERT(MUTEX_HELD(&p->p_lock));
144412725SMenno.Lageman@Sun.COM 
144512725SMenno.Lageman@Sun.COM 	mutex_enter(&zone->zone_nlwps_lock);
144612725SMenno.Lageman@Sun.COM 	nprocs = zone->zone_nprocs;
144712725SMenno.Lageman@Sun.COM 	mutex_exit(&zone->zone_nlwps_lock);
144812725SMenno.Lageman@Sun.COM 
144912725SMenno.Lageman@Sun.COM 	return (nprocs);
145012725SMenno.Lageman@Sun.COM }
145112725SMenno.Lageman@Sun.COM 
145212725SMenno.Lageman@Sun.COM /*ARGSUSED*/
145312725SMenno.Lageman@Sun.COM static int
zone_procs_test(rctl_t * r,proc_t * p,rctl_entity_p_t * e,rctl_val_t * rcntl,rctl_qty_t incr,uint_t flags)145412725SMenno.Lageman@Sun.COM zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
145512725SMenno.Lageman@Sun.COM     rctl_qty_t incr, uint_t flags)
145612725SMenno.Lageman@Sun.COM {
145712725SMenno.Lageman@Sun.COM 	rctl_qty_t nprocs;
145812725SMenno.Lageman@Sun.COM 
145912725SMenno.Lageman@Sun.COM 	ASSERT(MUTEX_HELD(&p->p_lock));
146012725SMenno.Lageman@Sun.COM 	ASSERT(e->rcep_t == RCENTITY_ZONE);
146112725SMenno.Lageman@Sun.COM 	if (e->rcep_p.zone == NULL)
146212725SMenno.Lageman@Sun.COM 		return (0);
146312725SMenno.Lageman@Sun.COM 	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
146412725SMenno.Lageman@Sun.COM 	nprocs = e->rcep_p.zone->zone_nprocs;
146512725SMenno.Lageman@Sun.COM 
146612725SMenno.Lageman@Sun.COM 	if (nprocs + incr > rcntl->rcv_value)
146712725SMenno.Lageman@Sun.COM 		return (1);
146812725SMenno.Lageman@Sun.COM 
146912725SMenno.Lageman@Sun.COM 	return (0);
147012725SMenno.Lageman@Sun.COM }
147112725SMenno.Lageman@Sun.COM 
147212725SMenno.Lageman@Sun.COM /*ARGSUSED*/
147312725SMenno.Lageman@Sun.COM static int
zone_procs_set(rctl_t * rctl,struct proc * p,rctl_entity_p_t * e,rctl_qty_t nv)147412725SMenno.Lageman@Sun.COM zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
147512725SMenno.Lageman@Sun.COM {
147612725SMenno.Lageman@Sun.COM 	ASSERT(MUTEX_HELD(&p->p_lock));
147712725SMenno.Lageman@Sun.COM 	ASSERT(e->rcep_t == RCENTITY_ZONE);
147812725SMenno.Lageman@Sun.COM 	if (e->rcep_p.zone == NULL)
147912725SMenno.Lageman@Sun.COM 		return (0);
148012725SMenno.Lageman@Sun.COM 	e->rcep_p.zone->zone_nprocs_ctl = nv;
148112725SMenno.Lageman@Sun.COM 	return (0);
148212725SMenno.Lageman@Sun.COM }
148312725SMenno.Lageman@Sun.COM 
148412725SMenno.Lageman@Sun.COM static rctl_ops_t zone_procs_ops = {
148512725SMenno.Lageman@Sun.COM 	rcop_no_action,
148612725SMenno.Lageman@Sun.COM 	zone_procs_usage,
148712725SMenno.Lageman@Sun.COM 	zone_procs_set,
148812725SMenno.Lageman@Sun.COM 	zone_procs_test,
148912725SMenno.Lageman@Sun.COM };
149012725SMenno.Lageman@Sun.COM 
149112725SMenno.Lageman@Sun.COM /*ARGSUSED*/
14922677Sml93401 static int
zone_shmmax_test(rctl_t * r,proc_t * p,rctl_entity_p_t * e,rctl_val_t * rval,rctl_qty_t incr,uint_t flags)14932677Sml93401 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
14942677Sml93401     rctl_qty_t incr, uint_t flags)
14952677Sml93401 {
14962677Sml93401 	rctl_qty_t v;
14972677Sml93401 	ASSERT(MUTEX_HELD(&p->p_lock));
14982677Sml93401 	ASSERT(e->rcep_t == RCENTITY_ZONE);
14992677Sml93401 	v = e->rcep_p.zone->zone_shmmax + incr;
15002677Sml93401 	if (v > rval->rcv_value)
15012677Sml93401 		return (1);
15022677Sml93401 	return (0);
15032677Sml93401 }
15042677Sml93401 
15052677Sml93401 static rctl_ops_t zone_shmmax_ops = {
15062677Sml93401 	rcop_no_action,
15072677Sml93401 	rcop_no_usage,
15082677Sml93401 	rcop_no_set,
15092677Sml93401 	zone_shmmax_test
15102677Sml93401 };
15112677Sml93401 
15122677Sml93401 /*ARGSUSED*/
15132677Sml93401 static int
zone_shmmni_test(rctl_t * r,proc_t * p,rctl_entity_p_t * e,rctl_val_t * rval,rctl_qty_t incr,uint_t flags)15142677Sml93401 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
15152677Sml93401     rctl_qty_t incr, uint_t flags)
15162677Sml93401 {
15172677Sml93401 	rctl_qty_t v;
15182677Sml93401 	ASSERT(MUTEX_HELD(&p->p_lock));
15192677Sml93401 	ASSERT(e->rcep_t == RCENTITY_ZONE);
15202677Sml93401 	v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
15212677Sml93401 	if (v > rval->rcv_value)
15222677Sml93401 		return (1);
15232677Sml93401 	return (0);
15242677Sml93401 }
15252677Sml93401 
15262677Sml93401 static rctl_ops_t zone_shmmni_ops = {
15272677Sml93401 	rcop_no_action,
15282677Sml93401 	rcop_no_usage,
15292677Sml93401 	rcop_no_set,
15302677Sml93401 	zone_shmmni_test
15312677Sml93401 };
15322677Sml93401 
15332677Sml93401 /*ARGSUSED*/
15342677Sml93401 static int
zone_semmni_test(rctl_t * r,proc_t * p,rctl_entity_p_t * e,rctl_val_t * rval,rctl_qty_t incr,uint_t flags)15352677Sml93401 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
15362677Sml93401     rctl_qty_t incr, uint_t flags)
15372677Sml93401 {
15382677Sml93401 	rctl_qty_t v;
15392677Sml93401 	ASSERT(MUTEX_HELD(&p->p_lock));
15402677Sml93401 	ASSERT(e->rcep_t == RCENTITY_ZONE);
15412677Sml93401 	v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
15422677Sml93401 	if (v > rval->rcv_value)
15432677Sml93401 		return (1);
15442677Sml93401 	return (0);
15452677Sml93401 }
15462677Sml93401 
15472677Sml93401 static rctl_ops_t zone_semmni_ops = {
15482677Sml93401 	rcop_no_action,
15492677Sml93401 	rcop_no_usage,
15502677Sml93401 	rcop_no_set,
15512677Sml93401 	zone_semmni_test
15522677Sml93401 };
15532677Sml93401 
15542677Sml93401 /*ARGSUSED*/
15552677Sml93401 static int
zone_msgmni_test(rctl_t * r,proc_t * p,rctl_entity_p_t * e,rctl_val_t * rval,rctl_qty_t incr,uint_t flags)15562677Sml93401 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
15572677Sml93401     rctl_qty_t incr, uint_t flags)
15582677Sml93401 {
15592677Sml93401 	rctl_qty_t v;
15602677Sml93401 	ASSERT(MUTEX_HELD(&p->p_lock));
15612677Sml93401 	ASSERT(e->rcep_t == RCENTITY_ZONE);
15622677Sml93401 	v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
15632677Sml93401 	if (v > rval->rcv_value)
15642677Sml93401 		return (1);
15652677Sml93401 	return (0);
15662677Sml93401 }
15672677Sml93401 
15682677Sml93401 static rctl_ops_t zone_msgmni_ops = {
15692677Sml93401 	rcop_no_action,
15702677Sml93401 	rcop_no_usage,
15712677Sml93401 	rcop_no_set,
15722677Sml93401 	zone_msgmni_test
15732677Sml93401 };
15742677Sml93401 
15752768Ssl108498 /*ARGSUSED*/
15762768Ssl108498 static rctl_qty_t
zone_locked_mem_usage(rctl_t * rctl,struct proc * p)15772768Ssl108498 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
15782768Ssl108498 {
15792768Ssl108498 	rctl_qty_t q;
15802768Ssl108498 	ASSERT(MUTEX_HELD(&p->p_lock));
15813247Sgjelinek 	mutex_enter(&p->p_zone->zone_mem_lock);
15822768Ssl108498 	q = p->p_zone->zone_locked_mem;
15833247Sgjelinek 	mutex_exit(&p->p_zone->zone_mem_lock);
15842768Ssl108498 	return (q);
15852768Ssl108498 }
15862768Ssl108498 
15872768Ssl108498 /*ARGSUSED*/
15882768Ssl108498 static int
zone_locked_mem_test(rctl_t * r,proc_t * p,rctl_entity_p_t * e,rctl_val_t * rcntl,rctl_qty_t incr,uint_t flags)15892768Ssl108498 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
15902768Ssl108498     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
15912768Ssl108498 {
15922768Ssl108498 	rctl_qty_t q;
15933247Sgjelinek 	zone_t *z;
15943247Sgjelinek 
15953247Sgjelinek 	z = e->rcep_p.zone;
15962768Ssl108498 	ASSERT(MUTEX_HELD(&p->p_lock));
15973247Sgjelinek 	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
15983247Sgjelinek 	q = z->zone_locked_mem;
15992768Ssl108498 	if (q + incr > rcntl->rcv_value)
16002768Ssl108498 		return (1);
16012768Ssl108498 	return (0);
16022768Ssl108498 }
16032768Ssl108498 
16042768Ssl108498 /*ARGSUSED*/
16052768Ssl108498 static int
zone_locked_mem_set(rctl_t * rctl,struct proc * p,rctl_entity_p_t * e,rctl_qty_t nv)16062768Ssl108498 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
16072768Ssl108498     rctl_qty_t nv)
16082768Ssl108498 {
16092768Ssl108498 	ASSERT(MUTEX_HELD(&p->p_lock));
16102768Ssl108498 	ASSERT(e->rcep_t == RCENTITY_ZONE);
16112768Ssl108498 	if (e->rcep_p.zone == NULL)
16122768Ssl108498 		return (0);
16132768Ssl108498 	e->rcep_p.zone->zone_locked_mem_ctl = nv;
16142768Ssl108498 	return (0);
16152768Ssl108498 }
16162768Ssl108498 
16172768Ssl108498 static rctl_ops_t zone_locked_mem_ops = {
16182768Ssl108498 	rcop_no_action,
16192768Ssl108498 	zone_locked_mem_usage,
16202768Ssl108498 	zone_locked_mem_set,
16212768Ssl108498 	zone_locked_mem_test
16222768Ssl108498 };
16232677Sml93401 
16243247Sgjelinek /*ARGSUSED*/
16253247Sgjelinek static rctl_qty_t
zone_max_swap_usage(rctl_t * rctl,struct proc * p)16263247Sgjelinek zone_max_swap_usage(rctl_t *rctl, struct proc *p)
16273247Sgjelinek {
16283247Sgjelinek 	rctl_qty_t q;
16293247Sgjelinek 	zone_t *z = p->p_zone;
16303247Sgjelinek 
16313247Sgjelinek 	ASSERT(MUTEX_HELD(&p->p_lock));
16323247Sgjelinek 	mutex_enter(&z->zone_mem_lock);
16333247Sgjelinek 	q = z->zone_max_swap;
16343247Sgjelinek 	mutex_exit(&z->zone_mem_lock);
16353247Sgjelinek 	return (q);
16363247Sgjelinek }
16373247Sgjelinek 
16383247Sgjelinek /*ARGSUSED*/
16393247Sgjelinek static int
zone_max_swap_test(rctl_t * r,proc_t * p,rctl_entity_p_t * e,rctl_val_t * rcntl,rctl_qty_t incr,uint_t flags)16403247Sgjelinek zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
16413247Sgjelinek     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
16423247Sgjelinek {
16433247Sgjelinek 	rctl_qty_t q;
16443247Sgjelinek 	zone_t *z;
16453247Sgjelinek 
16463247Sgjelinek 	z = e->rcep_p.zone;
16473247Sgjelinek 	ASSERT(MUTEX_HELD(&p->p_lock));
16483247Sgjelinek 	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
16493247Sgjelinek 	q = z->zone_max_swap;
16503247Sgjelinek 	if (q + incr > rcntl->rcv_value)
16513247Sgjelinek 		return (1);
16523247Sgjelinek 	return (0);
16533247Sgjelinek }
16543247Sgjelinek 
16553247Sgjelinek /*ARGSUSED*/
16563247Sgjelinek static int
zone_max_swap_set(rctl_t * rctl,struct proc * p,rctl_entity_p_t * e,rctl_qty_t nv)16573247Sgjelinek zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
16583247Sgjelinek     rctl_qty_t nv)
16593247Sgjelinek {
16603247Sgjelinek 	ASSERT(MUTEX_HELD(&p->p_lock));
16613247Sgjelinek 	ASSERT(e->rcep_t == RCENTITY_ZONE);
16623247Sgjelinek 	if (e->rcep_p.zone == NULL)
16633247Sgjelinek 		return (0);
16643247Sgjelinek 	e->rcep_p.zone->zone_max_swap_ctl = nv;
16653247Sgjelinek 	return (0);
16663247Sgjelinek }
16673247Sgjelinek 
16683247Sgjelinek static rctl_ops_t zone_max_swap_ops = {
16693247Sgjelinek 	rcop_no_action,
16703247Sgjelinek 	zone_max_swap_usage,
16713247Sgjelinek 	zone_max_swap_set,
16723247Sgjelinek 	zone_max_swap_test
16733247Sgjelinek };
16743247Sgjelinek 
167512633Sjohn.levon@sun.com /*ARGSUSED*/
167612633Sjohn.levon@sun.com static rctl_qty_t
zone_max_lofi_usage(rctl_t * rctl,struct proc * p)167712633Sjohn.levon@sun.com zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
167812633Sjohn.levon@sun.com {
167912633Sjohn.levon@sun.com 	rctl_qty_t q;
168012633Sjohn.levon@sun.com 	zone_t *z = p->p_zone;
168112633Sjohn.levon@sun.com 
168212633Sjohn.levon@sun.com 	ASSERT(MUTEX_HELD(&p->p_lock));
168312633Sjohn.levon@sun.com 	mutex_enter(&z->zone_rctl_lock);
168412633Sjohn.levon@sun.com 	q = z->zone_max_lofi;
168512633Sjohn.levon@sun.com 	mutex_exit(&z->zone_rctl_lock);
168612633Sjohn.levon@sun.com 	return (q);
168712633Sjohn.levon@sun.com }
168812633Sjohn.levon@sun.com 
168912633Sjohn.levon@sun.com /*ARGSUSED*/
169012633Sjohn.levon@sun.com static int
zone_max_lofi_test(rctl_t * r,proc_t * p,rctl_entity_p_t * e,rctl_val_t * rcntl,rctl_qty_t incr,uint_t flags)169112633Sjohn.levon@sun.com zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
169212633Sjohn.levon@sun.com     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
169312633Sjohn.levon@sun.com {
169412633Sjohn.levon@sun.com 	rctl_qty_t q;
169512633Sjohn.levon@sun.com 	zone_t *z;
169612633Sjohn.levon@sun.com 
169712633Sjohn.levon@sun.com 	z = e->rcep_p.zone;
169812633Sjohn.levon@sun.com 	ASSERT(MUTEX_HELD(&p->p_lock));
169912633Sjohn.levon@sun.com 	ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
170012633Sjohn.levon@sun.com 	q = z->zone_max_lofi;
170112633Sjohn.levon@sun.com 	if (q + incr > rcntl->rcv_value)
170212633Sjohn.levon@sun.com 		return (1);
170312633Sjohn.levon@sun.com 	return (0);
170412633Sjohn.levon@sun.com }
170512633Sjohn.levon@sun.com 
170612633Sjohn.levon@sun.com /*ARGSUSED*/
170712633Sjohn.levon@sun.com static int
zone_max_lofi_set(rctl_t * rctl,struct proc * p,rctl_entity_p_t * e,rctl_qty_t nv)170812633Sjohn.levon@sun.com zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
170912633Sjohn.levon@sun.com     rctl_qty_t nv)
171012633Sjohn.levon@sun.com {
171112633Sjohn.levon@sun.com 	ASSERT(MUTEX_HELD(&p->p_lock));
171212633Sjohn.levon@sun.com 	ASSERT(e->rcep_t == RCENTITY_ZONE);
171312633Sjohn.levon@sun.com 	if (e->rcep_p.zone == NULL)
171412633Sjohn.levon@sun.com 		return (0);
171512633Sjohn.levon@sun.com 	e->rcep_p.zone->zone_max_lofi_ctl = nv;
171612633Sjohn.levon@sun.com 	return (0);
171712633Sjohn.levon@sun.com }
171812633Sjohn.levon@sun.com 
171912633Sjohn.levon@sun.com static rctl_ops_t zone_max_lofi_ops = {
172012633Sjohn.levon@sun.com 	rcop_no_action,
172112633Sjohn.levon@sun.com 	zone_max_lofi_usage,
172212633Sjohn.levon@sun.com 	zone_max_lofi_set,
172312633Sjohn.levon@sun.com 	zone_max_lofi_test
172412633Sjohn.levon@sun.com };
172512633Sjohn.levon@sun.com 
17260Sstevel@tonic-gate /*
17270Sstevel@tonic-gate  * Helper function to brand the zone with a unique ID.
17280Sstevel@tonic-gate  */
17290Sstevel@tonic-gate static void
zone_uniqid(zone_t * zone)17300Sstevel@tonic-gate zone_uniqid(zone_t *zone)
17310Sstevel@tonic-gate {
17320Sstevel@tonic-gate 	static uint64_t uniqid = 0;
17330Sstevel@tonic-gate 
17340Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&zonehash_lock));
17350Sstevel@tonic-gate 	zone->zone_uniqid = uniqid++;
17360Sstevel@tonic-gate }
17370Sstevel@tonic-gate 
17380Sstevel@tonic-gate /*
17390Sstevel@tonic-gate  * Returns a held pointer to the "kcred" for the specified zone.
17400Sstevel@tonic-gate  */
17410Sstevel@tonic-gate struct cred *
zone_get_kcred(zoneid_t zoneid)17420Sstevel@tonic-gate zone_get_kcred(zoneid_t zoneid)
17430Sstevel@tonic-gate {
17440Sstevel@tonic-gate 	zone_t *zone;
17450Sstevel@tonic-gate 	cred_t *cr;
17460Sstevel@tonic-gate 
17470Sstevel@tonic-gate 	if ((zone = zone_find_by_id(zoneid)) == NULL)
17480Sstevel@tonic-gate 		return (NULL);
17490Sstevel@tonic-gate 	cr = zone->zone_kcred;
17500Sstevel@tonic-gate 	crhold(cr);
17510Sstevel@tonic-gate 	zone_rele(zone);
17520Sstevel@tonic-gate 	return (cr);
17530Sstevel@tonic-gate }
17540Sstevel@tonic-gate 
17553247Sgjelinek static int
zone_lockedmem_kstat_update(kstat_t * ksp,int rw)17563247Sgjelinek zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
17573247Sgjelinek {
17583247Sgjelinek 	zone_t *zone = ksp->ks_private;
17593247Sgjelinek 	zone_kstat_t *zk = ksp->ks_data;
17603247Sgjelinek 
17613247Sgjelinek 	if (rw == KSTAT_WRITE)
17623247Sgjelinek 		return (EACCES);
17633247Sgjelinek 
17643247Sgjelinek 	zk->zk_usage.value.ui64 = zone->zone_locked_mem;
17653247Sgjelinek 	zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
17663247Sgjelinek 	return (0);
17673247Sgjelinek }
17683247Sgjelinek 
17693247Sgjelinek static int
zone_nprocs_kstat_update(kstat_t * ksp,int rw)177012725SMenno.Lageman@Sun.COM zone_nprocs_kstat_update(kstat_t *ksp, int rw)
177112725SMenno.Lageman@Sun.COM {
177212725SMenno.Lageman@Sun.COM 	zone_t *zone = ksp->ks_private;
177312725SMenno.Lageman@Sun.COM 	zone_kstat_t *zk = ksp->ks_data;
177412725SMenno.Lageman@Sun.COM 
177512725SMenno.Lageman@Sun.COM 	if (rw == KSTAT_WRITE)
177612725SMenno.Lageman@Sun.COM 		return (EACCES);
177712725SMenno.Lageman@Sun.COM 
177812725SMenno.Lageman@Sun.COM 	zk->zk_usage.value.ui64 = zone->zone_nprocs;
177912725SMenno.Lageman@Sun.COM 	zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
178012725SMenno.Lageman@Sun.COM 	return (0);
178112725SMenno.Lageman@Sun.COM }
178212725SMenno.Lageman@Sun.COM 
178312725SMenno.Lageman@Sun.COM static int
zone_swapresv_kstat_update(kstat_t * ksp,int rw)17843247Sgjelinek zone_swapresv_kstat_update(kstat_t *ksp, int rw)
17853247Sgjelinek {
17863247Sgjelinek 	zone_t *zone = ksp->ks_private;
17873247Sgjelinek 	zone_kstat_t *zk = ksp->ks_data;
17883247Sgjelinek 
17893247Sgjelinek 	if (rw == KSTAT_WRITE)
17903247Sgjelinek 		return (EACCES);
17913247Sgjelinek 
17923247Sgjelinek 	zk->zk_usage.value.ui64 = zone->zone_max_swap;
17933247Sgjelinek 	zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
17943247Sgjelinek 	return (0);
17953247Sgjelinek }
17963247Sgjelinek 
179712725SMenno.Lageman@Sun.COM static kstat_t *
zone_kstat_create_common(zone_t * zone,char * name,int (* updatefunc)(kstat_t *,int))179812725SMenno.Lageman@Sun.COM zone_kstat_create_common(zone_t *zone, char *name,
179912725SMenno.Lageman@Sun.COM     int (*updatefunc) (kstat_t *, int))
18003247Sgjelinek {
18013247Sgjelinek 	kstat_t *ksp;
18023247Sgjelinek 	zone_kstat_t *zk;
18033247Sgjelinek 
180412725SMenno.Lageman@Sun.COM 	ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
18053247Sgjelinek 	    sizeof (zone_kstat_t) / sizeof (kstat_named_t),
18063247Sgjelinek 	    KSTAT_FLAG_VIRTUAL);
18073247Sgjelinek 
18083247Sgjelinek 	if (ksp == NULL)
180912725SMenno.Lageman@Sun.COM 		return (NULL);
18103247Sgjelinek 
18113247Sgjelinek 	zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
18123247Sgjelinek 	ksp->ks_data_size += strlen(zone->zone_name) + 1;
18133247Sgjelinek 	kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
18143247Sgjelinek 	kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
18153247Sgjelinek 	kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
18163247Sgjelinek 	kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
181712725SMenno.Lageman@Sun.COM 	ksp->ks_update = updatefunc;
18183247Sgjelinek 	ksp->ks_private = zone;
18193247Sgjelinek 	kstat_install(ksp);
182012725SMenno.Lageman@Sun.COM 	return (ksp);
182112725SMenno.Lageman@Sun.COM }
182212725SMenno.Lageman@Sun.COM 
182312725SMenno.Lageman@Sun.COM static void
zone_kstat_create(zone_t * zone)182412725SMenno.Lageman@Sun.COM zone_kstat_create(zone_t *zone)
182512725SMenno.Lageman@Sun.COM {
182612725SMenno.Lageman@Sun.COM 	zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
182712725SMenno.Lageman@Sun.COM 	    "lockedmem", zone_lockedmem_kstat_update);
182812725SMenno.Lageman@Sun.COM 	zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
182912725SMenno.Lageman@Sun.COM 	    "swapresv", zone_swapresv_kstat_update);
183012725SMenno.Lageman@Sun.COM 	zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
183112725SMenno.Lageman@Sun.COM 	    "nprocs", zone_nprocs_kstat_update);
183212725SMenno.Lageman@Sun.COM }
183312725SMenno.Lageman@Sun.COM 
183412725SMenno.Lageman@Sun.COM static void
zone_kstat_delete_common(kstat_t ** pkstat)183512725SMenno.Lageman@Sun.COM zone_kstat_delete_common(kstat_t **pkstat)
183612725SMenno.Lageman@Sun.COM {
183712725SMenno.Lageman@Sun.COM 	void *data;
183812725SMenno.Lageman@Sun.COM 
183912725SMenno.Lageman@Sun.COM 	if (*pkstat != NULL) {
184012725SMenno.Lageman@Sun.COM 		data = (*pkstat)->ks_data;
184112725SMenno.Lageman@Sun.COM 		kstat_delete(*pkstat);
184212725SMenno.Lageman@Sun.COM 		kmem_free(data, sizeof (zone_kstat_t));
184312725SMenno.Lageman@Sun.COM 		*pkstat = NULL;
184412725SMenno.Lageman@Sun.COM 	}
18453247Sgjelinek }
18463247Sgjelinek 
18473247Sgjelinek static void
zone_kstat_delete(zone_t * zone)18483247Sgjelinek zone_kstat_delete(zone_t *zone)
18493247Sgjelinek {
185012725SMenno.Lageman@Sun.COM 	zone_kstat_delete_common(&zone->zone_lockedmem_kstat);
185112725SMenno.Lageman@Sun.COM 	zone_kstat_delete_common(&zone->zone_swapresv_kstat);
185212725SMenno.Lageman@Sun.COM 	zone_kstat_delete_common(&zone->zone_nprocs_kstat);
18533247Sgjelinek }
18543247Sgjelinek 
/*
 * Called very early on in boot to initialize the ZSD list so that
 * zone_key_create() can be called before zone_init().  It also initializes
 * portions of zone0 which may be used before zone_init() is called.  The
 * variable "global_zone" will be set when zone0 is fully initialized by
 * zone_init().
 *
 * Besides the ZSD key list, this sets up the zonehash and ZSD key locks, the
 * active and deathrow zone lists, zone0's own locks and lists, and links
 * zone0 and p0 to each other.
 */
void
zone_zsd_init(void)
{
	/* Framework-wide locks and lists. */
	mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_create(&zone_active, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));
	list_create(&zone_deathrow, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));

	/* zone0's locks, usage counters and resource-control limits. */
	mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
	zone0.zone_shares = 1;
	zone0.zone_nlwps = 0;
	zone0.zone_nlwps_ctl = INT_MAX;
	zone0.zone_nprocs = 0;
	zone0.zone_nprocs_ctl = INT_MAX;
	zone0.zone_locked_mem = 0;
	zone0.zone_locked_mem_ctl = UINT64_MAX;
	/* zone_max_swap is checked rather than assigned; it must already be 0 */
	ASSERT(zone0.zone_max_swap == 0);
	zone0.zone_max_swap_ctl = UINT64_MAX;
	zone0.zone_max_lofi = 0;
	zone0.zone_max_lofi_ctl = UINT64_MAX;
	zone0.zone_shmmax = 0;
	zone0.zone_ipc.ipcq_shmmni = 0;
	zone0.zone_ipc.ipcq_semmni = 0;
	zone0.zone_ipc.ipcq_msgmni = 0;

	/*
	 * Identity of the global zone.  Unlike the strings that zone_free()
	 * releases, these point at constants and global buffers; zone_free()
	 * asserts that it is never handed the global zone.
	 */
	zone0.zone_name = GLOBAL_ZONENAME;
	zone0.zone_nodename = utsname.nodename;
	zone0.zone_domain = srpc_domain;
	zone0.zone_hostid = HW_INVALID_HOSTID;
	zone0.zone_fs_allowed = NULL;
	/* the global zone starts out with a single reference */
	zone0.zone_ref = 1;
	zone0.zone_id = GLOBAL_ZONEID;
	zone0.zone_status = ZONE_IS_RUNNING;
	zone0.zone_rootpath = "/";
	/* "/" plus its terminating NUL */
	zone0.zone_rootpathlen = 2;
	zone0.zone_psetid = ZONE_PS_INVAL;
	zone0.zone_ncpus = 0;
	zone0.zone_ncpus_online = 0;
	zone0.zone_proc_initpid = 1;
	zone0.zone_initname = initname;
	/* the kstats themselves are created later, by zone_init() */
	zone0.zone_lockedmem_kstat = NULL;
	zone0.zone_swapresv_kstat = NULL;
	zone0.zone_nprocs_kstat = NULL;
	list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
	    offsetof(zone_ref_t, zref_linkage));
	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_insert_head(&zone_active, &zone0);

	/*
	 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
	 * to anything meaningful.  It is assigned to be 'rootdir' in
	 * vfs_mountroot().
	 */
	zone0.zone_rootvp = NULL;
	zone0.zone_vfslist = NULL;
	zone0.zone_bootargs = initargs;
	zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
	/*
	 * The global zone has all privileges
	 */
	priv_fillset(zone0.zone_privset);
	/*
	 * Add p0 to the global zone
	 */
	zone0.zone_zsched = &p0;
	p0.p_zone = &zone0;
}
19350Sstevel@tonic-gate 
19360Sstevel@tonic-gate /*
19371676Sjpk  * Compute a hash value based on the contents of the label and the DOI.  The
19381676Sjpk  * hash algorithm is somewhat arbitrary, but is based on the observation that
19391676Sjpk  * humans will likely pick labels that differ by amounts that work out to be
19401676Sjpk  * multiples of the number of hash chains, and thus stirring in some primes
19411676Sjpk  * should help.
19421676Sjpk  */
19431676Sjpk static uint_t
hash_bylabel(void * hdata,mod_hash_key_t key)19441676Sjpk hash_bylabel(void *hdata, mod_hash_key_t key)
19451676Sjpk {
19461676Sjpk 	const ts_label_t *lab = (ts_label_t *)key;
19471676Sjpk 	const uint32_t *up, *ue;
19481676Sjpk 	uint_t hash;
19491676Sjpk 	int i;
19501676Sjpk 
19511676Sjpk 	_NOTE(ARGUNUSED(hdata));
19521676Sjpk 
19531676Sjpk 	hash = lab->tsl_doi + (lab->tsl_doi << 1);
19541676Sjpk 	/* we depend on alignment of label, but not representation */
19551676Sjpk 	up = (const uint32_t *)&lab->tsl_label;
19561676Sjpk 	ue = up + sizeof (lab->tsl_label) / sizeof (*up);
19571676Sjpk 	i = 1;
19581676Sjpk 	while (up < ue) {
19591676Sjpk 		/* using 2^n + 1, 1 <= n <= 16 as source of many primes */
19601676Sjpk 		hash += *up + (*up << ((i % 16) + 1));
19611676Sjpk 		up++;
19621676Sjpk 		i++;
19631676Sjpk 	}
19641676Sjpk 	return (hash);
19651676Sjpk }
19661676Sjpk 
19671676Sjpk /*
19681676Sjpk  * All that mod_hash cares about here is zero (equal) versus non-zero (not
19691676Sjpk  * equal).  This may need to be changed if less than / greater than is ever
19701676Sjpk  * needed.
19711676Sjpk  */
19721676Sjpk static int
hash_labelkey_cmp(mod_hash_key_t key1,mod_hash_key_t key2)19731676Sjpk hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
19741676Sjpk {
19751676Sjpk 	ts_label_t *lab1 = (ts_label_t *)key1;
19761676Sjpk 	ts_label_t *lab2 = (ts_label_t *)key2;
19771676Sjpk 
19781676Sjpk 	return (label_equal(lab1, lab2) ? 0 : 1);
19791676Sjpk }
19801676Sjpk 
/*
 * Called by main() to initialize the zones framework.
 *
 * This creates the zone ID space, registers the zone.* resource controls,
 * finishes initializing zone0 (rctl set, counts, brand, kstats, label and
 * locks), creates the zone hash tables and enters zone0 into them, publishes
 * zone0 as global_zone and binds the zone status event channel.  It must run
 * in the context of p0 and panics if the event channel cannot be bound.
 */
void
zone_init(void)
{
	rctl_dict_entry_t *rde;
	rctl_val_t *dval;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	rctl_entity_p_t e;
	int res;

	ASSERT(curproc == &p0);

	/*
	 * Create ID space for zone IDs.  ID 0 is reserved for the
	 * global zone.
	 */
	zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);

	/*
	 * Initialize generic zone resource controls, if any.
	 */
	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
	    FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);

	rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
	    RCTL_GLOBAL_INFINITE,
	    MAXCAP, MAXCAP, &zone_cpu_cap_ops);

	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
	    INT_MAX, INT_MAX, &zone_lwps_ops);

	rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
	    INT_MAX, INT_MAX, &zone_procs_ops);

	/*
	 * System V IPC resource controls
	 */
	rc_zone_msgmni = rctl_register("zone.max-msg-ids",
	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);

	rc_zone_semmni = rctl_register("zone.max-sem-ids",
	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);

	rc_zone_shmmni = rctl_register("zone.max-shm-ids",
	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);

	rc_zone_shmmax = rctl_register("zone.max-shm-memory",
	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);

	/*
	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
	 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
	 */
	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
	bzero(dval, sizeof (rctl_val_t));
	dval->rcv_value = 1;
	dval->rcv_privilege = RCPRIV_PRIVILEGED;
	dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
	dval->rcv_action_recip_pid = -1;

	rde = rctl_dict_lookup("zone.cpu-shares");
	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);

	/*
	 * Memory, swap and lofi resource controls.
	 */
	rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
	    &zone_locked_mem_ops);

	rc_zone_max_swap = rctl_register("zone.max-swap",
	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
	    &zone_max_swap_ops);

	rc_zone_max_lofi = rctl_register("zone.max-lofi",
	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
	    &zone_max_lofi_ops);

	/*
	 * Initialize the ``global zone''.
	 */
	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
	/* p0.p_lock is held across rctl_set_init() and the count updates */
	mutex_enter(&p0.p_lock);
	e.rcep_p.zone = &zone0;
	e.rcep_t = RCENTITY_ZONE;
	zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
	    gp);

	/* zone0 so far consists of p0 alone: its lwps, one process, one task */
	zone0.zone_nlwps = p0.p_lwpcnt;
	zone0.zone_nprocs = 1;
	zone0.zone_ntasks = 1;
	mutex_exit(&p0.p_lock);
	zone0.zone_restart_init = B_TRUE;
	zone0.zone_brand = &native_brand;
	rctl_prealloc_destroy(gp);
	/*
	 * pool_default hasn't been initialized yet, so we let pool_init()
	 * take care of making sure the global zone is in the default pool.
	 */

	/*
	 * Initialize global zone kstats
	 */
	zone_kstat_create(&zone0);

	/*
	 * Initialize zone label.
	 * mlp are initialized when tnzonecfg is loaded.
	 */
	zone0.zone_slabel = l_admin_low;
	rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
	/* take a hold on the label now referenced by zone0.zone_slabel */
	label_hold(l_admin_low);

	/*
	 * Initialise the lock for the database structure used by mntfs.
	 */
	rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);

	/*
	 * Create the zone hash tables and enter zone0 into each of them,
	 * all under zonehash_lock.
	 */
	mutex_enter(&zonehash_lock);
	zone_uniqid(&zone0);
	ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);

	zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
	    mod_hash_null_valdtor);
	zonehashbyname = mod_hash_create_strhash("zone_by_name",
	    zone_hash_size, mod_hash_null_valdtor);
	/*
	 * maintain zonehashbylabel only for labeled systems
	 */
	if (is_system_labeled())
		zonehashbylabel = mod_hash_create_extended("zone_by_label",
		    zone_hash_size, mod_hash_null_keydtor,
		    mod_hash_null_valdtor, hash_bylabel, NULL,
		    hash_labelkey_cmp, KM_SLEEP);
	zonecount = 1;

	(void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
	    (mod_hash_val_t)&zone0);
	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
	    (mod_hash_val_t)&zone0);
	if (is_system_labeled()) {
		/* record that zone0 is present in the label hash */
		zone0.zone_flags |= ZF_HASHED_LABEL;
		(void) mod_hash_insert(zonehashbylabel,
		    (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
	}
	mutex_exit(&zonehash_lock);

	/*
	 * We avoid setting zone_kcred until now, since kcred is initialized
	 * sometime after zone_zsd_init() and before zone_init().
	 */
	zone0.zone_kcred = kcred;
	/*
	 * The global zone is fully initialized (except for zone_rootvp which
	 * will be set when the root filesystem is mounted).
	 */
	global_zone = &zone0;

	/*
	 * Setup an event channel to send zone status change notifications on
	 */
	res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
	    EVCH_CREAT);

	/* the zones framework cannot run without its event channel */
	if (res)
		panic("Sysevent_evc_bind failed during zone setup.\n");

}
21630Sstevel@tonic-gate 
/*
 * Release every resource still attached to 'zone' and then free the zone_t
 * itself.  The zone must not be the global zone, must have no tasks, lwps,
 * processes, credential references or tracked references left, must have had
 * its zone_kcred cleared, and must be either ZONE_IS_DEAD or
 * ZONE_IS_UNINITIALIZED.  A dead zone is also taken off the deathrow list.
 */
static void
zone_free(zone_t *zone)
{
	ASSERT(zone != global_zone);
	ASSERT(zone->zone_ntasks == 0);
	ASSERT(zone->zone_nlwps == 0);
	ASSERT(zone->zone_nprocs == 0);
	ASSERT(zone->zone_cred_ref == 0);
	ASSERT(zone->zone_kcred == NULL);
	ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
	    zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
	ASSERT(list_is_empty(&zone->zone_ref_list));

	/*
	 * Remove any zone caps.
	 */
	cpucaps_zone_remove(zone);

	ASSERT(zone->zone_cpucap == NULL);

	/* remove from deathrow list */
	if (zone_status_get(zone) == ZONE_IS_DEAD) {
		ASSERT(zone->zone_ref == 0);
		mutex_enter(&zone_deathrow_lock);
		list_remove(&zone_deathrow, zone);
		mutex_exit(&zone_deathrow_lock);
	}

	list_destroy(&zone->zone_ref_list);
	zone_free_zsd(zone);
	zone_free_datasets(zone);
	list_destroy(&zone->zone_dl_list);

	/*
	 * Each remaining member is released only if it was set; a zone that
	 * is still ZONE_IS_UNINITIALIZED may not have all of them.
	 */
	if (zone->zone_rootvp != NULL)
		VN_RELE(zone->zone_rootvp);
	if (zone->zone_rootpath)
		kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
	if (zone->zone_name != NULL)
		kmem_free(zone->zone_name, ZONENAME_MAX);
	if (zone->zone_slabel != NULL)
		label_rele(zone->zone_slabel);
	if (zone->zone_nodename != NULL)
		kmem_free(zone->zone_nodename, _SYS_NMLN);
	if (zone->zone_domain != NULL)
		kmem_free(zone->zone_domain, _SYS_NMLN);
	if (zone->zone_privset != NULL)
		kmem_free(zone->zone_privset, sizeof (priv_set_t));
	if (zone->zone_rctls != NULL)
		rctl_set_free(zone->zone_rctls);
	if (zone->zone_bootargs != NULL)
		strfree(zone->zone_bootargs);
	if (zone->zone_initname != NULL)
		strfree(zone->zone_initname);
	if (zone->zone_fs_allowed != NULL)
		strfree(zone->zone_fs_allowed);
	if (zone->zone_pfexecd != NULL)
		klpd_freelist(&zone->zone_pfexecd);
	/* hand the zone ID back to the ID space before the zone_t goes away */
	id_free(zoneid_space, zone->zone_id);
	mutex_destroy(&zone->zone_lock);
	cv_destroy(&zone->zone_cv);
	rw_destroy(&zone->zone_mlps.mlpl_rwlock);
	rw_destroy(&zone->zone_mntfs_db_lock);
	kmem_free(zone, sizeof (zone_t));
}
22280Sstevel@tonic-gate 
22290Sstevel@tonic-gate /*
22300Sstevel@tonic-gate  * See block comment at the top of this file for information about zone
22310Sstevel@tonic-gate  * status values.
22320Sstevel@tonic-gate  */
22330Sstevel@tonic-gate /*
22340Sstevel@tonic-gate  * Convenience function for setting zone status.
22350Sstevel@tonic-gate  */
22360Sstevel@tonic-gate static void
zone_status_set(zone_t * zone,zone_status_t status)22370Sstevel@tonic-gate zone_status_set(zone_t *zone, zone_status_t status)
22380Sstevel@tonic-gate {
22391166Sdstaff 
22401166Sdstaff 	nvlist_t *nvl = NULL;
22410Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&zone_status_lock));
22420Sstevel@tonic-gate 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
22430Sstevel@tonic-gate 	    status >= zone_status_get(zone));
22441166Sdstaff 
22451166Sdstaff 	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
22461166Sdstaff 	    nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
22471166Sdstaff 	    nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
22482267Sdp 	    zone_status_table[status]) ||
22491166Sdstaff 	    nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
22502267Sdp 	    zone_status_table[zone->zone_status]) ||
22511166Sdstaff 	    nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
22521166Sdstaff 	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
22531166Sdstaff 	    sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
22542267Sdp 	    ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
22551166Sdstaff #ifdef DEBUG
22561166Sdstaff 		(void) printf(
22571166Sdstaff 		    "Failed to allocate and send zone state change event.\n");
22581166Sdstaff #endif
22591166Sdstaff 	}
22601166Sdstaff 	nvlist_free(nvl);
22611166Sdstaff 
22620Sstevel@tonic-gate 	zone->zone_status = status;
22631166Sdstaff 
22640Sstevel@tonic-gate 	cv_broadcast(&zone->zone_cv);
22650Sstevel@tonic-gate }
22660Sstevel@tonic-gate 
22670Sstevel@tonic-gate /*
22680Sstevel@tonic-gate  * Public function to retrieve the zone status.  The zone status may
22690Sstevel@tonic-gate  * change after it is retrieved.
22700Sstevel@tonic-gate  */
22710Sstevel@tonic-gate zone_status_t
zone_status_get(zone_t * zone)22720Sstevel@tonic-gate zone_status_get(zone_t *zone)
22730Sstevel@tonic-gate {
22740Sstevel@tonic-gate 	return (zone->zone_status);
22750Sstevel@tonic-gate }
22760Sstevel@tonic-gate 
22770Sstevel@tonic-gate static int
zone_set_bootargs(zone_t * zone,const char * zone_bootargs)22780Sstevel@tonic-gate zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
22790Sstevel@tonic-gate {
228012633Sjohn.levon@sun.com 	char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
22812267Sdp 	int err = 0;
22822267Sdp 
22832267Sdp 	ASSERT(zone != global_zone);
228412633Sjohn.levon@sun.com 	if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
22852267Sdp 		goto done;	/* EFAULT or ENAMETOOLONG */
22862267Sdp 
22872267Sdp 	if (zone->zone_bootargs != NULL)
228812633Sjohn.levon@sun.com 		strfree(zone->zone_bootargs);
228912633Sjohn.levon@sun.com 
229012633Sjohn.levon@sun.com 	zone->zone_bootargs = strdup(buf);
22912267Sdp 
22922267Sdp done:
229312633Sjohn.levon@sun.com 	kmem_free(buf, BOOTARGS_MAX);
22942267Sdp 	return (err);
22952267Sdp }
22962267Sdp 
22972267Sdp static int
zone_set_brand(zone_t * zone,const char * brand)22984141Sedp zone_set_brand(zone_t *zone, const char *brand)
22994141Sedp {
23004141Sedp 	struct brand_attr *attrp;
23014141Sedp 	brand_t *bp;
23024141Sedp 
23034141Sedp 	attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
23044141Sedp 	if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
23054141Sedp 		kmem_free(attrp, sizeof (struct brand_attr));
23064141Sedp 		return (EFAULT);
23074141Sedp 	}
23084141Sedp 
23094141Sedp 	bp = brand_register_zone(attrp);
23104141Sedp 	kmem_free(attrp, sizeof (struct brand_attr));
23114141Sedp 	if (bp == NULL)
23124141Sedp 		return (EINVAL);
23134141Sedp 
23144141Sedp 	/*
23154141Sedp 	 * This is the only place where a zone can change it's brand.
23164141Sedp 	 * We already need to hold zone_status_lock to check the zone
23174141Sedp 	 * status, so we'll just use that lock to serialize zone
23184141Sedp 	 * branding requests as well.
23194141Sedp 	 */
23204141Sedp 	mutex_enter(&zone_status_lock);
23214141Sedp 
23224141Sedp 	/* Re-Branding is not allowed and the zone can't be booted yet */
23234141Sedp 	if ((ZONE_IS_BRANDED(zone)) ||
23244141Sedp 	    (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
23254141Sedp 		mutex_exit(&zone_status_lock);
23264141Sedp 		brand_unregister_zone(bp);
23274141Sedp 		return (EINVAL);
23284141Sedp 	}
23294141Sedp 
23304888Seh208807 	/* set up the brand specific data */
23314141Sedp 	zone->zone_brand = bp;
23324888Seh208807 	ZBROP(zone)->b_init_brand_data(zone);
23334888Seh208807 
23344141Sedp 	mutex_exit(&zone_status_lock);
23354141Sedp 	return (0);
23364141Sedp }
23374141Sedp 
23384141Sedp static int
zone_set_fs_allowed(zone_t * zone,const char * zone_fs_allowed)233912633Sjohn.levon@sun.com zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
234012633Sjohn.levon@sun.com {
234112633Sjohn.levon@sun.com 	char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
234212633Sjohn.levon@sun.com 	int err = 0;
234312633Sjohn.levon@sun.com 
234412633Sjohn.levon@sun.com 	ASSERT(zone != global_zone);
234512633Sjohn.levon@sun.com 	if ((err = copyinstr(zone_fs_allowed, buf,
234612633Sjohn.levon@sun.com 	    ZONE_FS_ALLOWED_MAX, NULL)) != 0)
234712633Sjohn.levon@sun.com 		goto done;
234812633Sjohn.levon@sun.com 
234912633Sjohn.levon@sun.com 	if (zone->zone_fs_allowed != NULL)
235012633Sjohn.levon@sun.com 		strfree(zone->zone_fs_allowed);
235112633Sjohn.levon@sun.com 
235212633Sjohn.levon@sun.com 	zone->zone_fs_allowed = strdup(buf);
235312633Sjohn.levon@sun.com 
235412633Sjohn.levon@sun.com done:
235512633Sjohn.levon@sun.com 	kmem_free(buf, ZONE_FS_ALLOWED_MAX);
235612633Sjohn.levon@sun.com 	return (err);
235712633Sjohn.levon@sun.com }
235812633Sjohn.levon@sun.com 
235912633Sjohn.levon@sun.com static int
zone_set_initname(zone_t * zone,const char * zone_initname)23602267Sdp zone_set_initname(zone_t *zone, const char *zone_initname)
23612267Sdp {
23622267Sdp 	char initname[INITNAME_SZ];
23630Sstevel@tonic-gate 	size_t len;
23642267Sdp 	int err = 0;
23652267Sdp 
23662267Sdp 	ASSERT(zone != global_zone);
23672267Sdp 	if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
23680Sstevel@tonic-gate 		return (err);	/* EFAULT or ENAMETOOLONG */
23692267Sdp 
23702267Sdp 	if (zone->zone_initname != NULL)
237112633Sjohn.levon@sun.com 		strfree(zone->zone_initname);
23722267Sdp 
23732267Sdp 	zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
23742267Sdp 	(void) strcpy(zone->zone_initname, initname);
23750Sstevel@tonic-gate 	return (0);
23760Sstevel@tonic-gate }
23770Sstevel@tonic-gate 
23783247Sgjelinek static int
zone_set_phys_mcap(zone_t * zone,const uint64_t * zone_mcap)23793247Sgjelinek zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
23803247Sgjelinek {
23813247Sgjelinek 	uint64_t mcap;
23823247Sgjelinek 	int err = 0;
23833247Sgjelinek 
23843247Sgjelinek 	if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
23853247Sgjelinek 		zone->zone_phys_mcap = mcap;
23863247Sgjelinek 
23873247Sgjelinek 	return (err);
23883247Sgjelinek }
23893247Sgjelinek 
23903247Sgjelinek static int
zone_set_sched_class(zone_t * zone,const char * new_class)23913247Sgjelinek zone_set_sched_class(zone_t *zone, const char *new_class)
23923247Sgjelinek {
23933247Sgjelinek 	char sched_class[PC_CLNMSZ];
23943247Sgjelinek 	id_t classid;
23953247Sgjelinek 	int err;
23963247Sgjelinek 
23973247Sgjelinek 	ASSERT(zone != global_zone);
23983247Sgjelinek 	if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
23993247Sgjelinek 		return (err);	/* EFAULT or ENAMETOOLONG */
24003247Sgjelinek 
240111173SJonathan.Adams@Sun.COM 	if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
24023247Sgjelinek 		return (set_errno(EINVAL));
24033247Sgjelinek 	zone->zone_defaultcid = classid;
24043247Sgjelinek 	ASSERT(zone->zone_defaultcid > 0 &&
24053247Sgjelinek 	    zone->zone_defaultcid < loaded_classes);
24063247Sgjelinek 
24073247Sgjelinek 	return (0);
24083247Sgjelinek }
24093247Sgjelinek 
24100Sstevel@tonic-gate /*
24110Sstevel@tonic-gate  * Block indefinitely waiting for (zone_status >= status)
24120Sstevel@tonic-gate  */
24130Sstevel@tonic-gate void
zone_status_wait(zone_t * zone,zone_status_t status)24140Sstevel@tonic-gate zone_status_wait(zone_t *zone, zone_status_t status)
24150Sstevel@tonic-gate {
24160Sstevel@tonic-gate 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
24170Sstevel@tonic-gate 
24180Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
24190Sstevel@tonic-gate 	while (zone->zone_status < status) {
24200Sstevel@tonic-gate 		cv_wait(&zone->zone_cv, &zone_status_lock);
24210Sstevel@tonic-gate 	}
24220Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
24230Sstevel@tonic-gate }
24240Sstevel@tonic-gate 
24250Sstevel@tonic-gate /*
24260Sstevel@tonic-gate  * Private CPR-safe version of zone_status_wait().
24270Sstevel@tonic-gate  */
24280Sstevel@tonic-gate static void
zone_status_wait_cpr(zone_t * zone,zone_status_t status,char * str)24290Sstevel@tonic-gate zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
24300Sstevel@tonic-gate {
24310Sstevel@tonic-gate 	callb_cpr_t cprinfo;
24320Sstevel@tonic-gate 
24330Sstevel@tonic-gate 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
24340Sstevel@tonic-gate 
24350Sstevel@tonic-gate 	CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
24360Sstevel@tonic-gate 	    str);
24370Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
24380Sstevel@tonic-gate 	while (zone->zone_status < status) {
24390Sstevel@tonic-gate 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
24400Sstevel@tonic-gate 		cv_wait(&zone->zone_cv, &zone_status_lock);
24410Sstevel@tonic-gate 		CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
24420Sstevel@tonic-gate 	}
24430Sstevel@tonic-gate 	/*
24440Sstevel@tonic-gate 	 * zone_status_lock is implicitly released by the following.
24450Sstevel@tonic-gate 	 */
24460Sstevel@tonic-gate 	CALLB_CPR_EXIT(&cprinfo);
24470Sstevel@tonic-gate }
24480Sstevel@tonic-gate 
24490Sstevel@tonic-gate /*
24500Sstevel@tonic-gate  * Block until zone enters requested state or signal is received.  Return (0)
24510Sstevel@tonic-gate  * if signaled, non-zero otherwise.
24520Sstevel@tonic-gate  */
24530Sstevel@tonic-gate int
zone_status_wait_sig(zone_t * zone,zone_status_t status)24540Sstevel@tonic-gate zone_status_wait_sig(zone_t *zone, zone_status_t status)
24550Sstevel@tonic-gate {
24560Sstevel@tonic-gate 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
24570Sstevel@tonic-gate 
24580Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
24590Sstevel@tonic-gate 	while (zone->zone_status < status) {
24600Sstevel@tonic-gate 		if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
24610Sstevel@tonic-gate 			mutex_exit(&zone_status_lock);
24620Sstevel@tonic-gate 			return (0);
24630Sstevel@tonic-gate 		}
24640Sstevel@tonic-gate 	}
24650Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
24660Sstevel@tonic-gate 	return (1);
24670Sstevel@tonic-gate }
24680Sstevel@tonic-gate 
24690Sstevel@tonic-gate /*
24700Sstevel@tonic-gate  * Block until the zone enters the requested state or the timeout expires,
24710Sstevel@tonic-gate  * whichever happens first.  Return (-1) if operation timed out, time remaining
24720Sstevel@tonic-gate  * otherwise.
24730Sstevel@tonic-gate  */
24740Sstevel@tonic-gate clock_t
zone_status_timedwait(zone_t * zone,clock_t tim,zone_status_t status)24750Sstevel@tonic-gate zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
24760Sstevel@tonic-gate {
24770Sstevel@tonic-gate 	clock_t timeleft = 0;
24780Sstevel@tonic-gate 
24790Sstevel@tonic-gate 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
24800Sstevel@tonic-gate 
24810Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
24820Sstevel@tonic-gate 	while (zone->zone_status < status && timeleft != -1) {
24830Sstevel@tonic-gate 		timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
24840Sstevel@tonic-gate 	}
24850Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
24860Sstevel@tonic-gate 	return (timeleft);
24870Sstevel@tonic-gate }
24880Sstevel@tonic-gate 
24890Sstevel@tonic-gate /*
24900Sstevel@tonic-gate  * Block until the zone enters the requested state, the current process is
24910Sstevel@tonic-gate  * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
24920Sstevel@tonic-gate  * operation timed out, 0 if signaled, time remaining otherwise.
24930Sstevel@tonic-gate  */
24940Sstevel@tonic-gate clock_t
zone_status_timedwait_sig(zone_t * zone,clock_t tim,zone_status_t status)24950Sstevel@tonic-gate zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
24960Sstevel@tonic-gate {
249711066Srafael.vanoni@sun.com 	clock_t timeleft = tim - ddi_get_lbolt();
24980Sstevel@tonic-gate 
24990Sstevel@tonic-gate 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
25000Sstevel@tonic-gate 
25010Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
25020Sstevel@tonic-gate 	while (zone->zone_status < status) {
25030Sstevel@tonic-gate 		timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
25040Sstevel@tonic-gate 		    tim);
25050Sstevel@tonic-gate 		if (timeleft <= 0)
25060Sstevel@tonic-gate 			break;
25070Sstevel@tonic-gate 	}
25080Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
25090Sstevel@tonic-gate 	return (timeleft);
25100Sstevel@tonic-gate }
25110Sstevel@tonic-gate 
/*
 * Zones have two reference counts: one for references from credential
 * structures (zone_cred_ref), and one (zone_ref) for everything else.
 * This is so we can allow a zone to be rebooted while there are still
 * outstanding cred references, since certain drivers cache dblks (which
 * implicitly results in cached creds).  We wait for zone_ref to drop to
 * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
 * later freed when the zone_cred_ref drops to 0, though nothing other
 * than the zone id and privilege set should be accessed once the zone
 * is "dead".
 *
 * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
 * to force halt/reboot to block waiting for the zone_cred_ref to drop
 * to 0.  This can be useful to flush out other sources of cached creds
 * that may be less innocuous than the driver case.
 *
 * Zones also provide a tracked reference counting mechanism in which zone
 * references are represented by "crumbs" (zone_ref structures).  Crumbs help
 * debuggers determine the sources of leaked zone references.  See
 * zone_hold_ref() and zone_rele_ref() below for more information.
 */

/* Debugging flag described above; off (0) by default. */
int zone_wait_for_cred = 0;
25350Sstevel@tonic-gate 
25360Sstevel@tonic-gate static void
zone_hold_locked(zone_t * z)25370Sstevel@tonic-gate zone_hold_locked(zone_t *z)
25380Sstevel@tonic-gate {
25390Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&z->zone_lock));
25400Sstevel@tonic-gate 	z->zone_ref++;
25410Sstevel@tonic-gate 	ASSERT(z->zone_ref != 0);
25420Sstevel@tonic-gate }
25430Sstevel@tonic-gate 
2544*13096SJordan.Vaughan@Sun.com /*
2545*13096SJordan.Vaughan@Sun.com  * Increment the specified zone's reference count.  The zone's zone_t structure
2546*13096SJordan.Vaughan@Sun.com  * will not be freed as long as the zone's reference count is nonzero.
2547*13096SJordan.Vaughan@Sun.com  * Decrement the zone's reference count via zone_rele().
2548*13096SJordan.Vaughan@Sun.com  *
2549*13096SJordan.Vaughan@Sun.com  * NOTE: This function should only be used to hold zones for short periods of
2550*13096SJordan.Vaughan@Sun.com  * time.  Use zone_hold_ref() if the zone must be held for a long time.
2551*13096SJordan.Vaughan@Sun.com  */
25520Sstevel@tonic-gate void
zone_hold(zone_t * z)25530Sstevel@tonic-gate zone_hold(zone_t *z)
25540Sstevel@tonic-gate {
25550Sstevel@tonic-gate 	mutex_enter(&z->zone_lock);
25560Sstevel@tonic-gate 	zone_hold_locked(z);
25570Sstevel@tonic-gate 	mutex_exit(&z->zone_lock);
25580Sstevel@tonic-gate }
25590Sstevel@tonic-gate 
/*
 * If the non-cred ref count drops to 1 and either the cred ref count
 * is 0 or we aren't waiting for cred references, the zone is ready to
 * be destroyed.
 *
 * zone is a pointer expression and is evaluated more than once, so it must
 * not have side effects.
 */
#define	ZONE_IS_UNREF(zone)	((zone)->zone_ref == 1 && \
	    (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
25670Sstevel@tonic-gate 
2568*13096SJordan.Vaughan@Sun.com /*
2569*13096SJordan.Vaughan@Sun.com  * Common zone reference release function invoked by zone_rele() and
2570*13096SJordan.Vaughan@Sun.com  * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2571*13096SJordan.Vaughan@Sun.com  * zone's subsystem-specific reference counters are not affected by the
2572*13096SJordan.Vaughan@Sun.com  * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2573*13096SJordan.Vaughan@Sun.com  * removed from the specified zone's reference list.  ref must be non-NULL iff
2574*13096SJordan.Vaughan@Sun.com  * subsys is not ZONE_REF_NUM_SUBSYS.
2575*13096SJordan.Vaughan@Sun.com  */
2576*13096SJordan.Vaughan@Sun.com static void
zone_rele_common(zone_t * z,zone_ref_t * ref,zone_ref_subsys_t subsys)2577*13096SJordan.Vaughan@Sun.com zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
25780Sstevel@tonic-gate {
25790Sstevel@tonic-gate 	boolean_t wakeup;
25800Sstevel@tonic-gate 
25810Sstevel@tonic-gate 	mutex_enter(&z->zone_lock);
25820Sstevel@tonic-gate 	ASSERT(z->zone_ref != 0);
25830Sstevel@tonic-gate 	z->zone_ref--;
2584*13096SJordan.Vaughan@Sun.com 	if (subsys != ZONE_REF_NUM_SUBSYS) {
2585*13096SJordan.Vaughan@Sun.com 		ASSERT(z->zone_subsys_ref[subsys] != 0);
2586*13096SJordan.Vaughan@Sun.com 		z->zone_subsys_ref[subsys]--;
2587*13096SJordan.Vaughan@Sun.com 		list_remove(&z->zone_ref_list, ref);
2588*13096SJordan.Vaughan@Sun.com 	}
25890Sstevel@tonic-gate 	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
25900Sstevel@tonic-gate 		/* no more refs, free the structure */
25910Sstevel@tonic-gate 		mutex_exit(&z->zone_lock);
25920Sstevel@tonic-gate 		zone_free(z);
25930Sstevel@tonic-gate 		return;
25940Sstevel@tonic-gate 	}
25950Sstevel@tonic-gate 	/* signal zone_destroy so the zone can finish halting */
25960Sstevel@tonic-gate 	wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
25970Sstevel@tonic-gate 	mutex_exit(&z->zone_lock);
25980Sstevel@tonic-gate 
25990Sstevel@tonic-gate 	if (wakeup) {
26000Sstevel@tonic-gate 		/*
26010Sstevel@tonic-gate 		 * Grabbing zonehash_lock here effectively synchronizes with
26020Sstevel@tonic-gate 		 * zone_destroy() to avoid missed signals.
26030Sstevel@tonic-gate 		 */
26040Sstevel@tonic-gate 		mutex_enter(&zonehash_lock);
26050Sstevel@tonic-gate 		cv_broadcast(&zone_destroy_cv);
26060Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
26070Sstevel@tonic-gate 	}
26080Sstevel@tonic-gate }
26090Sstevel@tonic-gate 
2610*13096SJordan.Vaughan@Sun.com /*
2611*13096SJordan.Vaughan@Sun.com  * Decrement the specified zone's reference count.  The specified zone will
2612*13096SJordan.Vaughan@Sun.com  * cease to exist after this function returns if the reference count drops to
2613*13096SJordan.Vaughan@Sun.com  * zero.  This function should be paired with zone_hold().
2614*13096SJordan.Vaughan@Sun.com  */
2615*13096SJordan.Vaughan@Sun.com void
zone_rele(zone_t * z)2616*13096SJordan.Vaughan@Sun.com zone_rele(zone_t *z)
2617*13096SJordan.Vaughan@Sun.com {
2618*13096SJordan.Vaughan@Sun.com 	zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
2619*13096SJordan.Vaughan@Sun.com }
2620*13096SJordan.Vaughan@Sun.com 
2621*13096SJordan.Vaughan@Sun.com /*
2622*13096SJordan.Vaughan@Sun.com  * Initialize a zone reference structure.  This function must be invoked for
2623*13096SJordan.Vaughan@Sun.com  * a reference structure before the structure is passed to zone_hold_ref().
2624*13096SJordan.Vaughan@Sun.com  */
2625*13096SJordan.Vaughan@Sun.com void
zone_init_ref(zone_ref_t * ref)2626*13096SJordan.Vaughan@Sun.com zone_init_ref(zone_ref_t *ref)
2627*13096SJordan.Vaughan@Sun.com {
2628*13096SJordan.Vaughan@Sun.com 	ref->zref_zone = NULL;
2629*13096SJordan.Vaughan@Sun.com 	list_link_init(&ref->zref_linkage);
2630*13096SJordan.Vaughan@Sun.com }
2631*13096SJordan.Vaughan@Sun.com 
2632*13096SJordan.Vaughan@Sun.com /*
2633*13096SJordan.Vaughan@Sun.com  * Acquire a reference to zone z.  The caller must specify the
2634*13096SJordan.Vaughan@Sun.com  * zone_ref_subsys_t constant associated with its subsystem.  The specified
2635*13096SJordan.Vaughan@Sun.com  * zone_ref_t structure will represent a reference to the specified zone.  Use
2636*13096SJordan.Vaughan@Sun.com  * zone_rele_ref() to release the reference.
2637*13096SJordan.Vaughan@Sun.com  *
2638*13096SJordan.Vaughan@Sun.com  * The referenced zone_t structure will not be freed as long as the zone_t's
2639*13096SJordan.Vaughan@Sun.com  * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
2640*13096SJordan.Vaughan@Sun.com  * references.
2641*13096SJordan.Vaughan@Sun.com  *
2642*13096SJordan.Vaughan@Sun.com  * NOTE: The zone_ref_t structure must be initialized before it is used.
2643*13096SJordan.Vaughan@Sun.com  * See zone_init_ref() above.
2644*13096SJordan.Vaughan@Sun.com  */
2645*13096SJordan.Vaughan@Sun.com void
zone_hold_ref(zone_t * z,zone_ref_t * ref,zone_ref_subsys_t subsys)2646*13096SJordan.Vaughan@Sun.com zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2647*13096SJordan.Vaughan@Sun.com {
2648*13096SJordan.Vaughan@Sun.com 	ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
2649*13096SJordan.Vaughan@Sun.com 
2650*13096SJordan.Vaughan@Sun.com 	/*
2651*13096SJordan.Vaughan@Sun.com 	 * Prevent consumers from reusing a reference structure before
2652*13096SJordan.Vaughan@Sun.com 	 * releasing it.
2653*13096SJordan.Vaughan@Sun.com 	 */
2654*13096SJordan.Vaughan@Sun.com 	VERIFY(ref->zref_zone == NULL);
2655*13096SJordan.Vaughan@Sun.com 
2656*13096SJordan.Vaughan@Sun.com 	ref->zref_zone = z;
2657*13096SJordan.Vaughan@Sun.com 	mutex_enter(&z->zone_lock);
2658*13096SJordan.Vaughan@Sun.com 	zone_hold_locked(z);
2659*13096SJordan.Vaughan@Sun.com 	z->zone_subsys_ref[subsys]++;
2660*13096SJordan.Vaughan@Sun.com 	ASSERT(z->zone_subsys_ref[subsys] != 0);
2661*13096SJordan.Vaughan@Sun.com 	list_insert_head(&z->zone_ref_list, ref);
2662*13096SJordan.Vaughan@Sun.com 	mutex_exit(&z->zone_lock);
2663*13096SJordan.Vaughan@Sun.com }
2664*13096SJordan.Vaughan@Sun.com 
2665*13096SJordan.Vaughan@Sun.com /*
2666*13096SJordan.Vaughan@Sun.com  * Release the zone reference represented by the specified zone_ref_t.
2667*13096SJordan.Vaughan@Sun.com  * The reference is invalid after it's released; however, the zone_ref_t
2668*13096SJordan.Vaughan@Sun.com  * structure can be reused without having to invoke zone_init_ref().
2669*13096SJordan.Vaughan@Sun.com  * subsys should be the same value that was passed to zone_hold_ref()
2670*13096SJordan.Vaughan@Sun.com  * when the reference was acquired.
2671*13096SJordan.Vaughan@Sun.com  */
2672*13096SJordan.Vaughan@Sun.com void
zone_rele_ref(zone_ref_t * ref,zone_ref_subsys_t subsys)2673*13096SJordan.Vaughan@Sun.com zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
2674*13096SJordan.Vaughan@Sun.com {
2675*13096SJordan.Vaughan@Sun.com 	zone_rele_common(ref->zref_zone, ref, subsys);
2676*13096SJordan.Vaughan@Sun.com 
2677*13096SJordan.Vaughan@Sun.com 	/*
2678*13096SJordan.Vaughan@Sun.com 	 * Set the zone_ref_t's zref_zone field to NULL to generate panics
2679*13096SJordan.Vaughan@Sun.com 	 * when consumers dereference the reference.  This helps us catch
2680*13096SJordan.Vaughan@Sun.com 	 * consumers who use released references.  Furthermore, this lets
2681*13096SJordan.Vaughan@Sun.com 	 * consumers reuse the zone_ref_t structure without having to
2682*13096SJordan.Vaughan@Sun.com 	 * invoke zone_init_ref().
2683*13096SJordan.Vaughan@Sun.com 	 */
2684*13096SJordan.Vaughan@Sun.com 	ref->zref_zone = NULL;
2685*13096SJordan.Vaughan@Sun.com }
2686*13096SJordan.Vaughan@Sun.com 
26870Sstevel@tonic-gate void
zone_cred_hold(zone_t * z)26880Sstevel@tonic-gate zone_cred_hold(zone_t *z)
26890Sstevel@tonic-gate {
26900Sstevel@tonic-gate 	mutex_enter(&z->zone_lock);
26910Sstevel@tonic-gate 	z->zone_cred_ref++;
26920Sstevel@tonic-gate 	ASSERT(z->zone_cred_ref != 0);
26930Sstevel@tonic-gate 	mutex_exit(&z->zone_lock);
26940Sstevel@tonic-gate }
26950Sstevel@tonic-gate 
26960Sstevel@tonic-gate void
zone_cred_rele(zone_t * z)26970Sstevel@tonic-gate zone_cred_rele(zone_t *z)
26980Sstevel@tonic-gate {
26990Sstevel@tonic-gate 	boolean_t wakeup;
27000Sstevel@tonic-gate 
27010Sstevel@tonic-gate 	mutex_enter(&z->zone_lock);
27020Sstevel@tonic-gate 	ASSERT(z->zone_cred_ref != 0);
27030Sstevel@tonic-gate 	z->zone_cred_ref--;
27040Sstevel@tonic-gate 	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
27050Sstevel@tonic-gate 		/* no more refs, free the structure */
27060Sstevel@tonic-gate 		mutex_exit(&z->zone_lock);
27070Sstevel@tonic-gate 		zone_free(z);
27080Sstevel@tonic-gate 		return;
27090Sstevel@tonic-gate 	}
27100Sstevel@tonic-gate 	/*
27110Sstevel@tonic-gate 	 * If zone_destroy is waiting for the cred references to drain
27120Sstevel@tonic-gate 	 * out, and they have, signal it.
27130Sstevel@tonic-gate 	 */
27140Sstevel@tonic-gate 	wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
27150Sstevel@tonic-gate 	    zone_status_get(z) >= ZONE_IS_DEAD);
27160Sstevel@tonic-gate 	mutex_exit(&z->zone_lock);
27170Sstevel@tonic-gate 
27180Sstevel@tonic-gate 	if (wakeup) {
27190Sstevel@tonic-gate 		/*
27200Sstevel@tonic-gate 		 * Grabbing zonehash_lock here effectively synchronizes with
27210Sstevel@tonic-gate 		 * zone_destroy() to avoid missed signals.
27220Sstevel@tonic-gate 		 */
27230Sstevel@tonic-gate 		mutex_enter(&zonehash_lock);
27240Sstevel@tonic-gate 		cv_broadcast(&zone_destroy_cv);
27250Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
27260Sstevel@tonic-gate 	}
27270Sstevel@tonic-gate }
27280Sstevel@tonic-gate 
27290Sstevel@tonic-gate void
zone_task_hold(zone_t * z)27300Sstevel@tonic-gate zone_task_hold(zone_t *z)
27310Sstevel@tonic-gate {
27320Sstevel@tonic-gate 	mutex_enter(&z->zone_lock);
27330Sstevel@tonic-gate 	z->zone_ntasks++;
27340Sstevel@tonic-gate 	ASSERT(z->zone_ntasks != 0);
27350Sstevel@tonic-gate 	mutex_exit(&z->zone_lock);
27360Sstevel@tonic-gate }
27370Sstevel@tonic-gate 
27380Sstevel@tonic-gate void
zone_task_rele(zone_t * zone)27390Sstevel@tonic-gate zone_task_rele(zone_t *zone)
27400Sstevel@tonic-gate {
27410Sstevel@tonic-gate 	uint_t refcnt;
27420Sstevel@tonic-gate 
27430Sstevel@tonic-gate 	mutex_enter(&zone->zone_lock);
27440Sstevel@tonic-gate 	ASSERT(zone->zone_ntasks != 0);
27450Sstevel@tonic-gate 	refcnt = --zone->zone_ntasks;
27460Sstevel@tonic-gate 	if (refcnt > 1)	{	/* Common case */
27470Sstevel@tonic-gate 		mutex_exit(&zone->zone_lock);
27480Sstevel@tonic-gate 		return;
27490Sstevel@tonic-gate 	}
27500Sstevel@tonic-gate 	zone_hold_locked(zone);	/* so we can use the zone_t later */
27510Sstevel@tonic-gate 	mutex_exit(&zone->zone_lock);
27520Sstevel@tonic-gate 	if (refcnt == 1) {
27530Sstevel@tonic-gate 		/*
27540Sstevel@tonic-gate 		 * See if the zone is shutting down.
27550Sstevel@tonic-gate 		 */
27560Sstevel@tonic-gate 		mutex_enter(&zone_status_lock);
27570Sstevel@tonic-gate 		if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
27580Sstevel@tonic-gate 			goto out;
27590Sstevel@tonic-gate 		}
27600Sstevel@tonic-gate 
27610Sstevel@tonic-gate 		/*
27620Sstevel@tonic-gate 		 * Make sure the ntasks didn't change since we
27630Sstevel@tonic-gate 		 * dropped zone_lock.
27640Sstevel@tonic-gate 		 */
27650Sstevel@tonic-gate 		mutex_enter(&zone->zone_lock);
27660Sstevel@tonic-gate 		if (refcnt != zone->zone_ntasks) {
27670Sstevel@tonic-gate 			mutex_exit(&zone->zone_lock);
27680Sstevel@tonic-gate 			goto out;
27690Sstevel@tonic-gate 		}
27700Sstevel@tonic-gate 		mutex_exit(&zone->zone_lock);
27710Sstevel@tonic-gate 
27720Sstevel@tonic-gate 		/*
27730Sstevel@tonic-gate 		 * No more user processes in the zone.  The zone is empty.
27740Sstevel@tonic-gate 		 */
27750Sstevel@tonic-gate 		zone_status_set(zone, ZONE_IS_EMPTY);
27760Sstevel@tonic-gate 		goto out;
27770Sstevel@tonic-gate 	}
27780Sstevel@tonic-gate 
27790Sstevel@tonic-gate 	ASSERT(refcnt == 0);
27800Sstevel@tonic-gate 	/*
27810Sstevel@tonic-gate 	 * zsched has exited; the zone is dead.
27820Sstevel@tonic-gate 	 */
27830Sstevel@tonic-gate 	zone->zone_zsched = NULL;		/* paranoia */
27840Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
27850Sstevel@tonic-gate 	zone_status_set(zone, ZONE_IS_DEAD);
27860Sstevel@tonic-gate out:
27870Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
27880Sstevel@tonic-gate 	zone_rele(zone);
27890Sstevel@tonic-gate }
27900Sstevel@tonic-gate 
27910Sstevel@tonic-gate zoneid_t
getzoneid(void)27920Sstevel@tonic-gate getzoneid(void)
27930Sstevel@tonic-gate {
27940Sstevel@tonic-gate 	return (curproc->p_zone->zone_id);
27950Sstevel@tonic-gate }
27960Sstevel@tonic-gate 
27970Sstevel@tonic-gate /*
27980Sstevel@tonic-gate  * Internal versions of zone_find_by_*().  These don't zone_hold() or
27990Sstevel@tonic-gate  * check the validity of a zone's state.
28000Sstevel@tonic-gate  */
28010Sstevel@tonic-gate static zone_t *
zone_find_all_by_id(zoneid_t zoneid)28020Sstevel@tonic-gate zone_find_all_by_id(zoneid_t zoneid)
28030Sstevel@tonic-gate {
28040Sstevel@tonic-gate 	mod_hash_val_t hv;
28050Sstevel@tonic-gate 	zone_t *zone = NULL;
28060Sstevel@tonic-gate 
28070Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&zonehash_lock));
28080Sstevel@tonic-gate 
28090Sstevel@tonic-gate 	if (mod_hash_find(zonehashbyid,
28100Sstevel@tonic-gate 	    (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
28110Sstevel@tonic-gate 		zone = (zone_t *)hv;
28120Sstevel@tonic-gate 	return (zone);
28130Sstevel@tonic-gate }
28140Sstevel@tonic-gate 
28150Sstevel@tonic-gate static zone_t *
zone_find_all_by_label(const ts_label_t * label)28161676Sjpk zone_find_all_by_label(const ts_label_t *label)
28171676Sjpk {
28181676Sjpk 	mod_hash_val_t hv;
28191676Sjpk 	zone_t *zone = NULL;
28201676Sjpk 
28211676Sjpk 	ASSERT(MUTEX_HELD(&zonehash_lock));
28221676Sjpk 
28231676Sjpk 	/*
28241676Sjpk 	 * zonehashbylabel is not maintained for unlabeled systems
28251676Sjpk 	 */
28261676Sjpk 	if (!is_system_labeled())
28271676Sjpk 		return (NULL);
28281676Sjpk 	if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
28291676Sjpk 		zone = (zone_t *)hv;
28301676Sjpk 	return (zone);
28311676Sjpk }
28321676Sjpk 
28331676Sjpk static zone_t *
zone_find_all_by_name(char * name)28340Sstevel@tonic-gate zone_find_all_by_name(char *name)
28350Sstevel@tonic-gate {
28360Sstevel@tonic-gate 	mod_hash_val_t hv;
28370Sstevel@tonic-gate 	zone_t *zone = NULL;
28380Sstevel@tonic-gate 
28390Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&zonehash_lock));
28400Sstevel@tonic-gate 
28410Sstevel@tonic-gate 	if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
28420Sstevel@tonic-gate 		zone = (zone_t *)hv;
28430Sstevel@tonic-gate 	return (zone);
28440Sstevel@tonic-gate }
28450Sstevel@tonic-gate 
28460Sstevel@tonic-gate /*
28470Sstevel@tonic-gate  * Public interface for looking up a zone by zoneid.  Only returns the zone if
28480Sstevel@tonic-gate  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
28490Sstevel@tonic-gate  * Caller must call zone_rele() once it is done with the zone.
28500Sstevel@tonic-gate  *
28510Sstevel@tonic-gate  * The zone may begin the zone_destroy() sequence immediately after this
28520Sstevel@tonic-gate  * function returns, but may be safely used until zone_rele() is called.
28530Sstevel@tonic-gate  */
28540Sstevel@tonic-gate zone_t *
zone_find_by_id(zoneid_t zoneid)28550Sstevel@tonic-gate zone_find_by_id(zoneid_t zoneid)
28560Sstevel@tonic-gate {
28570Sstevel@tonic-gate 	zone_t *zone;
28580Sstevel@tonic-gate 	zone_status_t status;
28590Sstevel@tonic-gate 
28600Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
28610Sstevel@tonic-gate 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
28620Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
28630Sstevel@tonic-gate 		return (NULL);
28640Sstevel@tonic-gate 	}
28650Sstevel@tonic-gate 	status = zone_status_get(zone);
28660Sstevel@tonic-gate 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
28670Sstevel@tonic-gate 		/*
28680Sstevel@tonic-gate 		 * For all practical purposes the zone doesn't exist.
28690Sstevel@tonic-gate 		 */
28700Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
28710Sstevel@tonic-gate 		return (NULL);
28720Sstevel@tonic-gate 	}
28730Sstevel@tonic-gate 	zone_hold(zone);
28740Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
28750Sstevel@tonic-gate 	return (zone);
28760Sstevel@tonic-gate }
28770Sstevel@tonic-gate 
28780Sstevel@tonic-gate /*
28791676Sjpk  * Similar to zone_find_by_id, but using zone label as the key.
28801676Sjpk  */
28811676Sjpk zone_t *
zone_find_by_label(const ts_label_t * label)28821676Sjpk zone_find_by_label(const ts_label_t *label)
28831676Sjpk {
28841676Sjpk 	zone_t *zone;
28852110Srica 	zone_status_t status;
28861676Sjpk 
28871676Sjpk 	mutex_enter(&zonehash_lock);
28881676Sjpk 	if ((zone = zone_find_all_by_label(label)) == NULL) {
28891676Sjpk 		mutex_exit(&zonehash_lock);
28901676Sjpk 		return (NULL);
28911676Sjpk 	}
28922110Srica 
28932110Srica 	status = zone_status_get(zone);
28942110Srica 	if (status > ZONE_IS_DOWN) {
28951676Sjpk 		/*
28961676Sjpk 		 * For all practical purposes the zone doesn't exist.
28971676Sjpk 		 */
28982110Srica 		mutex_exit(&zonehash_lock);
28992110Srica 		return (NULL);
29001676Sjpk 	}
29012110Srica 	zone_hold(zone);
29021676Sjpk 	mutex_exit(&zonehash_lock);
29031676Sjpk 	return (zone);
29041676Sjpk }
29051676Sjpk 
29061676Sjpk /*
29070Sstevel@tonic-gate  * Similar to zone_find_by_id, but using zone name as the key.
29080Sstevel@tonic-gate  */
29090Sstevel@tonic-gate zone_t *
zone_find_by_name(char * name)29100Sstevel@tonic-gate zone_find_by_name(char *name)
29110Sstevel@tonic-gate {
29120Sstevel@tonic-gate 	zone_t *zone;
29130Sstevel@tonic-gate 	zone_status_t status;
29140Sstevel@tonic-gate 
29150Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
29160Sstevel@tonic-gate 	if ((zone = zone_find_all_by_name(name)) == NULL) {
29170Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
29180Sstevel@tonic-gate 		return (NULL);
29190Sstevel@tonic-gate 	}
29200Sstevel@tonic-gate 	status = zone_status_get(zone);
29210Sstevel@tonic-gate 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
29220Sstevel@tonic-gate 		/*
29230Sstevel@tonic-gate 		 * For all practical purposes the zone doesn't exist.
29240Sstevel@tonic-gate 		 */
29250Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
29260Sstevel@tonic-gate 		return (NULL);
29270Sstevel@tonic-gate 	}
29280Sstevel@tonic-gate 	zone_hold(zone);
29290Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
29300Sstevel@tonic-gate 	return (zone);
29310Sstevel@tonic-gate }
29320Sstevel@tonic-gate 
29330Sstevel@tonic-gate /*
29340Sstevel@tonic-gate  * Similar to zone_find_by_id(), using the path as a key.  For instance,
29350Sstevel@tonic-gate  * if there is a zone "foo" rooted at /foo/root, and the path argument
29360Sstevel@tonic-gate  * is "/foo/root/proc", it will return the held zone_t corresponding to
29370Sstevel@tonic-gate  * zone "foo".
29380Sstevel@tonic-gate  *
29390Sstevel@tonic-gate  * zone_find_by_path() always returns a non-NULL value, since at the
29400Sstevel@tonic-gate  * very least every path will be contained in the global zone.
29410Sstevel@tonic-gate  *
29420Sstevel@tonic-gate  * As with the other zone_find_by_*() functions, the caller is
29430Sstevel@tonic-gate  * responsible for zone_rele()ing the return value of this function.
29440Sstevel@tonic-gate  */
29450Sstevel@tonic-gate zone_t *
zone_find_by_path(const char * path)29460Sstevel@tonic-gate zone_find_by_path(const char *path)
29470Sstevel@tonic-gate {
29480Sstevel@tonic-gate 	zone_t *zone;
29490Sstevel@tonic-gate 	zone_t *zret = NULL;
29500Sstevel@tonic-gate 	zone_status_t status;
29510Sstevel@tonic-gate 
29520Sstevel@tonic-gate 	if (path == NULL) {
29530Sstevel@tonic-gate 		/*
29540Sstevel@tonic-gate 		 * Call from rootconf().
29550Sstevel@tonic-gate 		 */
29560Sstevel@tonic-gate 		zone_hold(global_zone);
29570Sstevel@tonic-gate 		return (global_zone);
29580Sstevel@tonic-gate 	}
29590Sstevel@tonic-gate 	ASSERT(*path == '/');
29600Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
29610Sstevel@tonic-gate 	for (zone = list_head(&zone_active); zone != NULL;
29620Sstevel@tonic-gate 	    zone = list_next(&zone_active, zone)) {
29630Sstevel@tonic-gate 		if (ZONE_PATH_VISIBLE(path, zone))
29640Sstevel@tonic-gate 			zret = zone;
29650Sstevel@tonic-gate 	}
29660Sstevel@tonic-gate 	ASSERT(zret != NULL);
29670Sstevel@tonic-gate 	status = zone_status_get(zret);
29680Sstevel@tonic-gate 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
29690Sstevel@tonic-gate 		/*
29700Sstevel@tonic-gate 		 * Zone practically doesn't exist.
29710Sstevel@tonic-gate 		 */
29720Sstevel@tonic-gate 		zret = global_zone;
29730Sstevel@tonic-gate 	}
29740Sstevel@tonic-gate 	zone_hold(zret);
29750Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
29760Sstevel@tonic-gate 	return (zret);
29770Sstevel@tonic-gate }
29780Sstevel@tonic-gate 
29790Sstevel@tonic-gate /*
29800Sstevel@tonic-gate  * Get the number of cpus visible to this zone.  The system-wide global
29810Sstevel@tonic-gate  * 'ncpus' is returned if pools are disabled, the caller is in the
29820Sstevel@tonic-gate  * global zone, or a NULL zone argument is passed in.
29830Sstevel@tonic-gate  */
29840Sstevel@tonic-gate int
zone_ncpus_get(zone_t * zone)29850Sstevel@tonic-gate zone_ncpus_get(zone_t *zone)
29860Sstevel@tonic-gate {
29870Sstevel@tonic-gate 	int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
29880Sstevel@tonic-gate 
29890Sstevel@tonic-gate 	return (myncpus != 0 ? myncpus : ncpus);
29900Sstevel@tonic-gate }
29910Sstevel@tonic-gate 
29920Sstevel@tonic-gate /*
29930Sstevel@tonic-gate  * Get the number of online cpus visible to this zone.  The system-wide
29940Sstevel@tonic-gate  * global 'ncpus_online' is returned if pools are disabled, the caller
29950Sstevel@tonic-gate  * is in the global zone, or a NULL zone argument is passed in.
29960Sstevel@tonic-gate  */
29970Sstevel@tonic-gate int
zone_ncpus_online_get(zone_t * zone)29980Sstevel@tonic-gate zone_ncpus_online_get(zone_t *zone)
29990Sstevel@tonic-gate {
30000Sstevel@tonic-gate 	int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
30010Sstevel@tonic-gate 
30020Sstevel@tonic-gate 	return (myncpus_online != 0 ? myncpus_online : ncpus_online);
30030Sstevel@tonic-gate }
30040Sstevel@tonic-gate 
30050Sstevel@tonic-gate /*
30060Sstevel@tonic-gate  * Return the pool to which the zone is currently bound.
30070Sstevel@tonic-gate  */
30080Sstevel@tonic-gate pool_t *
zone_pool_get(zone_t * zone)30090Sstevel@tonic-gate zone_pool_get(zone_t *zone)
30100Sstevel@tonic-gate {
30110Sstevel@tonic-gate 	ASSERT(pool_lock_held());
30120Sstevel@tonic-gate 
30130Sstevel@tonic-gate 	return (zone->zone_pool);
30140Sstevel@tonic-gate }
30150Sstevel@tonic-gate 
30160Sstevel@tonic-gate /*
30170Sstevel@tonic-gate  * Set the zone's pool pointer and update the zone's visibility to match
30180Sstevel@tonic-gate  * the resources in the new pool.
30190Sstevel@tonic-gate  */
30200Sstevel@tonic-gate void
zone_pool_set(zone_t * zone,pool_t * pool)30210Sstevel@tonic-gate zone_pool_set(zone_t *zone, pool_t *pool)
30220Sstevel@tonic-gate {
30230Sstevel@tonic-gate 	ASSERT(pool_lock_held());
30240Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock));
30250Sstevel@tonic-gate 
30260Sstevel@tonic-gate 	zone->zone_pool = pool;
30270Sstevel@tonic-gate 	zone_pset_set(zone, pool->pool_pset->pset_id);
30280Sstevel@tonic-gate }
30290Sstevel@tonic-gate 
30300Sstevel@tonic-gate /*
30310Sstevel@tonic-gate  * Return the cached value of the id of the processor set to which the
30320Sstevel@tonic-gate  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
30330Sstevel@tonic-gate  * facility is disabled.
30340Sstevel@tonic-gate  */
30350Sstevel@tonic-gate psetid_t
zone_pset_get(zone_t * zone)30360Sstevel@tonic-gate zone_pset_get(zone_t *zone)
30370Sstevel@tonic-gate {
30380Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock));
30390Sstevel@tonic-gate 
30400Sstevel@tonic-gate 	return (zone->zone_psetid);
30410Sstevel@tonic-gate }
30420Sstevel@tonic-gate 
30430Sstevel@tonic-gate /*
30440Sstevel@tonic-gate  * Set the cached value of the id of the processor set to which the zone
30450Sstevel@tonic-gate  * is currently bound.  Also update the zone's visibility to match the
30460Sstevel@tonic-gate  * resources in the new processor set.
30470Sstevel@tonic-gate  */
30480Sstevel@tonic-gate void
zone_pset_set(zone_t * zone,psetid_t newpsetid)30490Sstevel@tonic-gate zone_pset_set(zone_t *zone, psetid_t newpsetid)
30500Sstevel@tonic-gate {
30510Sstevel@tonic-gate 	psetid_t oldpsetid;
30520Sstevel@tonic-gate 
30530Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock));
30540Sstevel@tonic-gate 	oldpsetid = zone_pset_get(zone);
30550Sstevel@tonic-gate 
30560Sstevel@tonic-gate 	if (oldpsetid == newpsetid)
30570Sstevel@tonic-gate 		return;
30580Sstevel@tonic-gate 	/*
30590Sstevel@tonic-gate 	 * Global zone sees all.
30600Sstevel@tonic-gate 	 */
30610Sstevel@tonic-gate 	if (zone != global_zone) {
30620Sstevel@tonic-gate 		zone->zone_psetid = newpsetid;
30630Sstevel@tonic-gate 		if (newpsetid != ZONE_PS_INVAL)
30640Sstevel@tonic-gate 			pool_pset_visibility_add(newpsetid, zone);
30650Sstevel@tonic-gate 		if (oldpsetid != ZONE_PS_INVAL)
30660Sstevel@tonic-gate 			pool_pset_visibility_remove(oldpsetid, zone);
30670Sstevel@tonic-gate 	}
30680Sstevel@tonic-gate 	/*
30690Sstevel@tonic-gate 	 * Disabling pools, so we should start using the global values
30700Sstevel@tonic-gate 	 * for ncpus and ncpus_online.
30710Sstevel@tonic-gate 	 */
30720Sstevel@tonic-gate 	if (newpsetid == ZONE_PS_INVAL) {
30730Sstevel@tonic-gate 		zone->zone_ncpus = 0;
30740Sstevel@tonic-gate 		zone->zone_ncpus_online = 0;
30750Sstevel@tonic-gate 	}
30760Sstevel@tonic-gate }
30770Sstevel@tonic-gate 
30780Sstevel@tonic-gate /*
30790Sstevel@tonic-gate  * Walk the list of active zones and issue the provided callback for
30800Sstevel@tonic-gate  * each of them.
30810Sstevel@tonic-gate  *
30820Sstevel@tonic-gate  * Caller must not be holding any locks that may be acquired under
30830Sstevel@tonic-gate  * zonehash_lock.  See comment at the beginning of the file for a list of
30840Sstevel@tonic-gate  * common locks and their interactions with zones.
30850Sstevel@tonic-gate  */
30860Sstevel@tonic-gate int
zone_walk(int (* cb)(zone_t *,void *),void * data)30870Sstevel@tonic-gate zone_walk(int (*cb)(zone_t *, void *), void *data)
30880Sstevel@tonic-gate {
30890Sstevel@tonic-gate 	zone_t *zone;
30900Sstevel@tonic-gate 	int ret = 0;
30910Sstevel@tonic-gate 	zone_status_t status;
30920Sstevel@tonic-gate 
30930Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
30940Sstevel@tonic-gate 	for (zone = list_head(&zone_active); zone != NULL;
30950Sstevel@tonic-gate 	    zone = list_next(&zone_active, zone)) {
30960Sstevel@tonic-gate 		/*
30970Sstevel@tonic-gate 		 * Skip zones that shouldn't be externally visible.
30980Sstevel@tonic-gate 		 */
30990Sstevel@tonic-gate 		status = zone_status_get(zone);
31000Sstevel@tonic-gate 		if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
31010Sstevel@tonic-gate 			continue;
31020Sstevel@tonic-gate 		/*
31030Sstevel@tonic-gate 		 * Bail immediately if any callback invocation returns a
31040Sstevel@tonic-gate 		 * non-zero value.
31050Sstevel@tonic-gate 		 */
31060Sstevel@tonic-gate 		ret = (*cb)(zone, data);
31070Sstevel@tonic-gate 		if (ret != 0)
31080Sstevel@tonic-gate 			break;
31090Sstevel@tonic-gate 	}
31100Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
31110Sstevel@tonic-gate 	return (ret);
31120Sstevel@tonic-gate }
31130Sstevel@tonic-gate 
31140Sstevel@tonic-gate static int
zone_set_root(zone_t * zone,const char * upath)31150Sstevel@tonic-gate zone_set_root(zone_t *zone, const char *upath)
31160Sstevel@tonic-gate {
31170Sstevel@tonic-gate 	vnode_t *vp;
31180Sstevel@tonic-gate 	int trycount;
31190Sstevel@tonic-gate 	int error = 0;
31200Sstevel@tonic-gate 	char *path;
31210Sstevel@tonic-gate 	struct pathname upn, pn;
31220Sstevel@tonic-gate 	size_t pathlen;
31230Sstevel@tonic-gate 
31240Sstevel@tonic-gate 	if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
31250Sstevel@tonic-gate 		return (error);
31260Sstevel@tonic-gate 
31270Sstevel@tonic-gate 	pn_alloc(&pn);
31280Sstevel@tonic-gate 
31290Sstevel@tonic-gate 	/* prevent infinite loop */
31300Sstevel@tonic-gate 	trycount = 10;
31310Sstevel@tonic-gate 	for (;;) {
31320Sstevel@tonic-gate 		if (--trycount <= 0) {
31330Sstevel@tonic-gate 			error = ESTALE;
31340Sstevel@tonic-gate 			goto out;
31350Sstevel@tonic-gate 		}
31360Sstevel@tonic-gate 
31370Sstevel@tonic-gate 		if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
31380Sstevel@tonic-gate 			/*
31390Sstevel@tonic-gate 			 * VOP_ACCESS() may cover 'vp' with a new
31400Sstevel@tonic-gate 			 * filesystem, if 'vp' is an autoFS vnode.
31410Sstevel@tonic-gate 			 * Get the new 'vp' if so.
31420Sstevel@tonic-gate 			 */
31435331Samw 			if ((error =
31445331Samw 			    VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
31454417Seh208807 			    (!vn_ismntpt(vp) ||
31460Sstevel@tonic-gate 			    (error = traverse(&vp)) == 0)) {
31470Sstevel@tonic-gate 				pathlen = pn.pn_pathlen + 2;
31480Sstevel@tonic-gate 				path = kmem_alloc(pathlen, KM_SLEEP);
31490Sstevel@tonic-gate 				(void) strncpy(path, pn.pn_path,
31500Sstevel@tonic-gate 				    pn.pn_pathlen + 1);
31510Sstevel@tonic-gate 				path[pathlen - 2] = '/';
31520Sstevel@tonic-gate 				path[pathlen - 1] = '\0';
31530Sstevel@tonic-gate 				pn_free(&pn);
31540Sstevel@tonic-gate 				pn_free(&upn);
31550Sstevel@tonic-gate 
31560Sstevel@tonic-gate 				/* Success! */
31570Sstevel@tonic-gate 				break;
31580Sstevel@tonic-gate 			}
31590Sstevel@tonic-gate 			VN_RELE(vp);
31600Sstevel@tonic-gate 		}
31610Sstevel@tonic-gate 		if (error != ESTALE)
31620Sstevel@tonic-gate 			goto out;
31630Sstevel@tonic-gate 	}
31640Sstevel@tonic-gate 
31650Sstevel@tonic-gate 	ASSERT(error == 0);
31660Sstevel@tonic-gate 	zone->zone_rootvp = vp;		/* we hold a reference to vp */
31670Sstevel@tonic-gate 	zone->zone_rootpath = path;
31680Sstevel@tonic-gate 	zone->zone_rootpathlen = pathlen;
31691769Scarlsonj 	if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
31701769Scarlsonj 		zone->zone_flags |= ZF_IS_SCRATCH;
31710Sstevel@tonic-gate 	return (0);
31720Sstevel@tonic-gate 
31730Sstevel@tonic-gate out:
31740Sstevel@tonic-gate 	pn_free(&pn);
31750Sstevel@tonic-gate 	pn_free(&upn);
31760Sstevel@tonic-gate 	return (error);
31770Sstevel@tonic-gate }
31780Sstevel@tonic-gate 
/*
 * File-local character classifier: non-zero iff 'c' lies in one of the
 * ranges '0'-'9', 'a'-'z' or 'A'-'Z'.  The argument is expanded several
 * times, so it must not have side effects.
 */
#define	isalnum(c)	\
	(((c) >= '0' && (c) <= '9') || \
	((c) >= 'a' && (c) <= 'z') || \
	((c) >= 'A' && (c) <= 'Z'))
31820Sstevel@tonic-gate 
31830Sstevel@tonic-gate static int
zone_set_name(zone_t * zone,const char * uname)31840Sstevel@tonic-gate zone_set_name(zone_t *zone, const char *uname)
31850Sstevel@tonic-gate {
31860Sstevel@tonic-gate 	char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
31870Sstevel@tonic-gate 	size_t len;
31880Sstevel@tonic-gate 	int i, err;
31890Sstevel@tonic-gate 
31900Sstevel@tonic-gate 	if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
31910Sstevel@tonic-gate 		kmem_free(kname, ZONENAME_MAX);
31920Sstevel@tonic-gate 		return (err);	/* EFAULT or ENAMETOOLONG */
31930Sstevel@tonic-gate 	}
31940Sstevel@tonic-gate 
31950Sstevel@tonic-gate 	/* must be less than ZONENAME_MAX */
31960Sstevel@tonic-gate 	if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
31970Sstevel@tonic-gate 		kmem_free(kname, ZONENAME_MAX);
31980Sstevel@tonic-gate 		return (EINVAL);
31990Sstevel@tonic-gate 	}
32000Sstevel@tonic-gate 
32010Sstevel@tonic-gate 	/*
32020Sstevel@tonic-gate 	 * Name must start with an alphanumeric and must contain only
32030Sstevel@tonic-gate 	 * alphanumerics, '-', '_' and '.'.
32040Sstevel@tonic-gate 	 */
32050Sstevel@tonic-gate 	if (!isalnum(kname[0])) {
32060Sstevel@tonic-gate 		kmem_free(kname, ZONENAME_MAX);
32070Sstevel@tonic-gate 		return (EINVAL);
32080Sstevel@tonic-gate 	}
32090Sstevel@tonic-gate 	for (i = 1; i < len - 1; i++) {
32100Sstevel@tonic-gate 		if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
32110Sstevel@tonic-gate 		    kname[i] != '.') {
32120Sstevel@tonic-gate 			kmem_free(kname, ZONENAME_MAX);
32130Sstevel@tonic-gate 			return (EINVAL);
32140Sstevel@tonic-gate 		}
32150Sstevel@tonic-gate 	}
32160Sstevel@tonic-gate 
32170Sstevel@tonic-gate 	zone->zone_name = kname;
32180Sstevel@tonic-gate 	return (0);
32190Sstevel@tonic-gate }
32200Sstevel@tonic-gate 
32210Sstevel@tonic-gate /*
32228662SJordan.Vaughan@Sun.com  * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
32238662SJordan.Vaughan@Sun.com  * is NULL or it points to a zone with no hostid emulation, then the machine's
32248662SJordan.Vaughan@Sun.com  * hostid (i.e., the global zone's hostid) is returned.  This function returns
32258662SJordan.Vaughan@Sun.com  * zero if neither the zone nor the host machine (global zone) have hostids.  It
32268662SJordan.Vaughan@Sun.com  * returns HW_INVALID_HOSTID if the function attempts to return the machine's
32278662SJordan.Vaughan@Sun.com  * hostid and the machine's hostid is invalid.
32288662SJordan.Vaughan@Sun.com  */
32298662SJordan.Vaughan@Sun.com uint32_t
zone_get_hostid(zone_t * zonep)32308662SJordan.Vaughan@Sun.com zone_get_hostid(zone_t *zonep)
32318662SJordan.Vaughan@Sun.com {
32328662SJordan.Vaughan@Sun.com 	unsigned long machine_hostid;
32338662SJordan.Vaughan@Sun.com 
32348662SJordan.Vaughan@Sun.com 	if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
32358662SJordan.Vaughan@Sun.com 		if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
32368662SJordan.Vaughan@Sun.com 			return (HW_INVALID_HOSTID);
32378662SJordan.Vaughan@Sun.com 		return ((uint32_t)machine_hostid);
32388662SJordan.Vaughan@Sun.com 	}
32398662SJordan.Vaughan@Sun.com 	return (zonep->zone_hostid);
32408662SJordan.Vaughan@Sun.com }
32418662SJordan.Vaughan@Sun.com 
32428662SJordan.Vaughan@Sun.com /*
32430Sstevel@tonic-gate  * Similar to thread_create(), but makes sure the thread is in the appropriate
32440Sstevel@tonic-gate  * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
32450Sstevel@tonic-gate  */
32460Sstevel@tonic-gate /*ARGSUSED*/
32470Sstevel@tonic-gate kthread_t *
zthread_create(caddr_t stk,size_t stksize,void (* proc)(),void * arg,size_t len,pri_t pri)32480Sstevel@tonic-gate zthread_create(
32490Sstevel@tonic-gate     caddr_t stk,
32500Sstevel@tonic-gate     size_t stksize,
32510Sstevel@tonic-gate     void (*proc)(),
32520Sstevel@tonic-gate     void *arg,
32530Sstevel@tonic-gate     size_t len,
32540Sstevel@tonic-gate     pri_t pri)
32550Sstevel@tonic-gate {
32560Sstevel@tonic-gate 	kthread_t *t;
32570Sstevel@tonic-gate 	zone_t *zone = curproc->p_zone;
32580Sstevel@tonic-gate 	proc_t *pp = zone->zone_zsched;
32590Sstevel@tonic-gate 
32600Sstevel@tonic-gate 	zone_hold(zone);	/* Reference to be dropped when thread exits */
32610Sstevel@tonic-gate 
32620Sstevel@tonic-gate 	/*
32630Sstevel@tonic-gate 	 * No-one should be trying to create threads if the zone is shutting
32640Sstevel@tonic-gate 	 * down and there aren't any kernel threads around.  See comment
32650Sstevel@tonic-gate 	 * in zthread_exit().
32660Sstevel@tonic-gate 	 */
32670Sstevel@tonic-gate 	ASSERT(!(zone->zone_kthreads == NULL &&
32680Sstevel@tonic-gate 	    zone_status_get(zone) >= ZONE_IS_EMPTY));
32690Sstevel@tonic-gate 	/*
32700Sstevel@tonic-gate 	 * Create a thread, but don't let it run until we've finished setting
32710Sstevel@tonic-gate 	 * things up.
32720Sstevel@tonic-gate 	 */
32730Sstevel@tonic-gate 	t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
32740Sstevel@tonic-gate 	ASSERT(t->t_forw == NULL);
32750Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
32760Sstevel@tonic-gate 	if (zone->zone_kthreads == NULL) {
32770Sstevel@tonic-gate 		t->t_forw = t->t_back = t;
32780Sstevel@tonic-gate 	} else {
32790Sstevel@tonic-gate 		kthread_t *tx = zone->zone_kthreads;
32800Sstevel@tonic-gate 
32810Sstevel@tonic-gate 		t->t_forw = tx;
32820Sstevel@tonic-gate 		t->t_back = tx->t_back;
32830Sstevel@tonic-gate 		tx->t_back->t_forw = t;
32840Sstevel@tonic-gate 		tx->t_back = t;
32850Sstevel@tonic-gate 	}
32860Sstevel@tonic-gate 	zone->zone_kthreads = t;
32870Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
32880Sstevel@tonic-gate 
32890Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
32900Sstevel@tonic-gate 	t->t_proc_flag |= TP_ZTHREAD;
32910Sstevel@tonic-gate 	project_rele(t->t_proj);
32920Sstevel@tonic-gate 	t->t_proj = project_hold(pp->p_task->tk_proj);
32930Sstevel@tonic-gate 
32940Sstevel@tonic-gate 	/*
32950Sstevel@tonic-gate 	 * Setup complete, let it run.
32960Sstevel@tonic-gate 	 */
32970Sstevel@tonic-gate 	thread_lock(t);
32980Sstevel@tonic-gate 	t->t_schedflag |= TS_ALLSTART;
32990Sstevel@tonic-gate 	setrun_locked(t);
33000Sstevel@tonic-gate 	thread_unlock(t);
33010Sstevel@tonic-gate 
33020Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
33030Sstevel@tonic-gate 
33040Sstevel@tonic-gate 	return (t);
33050Sstevel@tonic-gate }
33060Sstevel@tonic-gate 
33070Sstevel@tonic-gate /*
33080Sstevel@tonic-gate  * Similar to thread_exit().  Must be called by threads created via
33090Sstevel@tonic-gate  * zthread_exit().
33100Sstevel@tonic-gate  */
33110Sstevel@tonic-gate void
zthread_exit(void)33120Sstevel@tonic-gate zthread_exit(void)
33130Sstevel@tonic-gate {
33140Sstevel@tonic-gate 	kthread_t *t = curthread;
33150Sstevel@tonic-gate 	proc_t *pp = curproc;
33160Sstevel@tonic-gate 	zone_t *zone = pp->p_zone;
33170Sstevel@tonic-gate 
33180Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
33190Sstevel@tonic-gate 
33200Sstevel@tonic-gate 	/*
33210Sstevel@tonic-gate 	 * Reparent to p0
33220Sstevel@tonic-gate 	 */
33231075Sjosephb 	kpreempt_disable();
33240Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
33250Sstevel@tonic-gate 	t->t_proc_flag &= ~TP_ZTHREAD;
33260Sstevel@tonic-gate 	t->t_procp = &p0;
33270Sstevel@tonic-gate 	hat_thread_exit(t);
33280Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
33291075Sjosephb 	kpreempt_enable();
33300Sstevel@tonic-gate 
33310Sstevel@tonic-gate 	if (t->t_back == t) {
33320Sstevel@tonic-gate 		ASSERT(t->t_forw == t);
33330Sstevel@tonic-gate 		/*
33340Sstevel@tonic-gate 		 * If the zone is empty, once the thread count
33350Sstevel@tonic-gate 		 * goes to zero no further kernel threads can be
33360Sstevel@tonic-gate 		 * created.  This is because if the creator is a process
33370Sstevel@tonic-gate 		 * in the zone, then it must have exited before the zone
33380Sstevel@tonic-gate 		 * state could be set to ZONE_IS_EMPTY.
33390Sstevel@tonic-gate 		 * Otherwise, if the creator is a kernel thread in the
33400Sstevel@tonic-gate 		 * zone, the thread count is non-zero.
33410Sstevel@tonic-gate 		 *
33420Sstevel@tonic-gate 		 * This really means that non-zone kernel threads should
33430Sstevel@tonic-gate 		 * not create zone kernel threads.
33440Sstevel@tonic-gate 		 */
33450Sstevel@tonic-gate 		zone->zone_kthreads = NULL;
33460Sstevel@tonic-gate 		if (zone_status_get(zone) == ZONE_IS_EMPTY) {
33470Sstevel@tonic-gate 			zone_status_set(zone, ZONE_IS_DOWN);
33483792Sakolb 			/*
33493792Sakolb 			 * Remove any CPU caps on this zone.
33503792Sakolb 			 */
33513792Sakolb 			cpucaps_zone_remove(zone);
33520Sstevel@tonic-gate 		}
33530Sstevel@tonic-gate 	} else {
33540Sstevel@tonic-gate 		t->t_forw->t_back = t->t_back;
33550Sstevel@tonic-gate 		t->t_back->t_forw = t->t_forw;
33560Sstevel@tonic-gate 		if (zone->zone_kthreads == t)
33570Sstevel@tonic-gate 			zone->zone_kthreads = t->t_forw;
33580Sstevel@tonic-gate 	}
33590Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
33600Sstevel@tonic-gate 	zone_rele(zone);
33610Sstevel@tonic-gate 	thread_exit();
33620Sstevel@tonic-gate 	/* NOTREACHED */
33630Sstevel@tonic-gate }
33640Sstevel@tonic-gate 
33650Sstevel@tonic-gate static void
zone_chdir(vnode_t * vp,vnode_t ** vpp,proc_t * pp)33660Sstevel@tonic-gate zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
33670Sstevel@tonic-gate {
33680Sstevel@tonic-gate 	vnode_t *oldvp;
33690Sstevel@tonic-gate 
33700Sstevel@tonic-gate 	/* we're going to hold a reference here to the directory */
33710Sstevel@tonic-gate 	VN_HOLD(vp);
33720Sstevel@tonic-gate 
337311861SMarek.Pospisil@Sun.COM 	/* update abs cwd/root path see c2/audit.c */
337411861SMarek.Pospisil@Sun.COM 	if (AU_AUDITING())
33750Sstevel@tonic-gate 		audit_chdirec(vp, vpp);
33760Sstevel@tonic-gate 
33770Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
33780Sstevel@tonic-gate 	oldvp = *vpp;
33790Sstevel@tonic-gate 	*vpp = vp;
33800Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
33810Sstevel@tonic-gate 	if (oldvp != NULL)
33820Sstevel@tonic-gate 		VN_RELE(oldvp);
33830Sstevel@tonic-gate }
33840Sstevel@tonic-gate 
33850Sstevel@tonic-gate /*
33860Sstevel@tonic-gate  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
33870Sstevel@tonic-gate  */
33880Sstevel@tonic-gate static int
nvlist2rctlval(nvlist_t * nvl,rctl_val_t * rv)33890Sstevel@tonic-gate nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
33900Sstevel@tonic-gate {
33910Sstevel@tonic-gate 	nvpair_t *nvp = NULL;
33920Sstevel@tonic-gate 	boolean_t priv_set = B_FALSE;
33930Sstevel@tonic-gate 	boolean_t limit_set = B_FALSE;
33940Sstevel@tonic-gate 	boolean_t action_set = B_FALSE;
33950Sstevel@tonic-gate 
33960Sstevel@tonic-gate 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
33970Sstevel@tonic-gate 		const char *name;
33980Sstevel@tonic-gate 		uint64_t ui64;
33990Sstevel@tonic-gate 
34000Sstevel@tonic-gate 		name = nvpair_name(nvp);
34010Sstevel@tonic-gate 		if (nvpair_type(nvp) != DATA_TYPE_UINT64)
34020Sstevel@tonic-gate 			return (EINVAL);
34030Sstevel@tonic-gate 		(void) nvpair_value_uint64(nvp, &ui64);
34040Sstevel@tonic-gate 		if (strcmp(name, "privilege") == 0) {
34050Sstevel@tonic-gate 			/*
34060Sstevel@tonic-gate 			 * Currently only privileged values are allowed, but
34070Sstevel@tonic-gate 			 * this may change in the future.
34080Sstevel@tonic-gate 			 */
34090Sstevel@tonic-gate 			if (ui64 != RCPRIV_PRIVILEGED)
34100Sstevel@tonic-gate 				return (EINVAL);
34110Sstevel@tonic-gate 			rv->rcv_privilege = ui64;
34120Sstevel@tonic-gate 			priv_set = B_TRUE;
34130Sstevel@tonic-gate 		} else if (strcmp(name, "limit") == 0) {
34140Sstevel@tonic-gate 			rv->rcv_value = ui64;
34150Sstevel@tonic-gate 			limit_set = B_TRUE;
34160Sstevel@tonic-gate 		} else if (strcmp(name, "action") == 0) {
34170Sstevel@tonic-gate 			if (ui64 != RCTL_LOCAL_NOACTION &&
34180Sstevel@tonic-gate 			    ui64 != RCTL_LOCAL_DENY)
34190Sstevel@tonic-gate 				return (EINVAL);
34200Sstevel@tonic-gate 			rv->rcv_flagaction = ui64;
34210Sstevel@tonic-gate 			action_set = B_TRUE;
34220Sstevel@tonic-gate 		} else {
34230Sstevel@tonic-gate 			return (EINVAL);
34240Sstevel@tonic-gate 		}
34250Sstevel@tonic-gate 	}
34260Sstevel@tonic-gate 
34270Sstevel@tonic-gate 	if (!(priv_set && limit_set && action_set))
34280Sstevel@tonic-gate 		return (EINVAL);
34290Sstevel@tonic-gate 	rv->rcv_action_signal = 0;
34300Sstevel@tonic-gate 	rv->rcv_action_recipient = NULL;
34310Sstevel@tonic-gate 	rv->rcv_action_recip_pid = -1;
34320Sstevel@tonic-gate 	rv->rcv_firing_time = 0;
34330Sstevel@tonic-gate 
34340Sstevel@tonic-gate 	return (0);
34350Sstevel@tonic-gate }
34360Sstevel@tonic-gate 
34372267Sdp /*
34382267Sdp  * Non-global zone version of start_init.
34392267Sdp  */
34400Sstevel@tonic-gate void
zone_start_init(void)34412267Sdp zone_start_init(void)
34420Sstevel@tonic-gate {
34430Sstevel@tonic-gate 	proc_t *p = ttoproc(curthread);
34442712Snn35248 	zone_t *z = p->p_zone;
34452267Sdp 
34462267Sdp 	ASSERT(!INGLOBALZONE(curproc));
34470Sstevel@tonic-gate 
34480Sstevel@tonic-gate 	/*
34492712Snn35248 	 * For all purposes (ZONE_ATTR_INITPID and restart_init),
34502712Snn35248 	 * storing just the pid of init is sufficient.
34512712Snn35248 	 */
34522712Snn35248 	z->zone_proc_initpid = p->p_pid;
34532712Snn35248 
34542712Snn35248 	/*
34552267Sdp 	 * We maintain zone_boot_err so that we can return the cause of the
34562267Sdp 	 * failure back to the caller of the zone_boot syscall.
34570Sstevel@tonic-gate 	 */
34582267Sdp 	p->p_zone->zone_boot_err = start_init_common();
34590Sstevel@tonic-gate 
34608364SJordan.Vaughan@Sun.com 	/*
34618364SJordan.Vaughan@Sun.com 	 * We will prevent booting zones from becoming running zones if the
34628364SJordan.Vaughan@Sun.com 	 * global zone is shutting down.
34638364SJordan.Vaughan@Sun.com 	 */
34640Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
34658364SJordan.Vaughan@Sun.com 	if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
34668364SJordan.Vaughan@Sun.com 	    ZONE_IS_SHUTTING_DOWN) {
34670Sstevel@tonic-gate 		/*
34680Sstevel@tonic-gate 		 * Make sure we are still in the booting state-- we could have
34690Sstevel@tonic-gate 		 * raced and already be shutting down, or even further along.
34700Sstevel@tonic-gate 		 */
34713792Sakolb 		if (zone_status_get(z) == ZONE_IS_BOOTING) {
34722712Snn35248 			zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
34733792Sakolb 		}
34740Sstevel@tonic-gate 		mutex_exit(&zone_status_lock);
34750Sstevel@tonic-gate 		/* It's gone bad, dispose of the process */
34762712Snn35248 		if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3477390Sraf 			mutex_enter(&p->p_lock);
3478390Sraf 			ASSERT(p->p_flag & SEXITLWPS);
34790Sstevel@tonic-gate 			lwp_exit();
34800Sstevel@tonic-gate 		}
34810Sstevel@tonic-gate 	} else {
34822712Snn35248 		if (zone_status_get(z) == ZONE_IS_BOOTING)
34832712Snn35248 			zone_status_set(z, ZONE_IS_RUNNING);
34840Sstevel@tonic-gate 		mutex_exit(&zone_status_lock);
34850Sstevel@tonic-gate 		/* cause the process to return to userland. */
34860Sstevel@tonic-gate 		lwp_rtt();
34870Sstevel@tonic-gate 	}
34880Sstevel@tonic-gate }
34890Sstevel@tonic-gate 
34900Sstevel@tonic-gate struct zsched_arg {
34910Sstevel@tonic-gate 	zone_t *zone;
34920Sstevel@tonic-gate 	nvlist_t *nvlist;
34930Sstevel@tonic-gate };
34940Sstevel@tonic-gate 
34950Sstevel@tonic-gate /*
34960Sstevel@tonic-gate  * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
34970Sstevel@tonic-gate  * anything to do with scheduling, but rather with the fact that
34980Sstevel@tonic-gate  * per-zone kernel threads are parented to zsched, just like regular
34990Sstevel@tonic-gate  * kernel threads are parented to sched (p0).
35000Sstevel@tonic-gate  *
35010Sstevel@tonic-gate  * zsched is also responsible for launching init for the zone.
35020Sstevel@tonic-gate  */
35030Sstevel@tonic-gate static void
zsched(void * arg)35040Sstevel@tonic-gate zsched(void *arg)
35050Sstevel@tonic-gate {
35060Sstevel@tonic-gate 	struct zsched_arg *za = arg;
35070Sstevel@tonic-gate 	proc_t *pp = curproc;
35080Sstevel@tonic-gate 	proc_t *initp = proc_init;
35090Sstevel@tonic-gate 	zone_t *zone = za->zone;
35100Sstevel@tonic-gate 	cred_t *cr, *oldcred;
35110Sstevel@tonic-gate 	rctl_set_t *set;
35120Sstevel@tonic-gate 	rctl_alloc_gp_t *gp;
35130Sstevel@tonic-gate 	contract_t *ct = NULL;
35140Sstevel@tonic-gate 	task_t *tk, *oldtk;
35150Sstevel@tonic-gate 	rctl_entity_p_t e;
35160Sstevel@tonic-gate 	kproject_t *pj;
35170Sstevel@tonic-gate 
35180Sstevel@tonic-gate 	nvlist_t *nvl = za->nvlist;
35190Sstevel@tonic-gate 	nvpair_t *nvp = NULL;
35200Sstevel@tonic-gate 
35213446Smrj 	bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
35223446Smrj 	bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
35233446Smrj 	PTOU(pp)->u_argc = 0;
35243446Smrj 	PTOU(pp)->u_argv = NULL;
35253446Smrj 	PTOU(pp)->u_envp = NULL;
35260Sstevel@tonic-gate 	closeall(P_FINFO(pp));
35270Sstevel@tonic-gate 
35280Sstevel@tonic-gate 	/*
35290Sstevel@tonic-gate 	 * We are this zone's "zsched" process.  As the zone isn't generally
35300Sstevel@tonic-gate 	 * visible yet we don't need to grab any locks before initializing its
35310Sstevel@tonic-gate 	 * zone_proc pointer.
35320Sstevel@tonic-gate 	 */
35330Sstevel@tonic-gate 	zone_hold(zone);  /* this hold is released by zone_destroy() */
35340Sstevel@tonic-gate 	zone->zone_zsched = pp;
35350Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
35360Sstevel@tonic-gate 	pp->p_zone = zone;
35370Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
35380Sstevel@tonic-gate 
35390Sstevel@tonic-gate 	/*
35400Sstevel@tonic-gate 	 * Disassociate process from its 'parent'; parent ourselves to init
35410Sstevel@tonic-gate 	 * (pid 1) and change other values as needed.
35420Sstevel@tonic-gate 	 */
35430Sstevel@tonic-gate 	sess_create();
35440Sstevel@tonic-gate 
35450Sstevel@tonic-gate 	mutex_enter(&pidlock);
35460Sstevel@tonic-gate 	proc_detach(pp);
35470Sstevel@tonic-gate 	pp->p_ppid = 1;
35480Sstevel@tonic-gate 	pp->p_flag |= SZONETOP;
35490Sstevel@tonic-gate 	pp->p_ancpid = 1;
35500Sstevel@tonic-gate 	pp->p_parent = initp;
35510Sstevel@tonic-gate 	pp->p_psibling = NULL;
35520Sstevel@tonic-gate 	if (initp->p_child)
35530Sstevel@tonic-gate 		initp->p_child->p_psibling = pp;
35540Sstevel@tonic-gate 	pp->p_sibling = initp->p_child;
35550Sstevel@tonic-gate 	initp->p_child = pp;
35560Sstevel@tonic-gate 
35570Sstevel@tonic-gate 	/* Decrement what newproc() incremented. */
35580Sstevel@tonic-gate 	upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
35590Sstevel@tonic-gate 	/*
35600Sstevel@tonic-gate 	 * Our credentials are about to become kcred-like, so we don't care
35610Sstevel@tonic-gate 	 * about the caller's ruid.
35620Sstevel@tonic-gate 	 */
35630Sstevel@tonic-gate 	upcount_inc(crgetruid(kcred), zone->zone_id);
35640Sstevel@tonic-gate 	mutex_exit(&pidlock);
35650Sstevel@tonic-gate 
35660Sstevel@tonic-gate 	/*
356712725SMenno.Lageman@Sun.COM 	 * getting out of global zone, so decrement lwp and process counts
35680Sstevel@tonic-gate 	 */
35690Sstevel@tonic-gate 	pj = pp->p_task->tk_proj;
35700Sstevel@tonic-gate 	mutex_enter(&global_zone->zone_nlwps_lock);
35710Sstevel@tonic-gate 	pj->kpj_nlwps -= pp->p_lwpcnt;
35720Sstevel@tonic-gate 	global_zone->zone_nlwps -= pp->p_lwpcnt;
357312725SMenno.Lageman@Sun.COM 	pj->kpj_nprocs--;
357412725SMenno.Lageman@Sun.COM 	global_zone->zone_nprocs--;
35750Sstevel@tonic-gate 	mutex_exit(&global_zone->zone_nlwps_lock);
35760Sstevel@tonic-gate 
35770Sstevel@tonic-gate 	/*
35782768Ssl108498 	 * Decrement locked memory counts on old zone and project.
35792768Ssl108498 	 */
35803247Sgjelinek 	mutex_enter(&global_zone->zone_mem_lock);
35812768Ssl108498 	global_zone->zone_locked_mem -= pp->p_locked_mem;
35822768Ssl108498 	pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
35833247Sgjelinek 	mutex_exit(&global_zone->zone_mem_lock);
35842768Ssl108498 
35852768Ssl108498 	/*
35860Sstevel@tonic-gate 	 * Create and join a new task in project '0' of this zone.
35870Sstevel@tonic-gate 	 *
35880Sstevel@tonic-gate 	 * We don't need to call holdlwps() since we know we're the only lwp in
35890Sstevel@tonic-gate 	 * this process.
35900Sstevel@tonic-gate 	 *
35910Sstevel@tonic-gate 	 * task_join() returns with p_lock held.
35920Sstevel@tonic-gate 	 */
35930Sstevel@tonic-gate 	tk = task_create(0, zone);
35940Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
35950Sstevel@tonic-gate 	oldtk = task_join(tk, 0);
35962768Ssl108498 
35972768Ssl108498 	pj = pp->p_task->tk_proj;
35982768Ssl108498 
35993247Sgjelinek 	mutex_enter(&zone->zone_mem_lock);
36002768Ssl108498 	zone->zone_locked_mem += pp->p_locked_mem;
36012768Ssl108498 	pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
36023247Sgjelinek 	mutex_exit(&zone->zone_mem_lock);
36030Sstevel@tonic-gate 
36040Sstevel@tonic-gate 	/*
360512725SMenno.Lageman@Sun.COM 	 * add lwp and process counts to zsched's zone, and increment
360612725SMenno.Lageman@Sun.COM 	 * project's task and process count due to the task created in
360712725SMenno.Lageman@Sun.COM 	 * the above task_create.
36080Sstevel@tonic-gate 	 */
36090Sstevel@tonic-gate 	mutex_enter(&zone->zone_nlwps_lock);
36100Sstevel@tonic-gate 	pj->kpj_nlwps += pp->p_lwpcnt;
36110Sstevel@tonic-gate 	pj->kpj_ntasks += 1;
36120Sstevel@tonic-gate 	zone->zone_nlwps += pp->p_lwpcnt;
361312725SMenno.Lageman@Sun.COM 	pj->kpj_nprocs++;
361412725SMenno.Lageman@Sun.COM 	zone->zone_nprocs++;
36150Sstevel@tonic-gate 	mutex_exit(&zone->zone_nlwps_lock);
36160Sstevel@tonic-gate 
36172768Ssl108498 	mutex_exit(&curproc->p_lock);
36182768Ssl108498 	mutex_exit(&cpu_lock);
36192768Ssl108498 	task_rele(oldtk);
36202768Ssl108498 
36210Sstevel@tonic-gate 	/*
36220Sstevel@tonic-gate 	 * The process was created by a process in the global zone, hence the
36230Sstevel@tonic-gate 	 * credentials are wrong.  We might as well have kcred-ish credentials.
36240Sstevel@tonic-gate 	 */
36250Sstevel@tonic-gate 	cr = zone->zone_kcred;
36260Sstevel@tonic-gate 	crhold(cr);
36270Sstevel@tonic-gate 	mutex_enter(&pp->p_crlock);
36280Sstevel@tonic-gate 	oldcred = pp->p_cred;
36290Sstevel@tonic-gate 	pp->p_cred = cr;
36300Sstevel@tonic-gate 	mutex_exit(&pp->p_crlock);
36310Sstevel@tonic-gate 	crfree(oldcred);
36320Sstevel@tonic-gate 
36330Sstevel@tonic-gate 	/*
36340Sstevel@tonic-gate 	 * Hold credentials again (for thread)
36350Sstevel@tonic-gate 	 */
36360Sstevel@tonic-gate 	crhold(cr);
36370Sstevel@tonic-gate 
36380Sstevel@tonic-gate 	/*
36390Sstevel@tonic-gate 	 * p_lwpcnt can't change since this is a kernel process.
36400Sstevel@tonic-gate 	 */
36410Sstevel@tonic-gate 	crset(pp, cr);
36420Sstevel@tonic-gate 
36430Sstevel@tonic-gate 	/*
36440Sstevel@tonic-gate 	 * Chroot
36450Sstevel@tonic-gate 	 */
36460Sstevel@tonic-gate 	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
36470Sstevel@tonic-gate 	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
36480Sstevel@tonic-gate 
36490Sstevel@tonic-gate 	/*
36500Sstevel@tonic-gate 	 * Initialize zone's rctl set.
36510Sstevel@tonic-gate 	 */
36520Sstevel@tonic-gate 	set = rctl_set_create();
36530Sstevel@tonic-gate 	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
36540Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
36550Sstevel@tonic-gate 	e.rcep_p.zone = zone;
36560Sstevel@tonic-gate 	e.rcep_t = RCENTITY_ZONE;
36570Sstevel@tonic-gate 	zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
36580Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
36590Sstevel@tonic-gate 	rctl_prealloc_destroy(gp);
36600Sstevel@tonic-gate 
36610Sstevel@tonic-gate 	/*
36620Sstevel@tonic-gate 	 * Apply the rctls passed in to zone_create().  This is basically a list
36630Sstevel@tonic-gate 	 * assignment: all of the old values are removed and the new ones
36640Sstevel@tonic-gate 	 * inserted.  That is, if an empty list is passed in, all values are
36650Sstevel@tonic-gate 	 * removed.
36660Sstevel@tonic-gate 	 */
36670Sstevel@tonic-gate 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
36680Sstevel@tonic-gate 		rctl_dict_entry_t *rde;
36690Sstevel@tonic-gate 		rctl_hndl_t hndl;
36700Sstevel@tonic-gate 		char *name;
36710Sstevel@tonic-gate 		nvlist_t **nvlarray;
36720Sstevel@tonic-gate 		uint_t i, nelem;
36730Sstevel@tonic-gate 		int error;	/* For ASSERT()s */
36740Sstevel@tonic-gate 
36750Sstevel@tonic-gate 		name = nvpair_name(nvp);
36760Sstevel@tonic-gate 		hndl = rctl_hndl_lookup(name);
36770Sstevel@tonic-gate 		ASSERT(hndl != -1);
36780Sstevel@tonic-gate 		rde = rctl_dict_lookup_hndl(hndl);
36790Sstevel@tonic-gate 		ASSERT(rde != NULL);
36800Sstevel@tonic-gate 
36810Sstevel@tonic-gate 		for (; /* ever */; ) {
36820Sstevel@tonic-gate 			rctl_val_t oval;
36830Sstevel@tonic-gate 
36840Sstevel@tonic-gate 			mutex_enter(&pp->p_lock);
36850Sstevel@tonic-gate 			error = rctl_local_get(hndl, NULL, &oval, pp);
36860Sstevel@tonic-gate 			mutex_exit(&pp->p_lock);
36870Sstevel@tonic-gate 			ASSERT(error == 0);	/* Can't fail for RCTL_FIRST */
36880Sstevel@tonic-gate 			ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
36890Sstevel@tonic-gate 			if (oval.rcv_privilege == RCPRIV_SYSTEM)
36900Sstevel@tonic-gate 				break;
36910Sstevel@tonic-gate 			mutex_enter(&pp->p_lock);
36920Sstevel@tonic-gate 			error = rctl_local_delete(hndl, &oval, pp);
36930Sstevel@tonic-gate 			mutex_exit(&pp->p_lock);
36940Sstevel@tonic-gate 			ASSERT(error == 0);
36950Sstevel@tonic-gate 		}
36960Sstevel@tonic-gate 		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
36970Sstevel@tonic-gate 		ASSERT(error == 0);
36980Sstevel@tonic-gate 		for (i = 0; i < nelem; i++) {
36990Sstevel@tonic-gate 			rctl_val_t *nvalp;
37000Sstevel@tonic-gate 
37010Sstevel@tonic-gate 			nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
37020Sstevel@tonic-gate 			error = nvlist2rctlval(nvlarray[i], nvalp);
37030Sstevel@tonic-gate 			ASSERT(error == 0);
37040Sstevel@tonic-gate 			/*
37050Sstevel@tonic-gate 			 * rctl_local_insert can fail if the value being
37060Sstevel@tonic-gate 			 * inserted is a duplicate; this is OK.
37070Sstevel@tonic-gate 			 */
37080Sstevel@tonic-gate 			mutex_enter(&pp->p_lock);
37090Sstevel@tonic-gate 			if (rctl_local_insert(hndl, nvalp, pp) != 0)
37100Sstevel@tonic-gate 				kmem_cache_free(rctl_val_cache, nvalp);
37110Sstevel@tonic-gate 			mutex_exit(&pp->p_lock);
37120Sstevel@tonic-gate 		}
37130Sstevel@tonic-gate 	}
37140Sstevel@tonic-gate 	/*
37150Sstevel@tonic-gate 	 * Tell the world that we're done setting up.
37160Sstevel@tonic-gate 	 *
37175880Snordmark 	 * At this point we want to set the zone status to ZONE_IS_INITIALIZED
37180Sstevel@tonic-gate 	 * and atomically set the zone's processor set visibility.  Once
37190Sstevel@tonic-gate 	 * we drop pool_lock() this zone will automatically get updated
37200Sstevel@tonic-gate 	 * to reflect any future changes to the pools configuration.
37215880Snordmark 	 *
37225880Snordmark 	 * Note that after we drop the locks below (zonehash_lock in
37235880Snordmark 	 * particular) other operations such as a zone_getattr call can
37245880Snordmark 	 * now proceed and observe the zone. That is the reason for doing a
37255880Snordmark 	 * state transition to the INITIALIZED state.
37260Sstevel@tonic-gate 	 */
37270Sstevel@tonic-gate 	pool_lock();
37280Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
37290Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
37300Sstevel@tonic-gate 	zone_uniqid(zone);
37310Sstevel@tonic-gate 	zone_zsd_configure(zone);
37320Sstevel@tonic-gate 	if (pool_state == POOL_ENABLED)
37330Sstevel@tonic-gate 		zone_pset_set(zone, pool_default->pool_pset->pset_id);
37340Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
37350Sstevel@tonic-gate 	ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
37365880Snordmark 	zone_status_set(zone, ZONE_IS_INITIALIZED);
37370Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
37380Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
37390Sstevel@tonic-gate 	mutex_exit(&cpu_lock);
37400Sstevel@tonic-gate 	pool_unlock();
37410Sstevel@tonic-gate 
37425880Snordmark 	/* Now call the create callback for this key */
37435880Snordmark 	zsd_apply_all_keys(zsd_apply_create, zone);
37445880Snordmark 
37455880Snordmark 	/* The callbacks are complete. Mark ZONE_IS_READY */
37465880Snordmark 	mutex_enter(&zone_status_lock);
37475880Snordmark 	ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
37485880Snordmark 	zone_status_set(zone, ZONE_IS_READY);
37495880Snordmark 	mutex_exit(&zone_status_lock);
37505880Snordmark 
37510Sstevel@tonic-gate 	/*
37520Sstevel@tonic-gate 	 * Once we see the zone transition to the ZONE_IS_BOOTING state,
37530Sstevel@tonic-gate 	 * we launch init, and set the state to running.
37540Sstevel@tonic-gate 	 */
37550Sstevel@tonic-gate 	zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
37560Sstevel@tonic-gate 
37570Sstevel@tonic-gate 	if (zone_status_get(zone) == ZONE_IS_BOOTING) {
37580Sstevel@tonic-gate 		id_t cid;
37590Sstevel@tonic-gate 
37600Sstevel@tonic-gate 		/*
37610Sstevel@tonic-gate 		 * Ok, this is a little complicated.  We need to grab the
37620Sstevel@tonic-gate 		 * zone's pool's scheduling class ID; note that by now, we
37630Sstevel@tonic-gate 		 * are already bound to a pool if we need to be (zoneadmd
37640Sstevel@tonic-gate 		 * will have done that to us while we're in the READY
37650Sstevel@tonic-gate 		 * state).  *But* the scheduling class for the zone's 'init'
37660Sstevel@tonic-gate 		 * must be explicitly passed to newproc, which doesn't
37670Sstevel@tonic-gate 		 * respect pool bindings.
37680Sstevel@tonic-gate 		 *
37690Sstevel@tonic-gate 		 * We hold the pool_lock across the call to newproc() to
37700Sstevel@tonic-gate 		 * close the obvious race: the pool's scheduling class
37710Sstevel@tonic-gate 		 * could change before we manage to create the LWP with
37720Sstevel@tonic-gate 		 * classid 'cid'.
37730Sstevel@tonic-gate 		 */
37740Sstevel@tonic-gate 		pool_lock();
37753247Sgjelinek 		if (zone->zone_defaultcid > 0)
37763247Sgjelinek 			cid = zone->zone_defaultcid;
37773247Sgjelinek 		else
37783247Sgjelinek 			cid = pool_get_class(zone->zone_pool);
37790Sstevel@tonic-gate 		if (cid == -1)
37800Sstevel@tonic-gate 			cid = defaultcid;
37810Sstevel@tonic-gate 
37820Sstevel@tonic-gate 		/*
37830Sstevel@tonic-gate 		 * If this fails, zone_boot will ultimately fail.  The
37840Sstevel@tonic-gate 		 * state of the zone will be set to SHUTTING_DOWN-- userland
37850Sstevel@tonic-gate 		 * will have to tear down the zone, and fail, or try again.
37860Sstevel@tonic-gate 		 */
37872267Sdp 		if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
378811173SJonathan.Adams@Sun.COM 		    minclsyspri - 1, &ct, 0)) != 0) {
37890Sstevel@tonic-gate 			mutex_enter(&zone_status_lock);
37900Sstevel@tonic-gate 			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
37910Sstevel@tonic-gate 			mutex_exit(&zone_status_lock);
37920Sstevel@tonic-gate 		}
37930Sstevel@tonic-gate 		pool_unlock();
37940Sstevel@tonic-gate 	}
37950Sstevel@tonic-gate 
37960Sstevel@tonic-gate 	/*
37970Sstevel@tonic-gate 	 * Wait for zone_destroy() to be called.  This is what we spend
37980Sstevel@tonic-gate 	 * most of our life doing.
37990Sstevel@tonic-gate 	 */
38000Sstevel@tonic-gate 	zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
38010Sstevel@tonic-gate 
38020Sstevel@tonic-gate 	if (ct)
38030Sstevel@tonic-gate 		/*
38040Sstevel@tonic-gate 		 * At this point the process contract should be empty.
38050Sstevel@tonic-gate 		 * (Though if it isn't, it's not the end of the world.)
38060Sstevel@tonic-gate 		 */
38070Sstevel@tonic-gate 		VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
38080Sstevel@tonic-gate 
38090Sstevel@tonic-gate 	/*
38100Sstevel@tonic-gate 	 * Allow kcred to be freed when all referring processes
38110Sstevel@tonic-gate 	 * (including this one) go away.  We can't just do this in
38120Sstevel@tonic-gate 	 * zone_free because we need to wait for the zone_cred_ref to
38130Sstevel@tonic-gate 	 * drop to 0 before calling zone_free, and the existence of
38140Sstevel@tonic-gate 	 * zone_kcred will prevent that.  Thus, we call crfree here to
38150Sstevel@tonic-gate 	 * balance the crdup in zone_create.  The crhold calls earlier
38160Sstevel@tonic-gate 	 * in zsched will be dropped when the thread and process exit.
38170Sstevel@tonic-gate 	 */
38180Sstevel@tonic-gate 	crfree(zone->zone_kcred);
38190Sstevel@tonic-gate 	zone->zone_kcred = NULL;
38200Sstevel@tonic-gate 
38210Sstevel@tonic-gate 	exit(CLD_EXITED, 0);
38220Sstevel@tonic-gate }
38230Sstevel@tonic-gate 
38240Sstevel@tonic-gate /*
38250Sstevel@tonic-gate  * Helper function to determine if there are any submounts of the
38260Sstevel@tonic-gate  * provided path.  Used to make sure the zone doesn't "inherit" any
38270Sstevel@tonic-gate  * mounts from before it is created.
38280Sstevel@tonic-gate  */
38290Sstevel@tonic-gate static uint_t
zone_mount_count(const char * rootpath)38300Sstevel@tonic-gate zone_mount_count(const char *rootpath)
38310Sstevel@tonic-gate {
38320Sstevel@tonic-gate 	vfs_t *vfsp;
38330Sstevel@tonic-gate 	uint_t count = 0;
38340Sstevel@tonic-gate 	size_t rootpathlen = strlen(rootpath);
38350Sstevel@tonic-gate 
38360Sstevel@tonic-gate 	/*
38370Sstevel@tonic-gate 	 * Holding zonehash_lock prevents race conditions with
38380Sstevel@tonic-gate 	 * vfs_list_add()/vfs_list_remove() since we serialize with
38390Sstevel@tonic-gate 	 * zone_find_by_path().
38400Sstevel@tonic-gate 	 */
38410Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&zonehash_lock));
38420Sstevel@tonic-gate 	/*
38430Sstevel@tonic-gate 	 * The rootpath must end with a '/'
38440Sstevel@tonic-gate 	 */
38450Sstevel@tonic-gate 	ASSERT(rootpath[rootpathlen - 1] == '/');
38460Sstevel@tonic-gate 
38470Sstevel@tonic-gate 	/*
38480Sstevel@tonic-gate 	 * This intentionally does not count the rootpath itself if that
38490Sstevel@tonic-gate 	 * happens to be a mount point.
38500Sstevel@tonic-gate 	 */
38510Sstevel@tonic-gate 	vfs_list_read_lock();
38520Sstevel@tonic-gate 	vfsp = rootvfs;
38530Sstevel@tonic-gate 	do {
38540Sstevel@tonic-gate 		if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
38550Sstevel@tonic-gate 		    rootpathlen) == 0)
38560Sstevel@tonic-gate 			count++;
38570Sstevel@tonic-gate 		vfsp = vfsp->vfs_next;
38580Sstevel@tonic-gate 	} while (vfsp != rootvfs);
38590Sstevel@tonic-gate 	vfs_list_unlock();
38600Sstevel@tonic-gate 	return (count);
38610Sstevel@tonic-gate }
38620Sstevel@tonic-gate 
38630Sstevel@tonic-gate /*
38640Sstevel@tonic-gate  * Helper function to make sure that a zone created on 'rootpath'
38650Sstevel@tonic-gate  * wouldn't end up containing other zones' rootpaths.
38660Sstevel@tonic-gate  */
38670Sstevel@tonic-gate static boolean_t
zone_is_nested(const char * rootpath)38680Sstevel@tonic-gate zone_is_nested(const char *rootpath)
38690Sstevel@tonic-gate {
38700Sstevel@tonic-gate 	zone_t *zone;
38710Sstevel@tonic-gate 	size_t rootpathlen = strlen(rootpath);
38720Sstevel@tonic-gate 	size_t len;
38730Sstevel@tonic-gate 
38740Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&zonehash_lock));
38750Sstevel@tonic-gate 
38768799SDhanaraj.M@Sun.COM 	/*
38778799SDhanaraj.M@Sun.COM 	 * zone_set_root() appended '/' and '\0' at the end of rootpath
38788799SDhanaraj.M@Sun.COM 	 */
38798799SDhanaraj.M@Sun.COM 	if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
38808799SDhanaraj.M@Sun.COM 	    (rootpath[1] == '/') && (rootpath[2] == '\0'))
38818799SDhanaraj.M@Sun.COM 		return (B_TRUE);
38828799SDhanaraj.M@Sun.COM 
38830Sstevel@tonic-gate 	for (zone = list_head(&zone_active); zone != NULL;
38840Sstevel@tonic-gate 	    zone = list_next(&zone_active, zone)) {
38850Sstevel@tonic-gate 		if (zone == global_zone)
38860Sstevel@tonic-gate 			continue;
38870Sstevel@tonic-gate 		len = strlen(zone->zone_rootpath);
38880Sstevel@tonic-gate 		if (strncmp(rootpath, zone->zone_rootpath,
38890Sstevel@tonic-gate 		    MIN(rootpathlen, len)) == 0)
38900Sstevel@tonic-gate 			return (B_TRUE);
38910Sstevel@tonic-gate 	}
38920Sstevel@tonic-gate 	return (B_FALSE);
38930Sstevel@tonic-gate }
38940Sstevel@tonic-gate 
38950Sstevel@tonic-gate static int
zone_set_privset(zone_t * zone,const priv_set_t * zone_privs,size_t zone_privssz)3896813Sdp zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
3897813Sdp     size_t zone_privssz)
38980Sstevel@tonic-gate {
389912820Sdp@eng.sun.com 	priv_set_t *privs;
39000Sstevel@tonic-gate 
3901813Sdp 	if (zone_privssz < sizeof (priv_set_t))
390212820Sdp@eng.sun.com 		return (ENOMEM);
390312820Sdp@eng.sun.com 
390412820Sdp@eng.sun.com 	privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
3905813Sdp 
39060Sstevel@tonic-gate 	if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
39070Sstevel@tonic-gate 		kmem_free(privs, sizeof (priv_set_t));
39080Sstevel@tonic-gate 		return (EFAULT);
39090Sstevel@tonic-gate 	}
39100Sstevel@tonic-gate 
39110Sstevel@tonic-gate 	zone->zone_privset = privs;
39120Sstevel@tonic-gate 	return (0);
39130Sstevel@tonic-gate }
39140Sstevel@tonic-gate 
39150Sstevel@tonic-gate /*
39160Sstevel@tonic-gate  * We make creative use of nvlists to pass in rctls from userland.  The list is
39170Sstevel@tonic-gate  * a list of the following structures:
39180Sstevel@tonic-gate  *
39190Sstevel@tonic-gate  * (name = rctl_name, value = nvpair_list_array)
39200Sstevel@tonic-gate  *
39210Sstevel@tonic-gate  * Where each element of the nvpair_list_array is of the form:
39220Sstevel@tonic-gate  *
39230Sstevel@tonic-gate  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
39240Sstevel@tonic-gate  * 	(name = "limit", value = uint64_t),
39250Sstevel@tonic-gate  * 	(name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
39260Sstevel@tonic-gate  */
39270Sstevel@tonic-gate static int
parse_rctls(caddr_t ubuf,size_t buflen,nvlist_t ** nvlp)39280Sstevel@tonic-gate parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
39290Sstevel@tonic-gate {
39300Sstevel@tonic-gate 	nvpair_t *nvp = NULL;
39310Sstevel@tonic-gate 	nvlist_t *nvl = NULL;
39320Sstevel@tonic-gate 	char *kbuf;
39330Sstevel@tonic-gate 	int error;
39340Sstevel@tonic-gate 	rctl_val_t rv;
39350Sstevel@tonic-gate 
39360Sstevel@tonic-gate 	*nvlp = NULL;
39370Sstevel@tonic-gate 
39380Sstevel@tonic-gate 	if (buflen == 0)
39390Sstevel@tonic-gate 		return (0);
39400Sstevel@tonic-gate 
39410Sstevel@tonic-gate 	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
39420Sstevel@tonic-gate 		return (ENOMEM);
39430Sstevel@tonic-gate 	if (copyin(ubuf, kbuf, buflen)) {
39440Sstevel@tonic-gate 		error = EFAULT;
39450Sstevel@tonic-gate 		goto out;
39460Sstevel@tonic-gate 	}
39470Sstevel@tonic-gate 	if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
39480Sstevel@tonic-gate 		/*
39490Sstevel@tonic-gate 		 * nvl may have been allocated/free'd, but the value set to
39500Sstevel@tonic-gate 		 * non-NULL, so we reset it here.
39510Sstevel@tonic-gate 		 */
39520Sstevel@tonic-gate 		nvl = NULL;
39530Sstevel@tonic-gate 		error = EINVAL;
39540Sstevel@tonic-gate 		goto out;
39550Sstevel@tonic-gate 	}
39560Sstevel@tonic-gate 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
39570Sstevel@tonic-gate 		rctl_dict_entry_t *rde;
39580Sstevel@tonic-gate 		rctl_hndl_t hndl;
39590Sstevel@tonic-gate 		nvlist_t **nvlarray;
39600Sstevel@tonic-gate 		uint_t i, nelem;
39610Sstevel@tonic-gate 		char *name;
39620Sstevel@tonic-gate 
39630Sstevel@tonic-gate 		error = EINVAL;
39640Sstevel@tonic-gate 		name = nvpair_name(nvp);
39650Sstevel@tonic-gate 		if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
39660Sstevel@tonic-gate 		    != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
39670Sstevel@tonic-gate 			goto out;
39680Sstevel@tonic-gate 		}
39690Sstevel@tonic-gate 		if ((hndl = rctl_hndl_lookup(name)) == -1) {
39700Sstevel@tonic-gate 			goto out;
39710Sstevel@tonic-gate 		}
39720Sstevel@tonic-gate 		rde = rctl_dict_lookup_hndl(hndl);
39730Sstevel@tonic-gate 		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
39740Sstevel@tonic-gate 		ASSERT(error == 0);
39750Sstevel@tonic-gate 		for (i = 0; i < nelem; i++) {
39760Sstevel@tonic-gate 			if (error = nvlist2rctlval(nvlarray[i], &rv))
39770Sstevel@tonic-gate 				goto out;
39780Sstevel@tonic-gate 		}
39790Sstevel@tonic-gate 		if (rctl_invalid_value(rde, &rv)) {
39800Sstevel@tonic-gate 			error = EINVAL;
39810Sstevel@tonic-gate 			goto out;
39820Sstevel@tonic-gate 		}
39830Sstevel@tonic-gate 	}
39840Sstevel@tonic-gate 	error = 0;
39850Sstevel@tonic-gate 	*nvlp = nvl;
39860Sstevel@tonic-gate out:
39870Sstevel@tonic-gate 	kmem_free(kbuf, buflen);
39880Sstevel@tonic-gate 	if (error && nvl != NULL)
39890Sstevel@tonic-gate 		nvlist_free(nvl);
39900Sstevel@tonic-gate 	return (error);
39910Sstevel@tonic-gate }
39920Sstevel@tonic-gate 
39930Sstevel@tonic-gate int
zone_create_error(int er_error,int er_ext,int * er_out)39940Sstevel@tonic-gate zone_create_error(int er_error, int er_ext, int *er_out) {
39950Sstevel@tonic-gate 	if (er_out != NULL) {
39960Sstevel@tonic-gate 		if (copyout(&er_ext, er_out, sizeof (int))) {
39970Sstevel@tonic-gate 			return (set_errno(EFAULT));
39980Sstevel@tonic-gate 		}
39990Sstevel@tonic-gate 	}
40000Sstevel@tonic-gate 	return (set_errno(er_error));
40010Sstevel@tonic-gate }
40020Sstevel@tonic-gate 
40031676Sjpk static int
zone_set_label(zone_t * zone,const bslabel_t * lab,uint32_t doi)40041676Sjpk zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
40051676Sjpk {
40061676Sjpk 	ts_label_t *tsl;
40071676Sjpk 	bslabel_t blab;
40081676Sjpk 
40091676Sjpk 	/* Get label from user */
40101676Sjpk 	if (copyin(lab, &blab, sizeof (blab)) != 0)
40111676Sjpk 		return (EFAULT);
40121676Sjpk 	tsl = labelalloc(&blab, doi, KM_NOSLEEP);
40131676Sjpk 	if (tsl == NULL)
40141676Sjpk 		return (ENOMEM);
40151676Sjpk 
40161676Sjpk 	zone->zone_slabel = tsl;
40171676Sjpk 	return (0);
40181676Sjpk }
40191676Sjpk 
40200Sstevel@tonic-gate /*
4021789Sahrens  * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
4022789Sahrens  */
4023789Sahrens static int
parse_zfs(zone_t * zone,caddr_t ubuf,size_t buflen)4024789Sahrens parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4025789Sahrens {
4026789Sahrens 	char *kbuf;
4027789Sahrens 	char *dataset, *next;
4028789Sahrens 	zone_dataset_t *zd;
4029789Sahrens 	size_t len;
4030789Sahrens 
4031789Sahrens 	if (ubuf == NULL || buflen == 0)
4032789Sahrens 		return (0);
4033789Sahrens 
4034789Sahrens 	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4035789Sahrens 		return (ENOMEM);
4036789Sahrens 
4037789Sahrens 	if (copyin(ubuf, kbuf, buflen) != 0) {
4038789Sahrens 		kmem_free(kbuf, buflen);
4039789Sahrens 		return (EFAULT);
4040789Sahrens 	}
4041789Sahrens 
4042789Sahrens 	dataset = next = kbuf;
4043789Sahrens 	for (;;) {
4044789Sahrens 		zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4045789Sahrens 
4046789Sahrens 		next = strchr(dataset, ',');
4047789Sahrens 
4048789Sahrens 		if (next == NULL)
4049789Sahrens 			len = strlen(dataset);
4050789Sahrens 		else
4051789Sahrens 			len = next - dataset;
4052789Sahrens 
4053789Sahrens 		zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4054789Sahrens 		bcopy(dataset, zd->zd_dataset, len);
4055789Sahrens 		zd->zd_dataset[len] = '\0';
4056789Sahrens 
4057789Sahrens 		list_insert_head(&zone->zone_datasets, zd);
4058789Sahrens 
4059789Sahrens 		if (next == NULL)
4060789Sahrens 			break;
4061789Sahrens 
4062789Sahrens 		dataset = next + 1;
4063789Sahrens 	}
4064789Sahrens 
4065789Sahrens 	kmem_free(kbuf, buflen);
4066789Sahrens 	return (0);
4067789Sahrens }
4068789Sahrens 
4069789Sahrens /*
40700Sstevel@tonic-gate  * System call to create/initialize a new zone named 'zone_name', rooted
40710Sstevel@tonic-gate  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
40721676Sjpk  * and initialized with the zone-wide rctls described in 'rctlbuf', and
40731676Sjpk  * with labeling set by 'match', 'doi', and 'label'.
40740Sstevel@tonic-gate  *
40750Sstevel@tonic-gate  * If extended error is non-null, we may use it to return more detailed
40760Sstevel@tonic-gate  * error information.
40770Sstevel@tonic-gate  */
40780Sstevel@tonic-gate static zoneid_t
zone_create(const char * zone_name,const char * zone_root,const priv_set_t * zone_privs,size_t zone_privssz,caddr_t rctlbuf,size_t rctlbufsz,caddr_t zfsbuf,size_t zfsbufsz,int * extended_error,int match,uint32_t doi,const bslabel_t * label,int flags)40790Sstevel@tonic-gate zone_create(const char *zone_name, const char *zone_root,
4080813Sdp     const priv_set_t *zone_privs, size_t zone_privssz,
4081813Sdp     caddr_t rctlbuf, size_t rctlbufsz,
40821676Sjpk     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
40833448Sdh155122     int match, uint32_t doi, const bslabel_t *label,
40843448Sdh155122     int flags)
40850Sstevel@tonic-gate {
40860Sstevel@tonic-gate 	struct zsched_arg zarg;
40870Sstevel@tonic-gate 	nvlist_t *rctls = NULL;
40880Sstevel@tonic-gate 	proc_t *pp = curproc;
40890Sstevel@tonic-gate 	zone_t *zone, *ztmp;
40900Sstevel@tonic-gate 	zoneid_t zoneid;
40910Sstevel@tonic-gate 	int error;
40920Sstevel@tonic-gate 	int error2 = 0;
40930Sstevel@tonic-gate 	char *str;
40940Sstevel@tonic-gate 	cred_t *zkcr;
40951769Scarlsonj 	boolean_t insert_label_hash;
40960Sstevel@tonic-gate 
40970Sstevel@tonic-gate 	if (secpolicy_zone_config(CRED()) != 0)
40980Sstevel@tonic-gate 		return (set_errno(EPERM));
40990Sstevel@tonic-gate 
41000Sstevel@tonic-gate 	/* can't boot zone from within chroot environment */
41010Sstevel@tonic-gate 	if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
41020Sstevel@tonic-gate 		return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4103813Sdp 		    extended_error));
41040Sstevel@tonic-gate 
41050Sstevel@tonic-gate 	zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
41060Sstevel@tonic-gate 	zoneid = zone->zone_id = id_alloc(zoneid_space);
41070Sstevel@tonic-gate 	zone->zone_status = ZONE_IS_UNINITIALIZED;
41080Sstevel@tonic-gate 	zone->zone_pool = pool_default;
41090Sstevel@tonic-gate 	zone->zone_pool_mod = gethrtime();
41100Sstevel@tonic-gate 	zone->zone_psetid = ZONE_PS_INVAL;
41110Sstevel@tonic-gate 	zone->zone_ncpus = 0;
41120Sstevel@tonic-gate 	zone->zone_ncpus_online = 0;
41132712Snn35248 	zone->zone_restart_init = B_TRUE;
41142712Snn35248 	zone->zone_brand = &native_brand;
41152712Snn35248 	zone->zone_initname = NULL;
41160Sstevel@tonic-gate 	mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
41170Sstevel@tonic-gate 	mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
41183247Sgjelinek 	mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
41190Sstevel@tonic-gate 	cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4120*13096SJordan.Vaughan@Sun.com 	list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4121*13096SJordan.Vaughan@Sun.com 	    offsetof(zone_ref_t, zref_linkage));
41220Sstevel@tonic-gate 	list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
41230Sstevel@tonic-gate 	    offsetof(struct zsd_entry, zsd_linkage));
4124789Sahrens 	list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4125789Sahrens 	    offsetof(zone_dataset_t, zd_linkage));
412610616SSebastien.Roy@Sun.COM 	list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
412710616SSebastien.Roy@Sun.COM 	    offsetof(zone_dl_t, zdl_linkage));
41281676Sjpk 	rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
412910910SRobert.Harris@Sun.COM 	rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
41300Sstevel@tonic-gate 
41313448Sdh155122 	if (flags & ZCF_NET_EXCL) {
41323448Sdh155122 		zone->zone_flags |= ZF_NET_EXCL;
41333448Sdh155122 	}
41343448Sdh155122 
41350Sstevel@tonic-gate 	if ((error = zone_set_name(zone, zone_name)) != 0) {
41360Sstevel@tonic-gate 		zone_free(zone);
41370Sstevel@tonic-gate 		return (zone_create_error(error, 0, extended_error));
41380Sstevel@tonic-gate 	}
41390Sstevel@tonic-gate 
41400Sstevel@tonic-gate 	if ((error = zone_set_root(zone, zone_root)) != 0) {
41410Sstevel@tonic-gate 		zone_free(zone);
41420Sstevel@tonic-gate 		return (zone_create_error(error, 0, extended_error));
41430Sstevel@tonic-gate 	}
4144813Sdp 	if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
41450Sstevel@tonic-gate 		zone_free(zone);
41460Sstevel@tonic-gate 		return (zone_create_error(error, 0, extended_error));
41470Sstevel@tonic-gate 	}
41480Sstevel@tonic-gate 
41490Sstevel@tonic-gate 	/* initialize node name to be the same as zone name */
41500Sstevel@tonic-gate 	zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
41510Sstevel@tonic-gate 	(void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
41520Sstevel@tonic-gate 	zone->zone_nodename[_SYS_NMLN - 1] = '\0';
41530Sstevel@tonic-gate 
41540Sstevel@tonic-gate 	zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
41550Sstevel@tonic-gate 	zone->zone_domain[0] = '\0';
41568662SJordan.Vaughan@Sun.com 	zone->zone_hostid = HW_INVALID_HOSTID;
41570Sstevel@tonic-gate 	zone->zone_shares = 1;
41582677Sml93401 	zone->zone_shmmax = 0;
41592677Sml93401 	zone->zone_ipc.ipcq_shmmni = 0;
41602677Sml93401 	zone->zone_ipc.ipcq_semmni = 0;
41612677Sml93401 	zone->zone_ipc.ipcq_msgmni = 0;
41620Sstevel@tonic-gate 	zone->zone_bootargs = NULL;
416312633Sjohn.levon@sun.com 	zone->zone_fs_allowed = NULL;
41642267Sdp 	zone->zone_initname =
41652267Sdp 	    kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
41662267Sdp 	(void) strcpy(zone->zone_initname, zone_default_initname);
41673247Sgjelinek 	zone->zone_nlwps = 0;
41683247Sgjelinek 	zone->zone_nlwps_ctl = INT_MAX;
416912725SMenno.Lageman@Sun.COM 	zone->zone_nprocs = 0;
417012725SMenno.Lageman@Sun.COM 	zone->zone_nprocs_ctl = INT_MAX;
41712768Ssl108498 	zone->zone_locked_mem = 0;
41722768Ssl108498 	zone->zone_locked_mem_ctl = UINT64_MAX;
41733247Sgjelinek 	zone->zone_max_swap = 0;
41743247Sgjelinek 	zone->zone_max_swap_ctl = UINT64_MAX;
417512633Sjohn.levon@sun.com 	zone->zone_max_lofi = 0;
417612633Sjohn.levon@sun.com 	zone->zone_max_lofi_ctl = UINT64_MAX;
41773247Sgjelinek 	zone0.zone_lockedmem_kstat = NULL;
41783247Sgjelinek 	zone0.zone_swapresv_kstat = NULL;
41790Sstevel@tonic-gate 
41800Sstevel@tonic-gate 	/*
41810Sstevel@tonic-gate 	 * Zsched initializes the rctls.
41820Sstevel@tonic-gate 	 */
41830Sstevel@tonic-gate 	zone->zone_rctls = NULL;
41840Sstevel@tonic-gate 
41850Sstevel@tonic-gate 	if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
41860Sstevel@tonic-gate 		zone_free(zone);
41870Sstevel@tonic-gate 		return (zone_create_error(error, 0, extended_error));
41880Sstevel@tonic-gate 	}
41890Sstevel@tonic-gate 
4190789Sahrens 	if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4191789Sahrens 		zone_free(zone);
4192789Sahrens 		return (set_errno(error));
4193789Sahrens 	}
4194789Sahrens 
41950Sstevel@tonic-gate 	/*
41961676Sjpk 	 * Read in the trusted system parameters:
41971676Sjpk 	 * match flag and sensitivity label.
41981676Sjpk 	 */
41991676Sjpk 	zone->zone_match = match;
42001769Scarlsonj 	if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
42014462Skp158701 		/* Fail if requested to set doi to anything but system's doi */
42024462Skp158701 		if (doi != 0 && doi != default_doi) {
42034462Skp158701 			zone_free(zone);
42044462Skp158701 			return (set_errno(EINVAL));
42054462Skp158701 		}
42064462Skp158701 		/* Always apply system's doi to the zone */
42074462Skp158701 		error = zone_set_label(zone, label, default_doi);
42081676Sjpk 		if (error != 0) {
42091676Sjpk 			zone_free(zone);
42101676Sjpk 			return (set_errno(error));
42111676Sjpk 		}
42121769Scarlsonj 		insert_label_hash = B_TRUE;
42131676Sjpk 	} else {
42141676Sjpk 		/* all zones get an admin_low label if system is not labeled */
42151676Sjpk 		zone->zone_slabel = l_admin_low;
42161676Sjpk 		label_hold(l_admin_low);
42171769Scarlsonj 		insert_label_hash = B_FALSE;
42181676Sjpk 	}
42191676Sjpk 
42201676Sjpk 	/*
42210Sstevel@tonic-gate 	 * Stop all lwps since that's what normally happens as part of fork().
42220Sstevel@tonic-gate 	 * This needs to happen before we grab any locks to avoid deadlock
42230Sstevel@tonic-gate 	 * (another lwp in the process could be waiting for the held lock).
42240Sstevel@tonic-gate 	 */
42250Sstevel@tonic-gate 	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
42260Sstevel@tonic-gate 		zone_free(zone);
42270Sstevel@tonic-gate 		if (rctls)
42280Sstevel@tonic-gate 			nvlist_free(rctls);
42290Sstevel@tonic-gate 		return (zone_create_error(error, 0, extended_error));
42300Sstevel@tonic-gate 	}
42310Sstevel@tonic-gate 
42320Sstevel@tonic-gate 	if (block_mounts() == 0) {
42330Sstevel@tonic-gate 		mutex_enter(&pp->p_lock);
42340Sstevel@tonic-gate 		if (curthread != pp->p_agenttp)
42350Sstevel@tonic-gate 			continuelwps(pp);
42360Sstevel@tonic-gate 		mutex_exit(&pp->p_lock);
42370Sstevel@tonic-gate 		zone_free(zone);
42380Sstevel@tonic-gate 		if (rctls)
42390Sstevel@tonic-gate 			nvlist_free(rctls);
42400Sstevel@tonic-gate 		return (zone_create_error(error, 0, extended_error));
42410Sstevel@tonic-gate 	}
42420Sstevel@tonic-gate 
42430Sstevel@tonic-gate 	/*
42440Sstevel@tonic-gate 	 * Set up credential for kernel access.  After this, any errors
42450Sstevel@tonic-gate 	 * should go through the dance in errout rather than calling
42460Sstevel@tonic-gate 	 * zone_free directly.
42470Sstevel@tonic-gate 	 */
42480Sstevel@tonic-gate 	zone->zone_kcred = crdup(kcred);
42490Sstevel@tonic-gate 	crsetzone(zone->zone_kcred, zone);
42500Sstevel@tonic-gate 	priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
42510Sstevel@tonic-gate 	priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
42520Sstevel@tonic-gate 	priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
42530Sstevel@tonic-gate 	priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
42540Sstevel@tonic-gate 
42550Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
42560Sstevel@tonic-gate 	/*
42570Sstevel@tonic-gate 	 * Make sure zone doesn't already exist.
42581676Sjpk 	 *
42591676Sjpk 	 * If the system and zone are labeled,
42601676Sjpk 	 * make sure no other zone exists that has the same label.
42610Sstevel@tonic-gate 	 */
42621676Sjpk 	if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
42631769Scarlsonj 	    (insert_label_hash &&
42641676Sjpk 	    (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
42650Sstevel@tonic-gate 		zone_status_t status;
42660Sstevel@tonic-gate 
42670Sstevel@tonic-gate 		status = zone_status_get(ztmp);
42680Sstevel@tonic-gate 		if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
42690Sstevel@tonic-gate 			error = EEXIST;
42700Sstevel@tonic-gate 		else
42710Sstevel@tonic-gate 			error = EBUSY;
42724791Ston 
42734791Ston 		if (insert_label_hash)
42744791Ston 			error2 = ZE_LABELINUSE;
42754791Ston 
42760Sstevel@tonic-gate 		goto errout;
42770Sstevel@tonic-gate 	}
42780Sstevel@tonic-gate 
42790Sstevel@tonic-gate 	/*
42800Sstevel@tonic-gate 	 * Don't allow zone creations which would cause one zone's rootpath to
42810Sstevel@tonic-gate 	 * be accessible from that of another (non-global) zone.
42820Sstevel@tonic-gate 	 */
42830Sstevel@tonic-gate 	if (zone_is_nested(zone->zone_rootpath)) {
42840Sstevel@tonic-gate 		error = EBUSY;
42850Sstevel@tonic-gate 		goto errout;
42860Sstevel@tonic-gate 	}
42870Sstevel@tonic-gate 
42880Sstevel@tonic-gate 	ASSERT(zonecount != 0);		/* check for leaks */
42890Sstevel@tonic-gate 	if (zonecount + 1 > maxzones) {
42900Sstevel@tonic-gate 		error = ENOMEM;
42910Sstevel@tonic-gate 		goto errout;
42920Sstevel@tonic-gate 	}
42930Sstevel@tonic-gate 
42940Sstevel@tonic-gate 	if (zone_mount_count(zone->zone_rootpath) != 0) {
42950Sstevel@tonic-gate 		error = EBUSY;
42960Sstevel@tonic-gate 		error2 = ZE_AREMOUNTS;
42970Sstevel@tonic-gate 		goto errout;
42980Sstevel@tonic-gate 	}
42990Sstevel@tonic-gate 
43000Sstevel@tonic-gate 	/*
43010Sstevel@tonic-gate 	 * Zone is still incomplete, but we need to drop all locks while
43020Sstevel@tonic-gate 	 * zsched() initializes this zone's kernel process.  We
43030Sstevel@tonic-gate 	 * optimistically add the zone to the hashtable and associated
43040Sstevel@tonic-gate 	 * lists so a parallel zone_create() doesn't try to create the
43050Sstevel@tonic-gate 	 * same zone.
43060Sstevel@tonic-gate 	 */
43070Sstevel@tonic-gate 	zonecount++;
43080Sstevel@tonic-gate 	(void) mod_hash_insert(zonehashbyid,
43090Sstevel@tonic-gate 	    (mod_hash_key_t)(uintptr_t)zone->zone_id,
43100Sstevel@tonic-gate 	    (mod_hash_val_t)(uintptr_t)zone);
43110Sstevel@tonic-gate 	str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
43120Sstevel@tonic-gate 	(void) strcpy(str, zone->zone_name);
43130Sstevel@tonic-gate 	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
43140Sstevel@tonic-gate 	    (mod_hash_val_t)(uintptr_t)zone);
43151769Scarlsonj 	if (insert_label_hash) {
43161676Sjpk 		(void) mod_hash_insert(zonehashbylabel,
43171676Sjpk 		    (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
43181769Scarlsonj 		zone->zone_flags |= ZF_HASHED_LABEL;
43191676Sjpk 	}
43201676Sjpk 
43210Sstevel@tonic-gate 	/*
43220Sstevel@tonic-gate 	 * Insert into active list.  At this point there are no 'hold's
43230Sstevel@tonic-gate 	 * on the zone, but everyone else knows not to use it, so we can
43240Sstevel@tonic-gate 	 * continue to use it.  zsched() will do a zone_hold() if the
43250Sstevel@tonic-gate 	 * newproc() is successful.
43260Sstevel@tonic-gate 	 */
43270Sstevel@tonic-gate 	list_insert_tail(&zone_active, zone);
43280Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
43290Sstevel@tonic-gate 
43300Sstevel@tonic-gate 	zarg.zone = zone;
43310Sstevel@tonic-gate 	zarg.nvlist = rctls;
43320Sstevel@tonic-gate 	/*
43330Sstevel@tonic-gate 	 * The process, task, and project rctls are probably wrong;
43340Sstevel@tonic-gate 	 * we need an interface to get the default values of all rctls,
43350Sstevel@tonic-gate 	 * and initialize zsched appropriately.  I'm not sure that that
43360Sstevel@tonic-gate 	 * makes much of a difference, though.
43370Sstevel@tonic-gate 	 */
433811173SJonathan.Adams@Sun.COM 	error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
433911173SJonathan.Adams@Sun.COM 	if (error != 0) {
43400Sstevel@tonic-gate 		/*
43410Sstevel@tonic-gate 		 * We need to undo all globally visible state.
43420Sstevel@tonic-gate 		 */
43430Sstevel@tonic-gate 		mutex_enter(&zonehash_lock);
43440Sstevel@tonic-gate 		list_remove(&zone_active, zone);
43451769Scarlsonj 		if (zone->zone_flags & ZF_HASHED_LABEL) {
43461676Sjpk 			ASSERT(zone->zone_slabel != NULL);
43471676Sjpk 			(void) mod_hash_destroy(zonehashbylabel,
43481676Sjpk 			    (mod_hash_key_t)zone->zone_slabel);
43491676Sjpk 		}
43500Sstevel@tonic-gate 		(void) mod_hash_destroy(zonehashbyname,
43510Sstevel@tonic-gate 		    (mod_hash_key_t)(uintptr_t)zone->zone_name);
43520Sstevel@tonic-gate 		(void) mod_hash_destroy(zonehashbyid,
43530Sstevel@tonic-gate 		    (mod_hash_key_t)(uintptr_t)zone->zone_id);
43540Sstevel@tonic-gate 		ASSERT(zonecount > 1);
43550Sstevel@tonic-gate 		zonecount--;
43560Sstevel@tonic-gate 		goto errout;
43570Sstevel@tonic-gate 	}
43580Sstevel@tonic-gate 
43590Sstevel@tonic-gate 	/*
43600Sstevel@tonic-gate 	 * Zone creation can't fail from now on.
43610Sstevel@tonic-gate 	 */
43620Sstevel@tonic-gate 
43630Sstevel@tonic-gate 	/*
43643247Sgjelinek 	 * Create zone kstats
43653247Sgjelinek 	 */
43663247Sgjelinek 	zone_kstat_create(zone);
43673247Sgjelinek 
43683247Sgjelinek 	/*
43690Sstevel@tonic-gate 	 * Let the other lwps continue.
43700Sstevel@tonic-gate 	 */
43710Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
43720Sstevel@tonic-gate 	if (curthread != pp->p_agenttp)
43730Sstevel@tonic-gate 		continuelwps(pp);
43740Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
43750Sstevel@tonic-gate 
43760Sstevel@tonic-gate 	/*
43770Sstevel@tonic-gate 	 * Wait for zsched to finish initializing the zone.
43780Sstevel@tonic-gate 	 */
43790Sstevel@tonic-gate 	zone_status_wait(zone, ZONE_IS_READY);
43800Sstevel@tonic-gate 	/*
43810Sstevel@tonic-gate 	 * The zone is fully visible, so we can let mounts progress.
43820Sstevel@tonic-gate 	 */
43830Sstevel@tonic-gate 	resume_mounts();
43840Sstevel@tonic-gate 	if (rctls)
43850Sstevel@tonic-gate 		nvlist_free(rctls);
43860Sstevel@tonic-gate 
43870Sstevel@tonic-gate 	return (zoneid);
43880Sstevel@tonic-gate 
43890Sstevel@tonic-gate errout:
43900Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
43910Sstevel@tonic-gate 	/*
43920Sstevel@tonic-gate 	 * Let the other lwps continue.
43930Sstevel@tonic-gate 	 */
43940Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
43950Sstevel@tonic-gate 	if (curthread != pp->p_agenttp)
43960Sstevel@tonic-gate 		continuelwps(pp);
43970Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
43980Sstevel@tonic-gate 
43990Sstevel@tonic-gate 	resume_mounts();
44000Sstevel@tonic-gate 	if (rctls)
44010Sstevel@tonic-gate 		nvlist_free(rctls);
44020Sstevel@tonic-gate 	/*
44030Sstevel@tonic-gate 	 * There is currently one reference to the zone, a cred_ref from
44040Sstevel@tonic-gate 	 * zone_kcred.  To free the zone, we call crfree, which will call
44050Sstevel@tonic-gate 	 * zone_cred_rele, which will call zone_free.
44060Sstevel@tonic-gate 	 */
4407*13096SJordan.Vaughan@Sun.com 	ASSERT(zone->zone_cred_ref == 1);
44080Sstevel@tonic-gate 	ASSERT(zone->zone_kcred->cr_ref == 1);
44090Sstevel@tonic-gate 	ASSERT(zone->zone_ref == 0);
44100Sstevel@tonic-gate 	zkcr = zone->zone_kcred;
44110Sstevel@tonic-gate 	zone->zone_kcred = NULL;
44120Sstevel@tonic-gate 	crfree(zkcr);				/* triggers call to zone_free */
44130Sstevel@tonic-gate 	return (zone_create_error(error, error2, extended_error));
44140Sstevel@tonic-gate }
44150Sstevel@tonic-gate 
44160Sstevel@tonic-gate /*
44170Sstevel@tonic-gate  * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
44182267Sdp  * the heavy lifting.  initname is the path to the program to launch
44192267Sdp  * at the "top" of the zone; if this is NULL, we use the system default,
44202267Sdp  * which is stored at zone_default_initname.
44210Sstevel@tonic-gate  */
44220Sstevel@tonic-gate static int
zone_boot(zoneid_t zoneid)44232267Sdp zone_boot(zoneid_t zoneid)
44240Sstevel@tonic-gate {
44250Sstevel@tonic-gate 	int err;
44260Sstevel@tonic-gate 	zone_t *zone;
44270Sstevel@tonic-gate 
44280Sstevel@tonic-gate 	if (secpolicy_zone_config(CRED()) != 0)
44290Sstevel@tonic-gate 		return (set_errno(EPERM));
44300Sstevel@tonic-gate 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
44310Sstevel@tonic-gate 		return (set_errno(EINVAL));
44320Sstevel@tonic-gate 
44330Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
44340Sstevel@tonic-gate 	/*
44350Sstevel@tonic-gate 	 * Look for zone under hash lock to prevent races with calls to
44360Sstevel@tonic-gate 	 * zone_shutdown, zone_destroy, etc.
44370Sstevel@tonic-gate 	 */
44380Sstevel@tonic-gate 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
44390Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
44400Sstevel@tonic-gate 		return (set_errno(EINVAL));
44410Sstevel@tonic-gate 	}
44420Sstevel@tonic-gate 
44430Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
44440Sstevel@tonic-gate 	if (zone_status_get(zone) != ZONE_IS_READY) {
44450Sstevel@tonic-gate 		mutex_exit(&zone_status_lock);
44460Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
44470Sstevel@tonic-gate 		return (set_errno(EINVAL));
44480Sstevel@tonic-gate 	}
44490Sstevel@tonic-gate 	zone_status_set(zone, ZONE_IS_BOOTING);
44500Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
44510Sstevel@tonic-gate 
44520Sstevel@tonic-gate 	zone_hold(zone);	/* so we can use the zone_t later */
44530Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
44540Sstevel@tonic-gate 
44550Sstevel@tonic-gate 	if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
44560Sstevel@tonic-gate 		zone_rele(zone);
44570Sstevel@tonic-gate 		return (set_errno(EINTR));
44580Sstevel@tonic-gate 	}
44590Sstevel@tonic-gate 
44600Sstevel@tonic-gate 	/*
44610Sstevel@tonic-gate 	 * Boot (starting init) might have failed, in which case the zone
44620Sstevel@tonic-gate 	 * will go to the SHUTTING_DOWN state; an appropriate errno will
44630Sstevel@tonic-gate 	 * be placed in zone->zone_boot_err, and so we return that.
44640Sstevel@tonic-gate 	 */
44650Sstevel@tonic-gate 	err = zone->zone_boot_err;
44660Sstevel@tonic-gate 	zone_rele(zone);
44670Sstevel@tonic-gate 	return (err ? set_errno(err) : 0);
44680Sstevel@tonic-gate }
44690Sstevel@tonic-gate 
44700Sstevel@tonic-gate /*
44710Sstevel@tonic-gate  * Kills all user processes in the zone, waiting for them all to exit
44720Sstevel@tonic-gate  * before returning.
44730Sstevel@tonic-gate  */
44740Sstevel@tonic-gate static int
zone_empty(zone_t * zone)44750Sstevel@tonic-gate zone_empty(zone_t *zone)
44760Sstevel@tonic-gate {
44770Sstevel@tonic-gate 	int waitstatus;
44780Sstevel@tonic-gate 
44790Sstevel@tonic-gate 	/*
44800Sstevel@tonic-gate 	 * We need to drop zonehash_lock before killing all
44810Sstevel@tonic-gate 	 * processes, otherwise we'll deadlock with zone_find_*
44820Sstevel@tonic-gate 	 * which can be called from the exit path.
44830Sstevel@tonic-gate 	 */
44840Sstevel@tonic-gate 	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
448511066Srafael.vanoni@sun.com 	while ((waitstatus = zone_status_timedwait_sig(zone,
448611066Srafael.vanoni@sun.com 	    ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
44870Sstevel@tonic-gate 		killall(zone->zone_id);
44880Sstevel@tonic-gate 	}
44890Sstevel@tonic-gate 	/*
44900Sstevel@tonic-gate 	 * return EINTR if we were signaled
44910Sstevel@tonic-gate 	 */
44920Sstevel@tonic-gate 	if (waitstatus == 0)
44930Sstevel@tonic-gate 		return (EINTR);
44940Sstevel@tonic-gate 	return (0);
44950Sstevel@tonic-gate }
44960Sstevel@tonic-gate 
44970Sstevel@tonic-gate /*
44981676Sjpk  * This function implements the policy for zone visibility.
44991676Sjpk  *
45001676Sjpk  * In standard Solaris, a non-global zone can only see itself.
45011676Sjpk  *
45021676Sjpk  * In Trusted Extensions, a labeled zone can lookup any zone whose label
45031676Sjpk  * it dominates. For this test, the label of the global zone is treated as
45041676Sjpk  * admin_high so it is special-cased instead of being checked for dominance.
45051676Sjpk  *
45061676Sjpk  * Returns true if zone attributes are viewable, false otherwise.
45071676Sjpk  */
45081676Sjpk static boolean_t
zone_list_access(zone_t * zone)45091676Sjpk zone_list_access(zone_t *zone)
45101676Sjpk {
45111676Sjpk 
45121676Sjpk 	if (curproc->p_zone == global_zone ||
45131676Sjpk 	    curproc->p_zone == zone) {
45141676Sjpk 		return (B_TRUE);
45151769Scarlsonj 	} else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
45161676Sjpk 		bslabel_t *curproc_label;
45171676Sjpk 		bslabel_t *zone_label;
45181676Sjpk 
45191676Sjpk 		curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
45201676Sjpk 		zone_label = label2bslabel(zone->zone_slabel);
45211676Sjpk 
45221676Sjpk 		if (zone->zone_id != GLOBAL_ZONEID &&
45231676Sjpk 		    bldominates(curproc_label, zone_label)) {
45241676Sjpk 			return (B_TRUE);
45251676Sjpk 		} else {
45261676Sjpk 			return (B_FALSE);
45271676Sjpk 		}
45281676Sjpk 	} else {
45291676Sjpk 		return (B_FALSE);
45301676Sjpk 	}
45311676Sjpk }
45321676Sjpk 
45331676Sjpk /*
45340Sstevel@tonic-gate  * Systemcall to start the zone's halt sequence.  By the time this
45350Sstevel@tonic-gate  * function successfully returns, all user processes and kernel threads
45360Sstevel@tonic-gate  * executing in it will have exited, ZSD shutdown callbacks executed,
45370Sstevel@tonic-gate  * and the zone status set to ZONE_IS_DOWN.
45380Sstevel@tonic-gate  *
45390Sstevel@tonic-gate  * It is possible that the call will interrupt itself if the caller is the
45400Sstevel@tonic-gate  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
45410Sstevel@tonic-gate  */
45420Sstevel@tonic-gate static int
zone_shutdown(zoneid_t zoneid)45430Sstevel@tonic-gate zone_shutdown(zoneid_t zoneid)
45440Sstevel@tonic-gate {
45450Sstevel@tonic-gate 	int error;
45460Sstevel@tonic-gate 	zone_t *zone;
45470Sstevel@tonic-gate 	zone_status_t status;
45480Sstevel@tonic-gate 
45490Sstevel@tonic-gate 	if (secpolicy_zone_config(CRED()) != 0)
45500Sstevel@tonic-gate 		return (set_errno(EPERM));
45510Sstevel@tonic-gate 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
45520Sstevel@tonic-gate 		return (set_errno(EINVAL));
45530Sstevel@tonic-gate 
45540Sstevel@tonic-gate 	/*
45550Sstevel@tonic-gate 	 * Block mounts so that VFS_MOUNT() can get an accurate view of
45560Sstevel@tonic-gate 	 * the zone's status with regards to ZONE_IS_SHUTTING down.
45570Sstevel@tonic-gate 	 *
45580Sstevel@tonic-gate 	 * e.g. NFS can fail the mount if it determines that the zone
45590Sstevel@tonic-gate 	 * has already begun the shutdown sequence.
45600Sstevel@tonic-gate 	 */
45610Sstevel@tonic-gate 	if (block_mounts() == 0)
45620Sstevel@tonic-gate 		return (set_errno(EINTR));
45630Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
45640Sstevel@tonic-gate 	/*
45650Sstevel@tonic-gate 	 * Look for zone under hash lock to prevent races with other
45660Sstevel@tonic-gate 	 * calls to zone_shutdown and zone_destroy.
45670Sstevel@tonic-gate 	 */
45680Sstevel@tonic-gate 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
45690Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
45700Sstevel@tonic-gate 		resume_mounts();
45710Sstevel@tonic-gate 		return (set_errno(EINVAL));
45720Sstevel@tonic-gate 	}
45730Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
45740Sstevel@tonic-gate 	status = zone_status_get(zone);
45750Sstevel@tonic-gate 	/*
45760Sstevel@tonic-gate 	 * Fail if the zone isn't fully initialized yet.
45770Sstevel@tonic-gate 	 */
45780Sstevel@tonic-gate 	if (status < ZONE_IS_READY) {
45790Sstevel@tonic-gate 		mutex_exit(&zone_status_lock);
45800Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
45810Sstevel@tonic-gate 		resume_mounts();
45820Sstevel@tonic-gate 		return (set_errno(EINVAL));
45830Sstevel@tonic-gate 	}
45840Sstevel@tonic-gate 	/*
45850Sstevel@tonic-gate 	 * If conditions required for zone_shutdown() to return have been met,
45860Sstevel@tonic-gate 	 * return success.
45870Sstevel@tonic-gate 	 */
45880Sstevel@tonic-gate 	if (status >= ZONE_IS_DOWN) {
45890Sstevel@tonic-gate 		mutex_exit(&zone_status_lock);
45900Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
45910Sstevel@tonic-gate 		resume_mounts();
45920Sstevel@tonic-gate 		return (0);
45930Sstevel@tonic-gate 	}
45940Sstevel@tonic-gate 	/*
45950Sstevel@tonic-gate 	 * If zone_shutdown() hasn't been called before, go through the motions.
45960Sstevel@tonic-gate 	 * If it has, there's nothing to do but wait for the kernel threads to
45970Sstevel@tonic-gate 	 * drain.
45980Sstevel@tonic-gate 	 */
45990Sstevel@tonic-gate 	if (status < ZONE_IS_EMPTY) {
46000Sstevel@tonic-gate 		uint_t ntasks;
46010Sstevel@tonic-gate 
46020Sstevel@tonic-gate 		mutex_enter(&zone->zone_lock);
46030Sstevel@tonic-gate 		if ((ntasks = zone->zone_ntasks) != 1) {
46040Sstevel@tonic-gate 			/*
46050Sstevel@tonic-gate 			 * There's still stuff running.
46060Sstevel@tonic-gate 			 */
46070Sstevel@tonic-gate 			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
46080Sstevel@tonic-gate 		}
46090Sstevel@tonic-gate 		mutex_exit(&zone->zone_lock);
46100Sstevel@tonic-gate 		if (ntasks == 1) {
46110Sstevel@tonic-gate 			/*
46120Sstevel@tonic-gate 			 * The only way to create another task is through
46130Sstevel@tonic-gate 			 * zone_enter(), which will block until we drop
46140Sstevel@tonic-gate 			 * zonehash_lock.  The zone is empty.
46150Sstevel@tonic-gate 			 */
46160Sstevel@tonic-gate 			if (zone->zone_kthreads == NULL) {
46170Sstevel@tonic-gate 				/*
46180Sstevel@tonic-gate 				 * Skip ahead to ZONE_IS_DOWN
46190Sstevel@tonic-gate 				 */
46200Sstevel@tonic-gate 				zone_status_set(zone, ZONE_IS_DOWN);
46210Sstevel@tonic-gate 			} else {
46220Sstevel@tonic-gate 				zone_status_set(zone, ZONE_IS_EMPTY);
46230Sstevel@tonic-gate 			}
46240Sstevel@tonic-gate 		}
46250Sstevel@tonic-gate 	}
46260Sstevel@tonic-gate 	zone_hold(zone);	/* so we can use the zone_t later */
46270Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
46280Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
46290Sstevel@tonic-gate 	resume_mounts();
46300Sstevel@tonic-gate 
46310Sstevel@tonic-gate 	if (error = zone_empty(zone)) {
46320Sstevel@tonic-gate 		zone_rele(zone);
46330Sstevel@tonic-gate 		return (set_errno(error));
46340Sstevel@tonic-gate 	}
46350Sstevel@tonic-gate 	/*
46360Sstevel@tonic-gate 	 * After the zone status goes to ZONE_IS_DOWN this zone will no
46370Sstevel@tonic-gate 	 * longer be notified of changes to the pools configuration, so
46380Sstevel@tonic-gate 	 * in order to not end up with a stale pool pointer, we point
46390Sstevel@tonic-gate 	 * ourselves at the default pool and remove all resource
46400Sstevel@tonic-gate 	 * visibility.  This is especially important as the zone_t may
46410Sstevel@tonic-gate 	 * languish on the deathrow for a very long time waiting for
46420Sstevel@tonic-gate 	 * cred's to drain out.
46430Sstevel@tonic-gate 	 *
46440Sstevel@tonic-gate 	 * This rebinding of the zone can happen multiple times
46450Sstevel@tonic-gate 	 * (presumably due to interrupted or parallel systemcalls)
46460Sstevel@tonic-gate 	 * without any adverse effects.
46470Sstevel@tonic-gate 	 */
46480Sstevel@tonic-gate 	if (pool_lock_intr() != 0) {
46490Sstevel@tonic-gate 		zone_rele(zone);
46500Sstevel@tonic-gate 		return (set_errno(EINTR));
46510Sstevel@tonic-gate 	}
46520Sstevel@tonic-gate 	if (pool_state == POOL_ENABLED) {
46530Sstevel@tonic-gate 		mutex_enter(&cpu_lock);
46540Sstevel@tonic-gate 		zone_pool_set(zone, pool_default);
46550Sstevel@tonic-gate 		/*
46560Sstevel@tonic-gate 		 * The zone no longer needs to be able to see any cpus.
46570Sstevel@tonic-gate 		 */
46580Sstevel@tonic-gate 		zone_pset_set(zone, ZONE_PS_INVAL);
46590Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
46600Sstevel@tonic-gate 	}
46610Sstevel@tonic-gate 	pool_unlock();
46620Sstevel@tonic-gate 
46630Sstevel@tonic-gate 	/*
46640Sstevel@tonic-gate 	 * ZSD shutdown callbacks can be executed multiple times, hence
46650Sstevel@tonic-gate 	 * it is safe to not be holding any locks across this call.
46660Sstevel@tonic-gate 	 */
46670Sstevel@tonic-gate 	zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
46680Sstevel@tonic-gate 
46690Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
46700Sstevel@tonic-gate 	if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
46710Sstevel@tonic-gate 		zone_status_set(zone, ZONE_IS_DOWN);
46720Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
46730Sstevel@tonic-gate 
46740Sstevel@tonic-gate 	/*
46750Sstevel@tonic-gate 	 * Wait for kernel threads to drain.
46760Sstevel@tonic-gate 	 */
46770Sstevel@tonic-gate 	if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
46780Sstevel@tonic-gate 		zone_rele(zone);
46790Sstevel@tonic-gate 		return (set_errno(EINTR));
46800Sstevel@tonic-gate 	}
46812712Snn35248 
46823671Ssl108498 	/*
46833671Ssl108498 	 * Zone can be become down/destroyable even if the above wait
46843671Ssl108498 	 * returns EINTR, so any code added here may never execute.
46853671Ssl108498 	 * (i.e. don't add code here)
46863671Ssl108498 	 */
46872712Snn35248 
46880Sstevel@tonic-gate 	zone_rele(zone);
46890Sstevel@tonic-gate 	return (0);
46900Sstevel@tonic-gate }
46910Sstevel@tonic-gate 
46920Sstevel@tonic-gate /*
4693*13096SJordan.Vaughan@Sun.com  * Log the specified zone's reference counts.  The caller should not be
4694*13096SJordan.Vaughan@Sun.com  * holding the zone's zone_lock.
4695*13096SJordan.Vaughan@Sun.com  */
4696*13096SJordan.Vaughan@Sun.com static void
zone_log_refcounts(zone_t * zone)4697*13096SJordan.Vaughan@Sun.com zone_log_refcounts(zone_t *zone)
4698*13096SJordan.Vaughan@Sun.com {
4699*13096SJordan.Vaughan@Sun.com 	char *buffer;
4700*13096SJordan.Vaughan@Sun.com 	char *buffer_position;
4701*13096SJordan.Vaughan@Sun.com 	uint32_t buffer_size;
4702*13096SJordan.Vaughan@Sun.com 	uint32_t index;
4703*13096SJordan.Vaughan@Sun.com 	uint_t ref;
4704*13096SJordan.Vaughan@Sun.com 	uint_t cred_ref;
4705*13096SJordan.Vaughan@Sun.com 
4706*13096SJordan.Vaughan@Sun.com 	/*
4707*13096SJordan.Vaughan@Sun.com 	 * Construct a string representing the subsystem-specific reference
4708*13096SJordan.Vaughan@Sun.com 	 * counts.  The counts are printed in ascending order by index into the
4709*13096SJordan.Vaughan@Sun.com 	 * zone_t::zone_subsys_ref array.  The list will be surrounded by
4710*13096SJordan.Vaughan@Sun.com 	 * square brackets [] and will only contain nonzero reference counts.
4711*13096SJordan.Vaughan@Sun.com 	 *
4712*13096SJordan.Vaughan@Sun.com 	 * The buffer will hold two square bracket characters plus ten digits,
4713*13096SJordan.Vaughan@Sun.com 	 * one colon, one space, one comma, and some characters for a
4714*13096SJordan.Vaughan@Sun.com 	 * subsystem name per subsystem-specific reference count.  (Unsigned 32-
4715*13096SJordan.Vaughan@Sun.com 	 * bit integers have at most ten decimal digits.)  The last
4716*13096SJordan.Vaughan@Sun.com 	 * reference count's comma is replaced by the closing square
4717*13096SJordan.Vaughan@Sun.com 	 * bracket and a NULL character to terminate the string.
4718*13096SJordan.Vaughan@Sun.com 	 *
4719*13096SJordan.Vaughan@Sun.com 	 * NOTE: We have to grab the zone's zone_lock to create a consistent
4720*13096SJordan.Vaughan@Sun.com 	 * snapshot of the zone's reference counters.
4721*13096SJordan.Vaughan@Sun.com 	 *
4722*13096SJordan.Vaughan@Sun.com 	 * First, figure out how much space the string buffer will need.
4723*13096SJordan.Vaughan@Sun.com 	 * The buffer's size is stored in buffer_size.
4724*13096SJordan.Vaughan@Sun.com 	 */
4725*13096SJordan.Vaughan@Sun.com 	buffer_size = 2;			/* for the square brackets */
4726*13096SJordan.Vaughan@Sun.com 	mutex_enter(&zone->zone_lock);
4727*13096SJordan.Vaughan@Sun.com 	zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
4728*13096SJordan.Vaughan@Sun.com 	ref = zone->zone_ref;
4729*13096SJordan.Vaughan@Sun.com 	cred_ref = zone->zone_cred_ref;
4730*13096SJordan.Vaughan@Sun.com 	for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
4731*13096SJordan.Vaughan@Sun.com 		if (zone->zone_subsys_ref[index] != 0)
4732*13096SJordan.Vaughan@Sun.com 			buffer_size += strlen(zone_ref_subsys_names[index]) +
4733*13096SJordan.Vaughan@Sun.com 			    13;
4734*13096SJordan.Vaughan@Sun.com 	if (buffer_size == 2) {
4735*13096SJordan.Vaughan@Sun.com 		/*
4736*13096SJordan.Vaughan@Sun.com 		 * No subsystems had nonzero reference counts.  Don't bother
4737*13096SJordan.Vaughan@Sun.com 		 * with allocating a buffer; just log the general-purpose and
4738*13096SJordan.Vaughan@Sun.com 		 * credential reference counts.
4739*13096SJordan.Vaughan@Sun.com 		 */
4740*13096SJordan.Vaughan@Sun.com 		mutex_exit(&zone->zone_lock);
4741*13096SJordan.Vaughan@Sun.com 		(void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
4742*13096SJordan.Vaughan@Sun.com 		    "Zone '%s' (ID: %d) is shutting down, but %u zone "
4743*13096SJordan.Vaughan@Sun.com 		    "references and %u credential references are still extant",
4744*13096SJordan.Vaughan@Sun.com 		    zone->zone_name, zone->zone_id, ref, cred_ref);
4745*13096SJordan.Vaughan@Sun.com 		return;
4746*13096SJordan.Vaughan@Sun.com 	}
4747*13096SJordan.Vaughan@Sun.com 
4748*13096SJordan.Vaughan@Sun.com 	/*
4749*13096SJordan.Vaughan@Sun.com 	 * buffer_size contains the exact number of characters that the
4750*13096SJordan.Vaughan@Sun.com 	 * buffer will need.  Allocate the buffer and fill it with nonzero
4751*13096SJordan.Vaughan@Sun.com 	 * subsystem-specific reference counts.  Surround the results with
4752*13096SJordan.Vaughan@Sun.com 	 * square brackets afterwards.
4753*13096SJordan.Vaughan@Sun.com 	 */
4754*13096SJordan.Vaughan@Sun.com 	buffer = kmem_alloc(buffer_size, KM_SLEEP);
4755*13096SJordan.Vaughan@Sun.com 	buffer_position = &buffer[1];
4756*13096SJordan.Vaughan@Sun.com 	for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
4757*13096SJordan.Vaughan@Sun.com 		/*
4758*13096SJordan.Vaughan@Sun.com 		 * NOTE: The DDI's version of sprintf() returns a pointer to
4759*13096SJordan.Vaughan@Sun.com 		 * the modified buffer rather than the number of bytes written
4760*13096SJordan.Vaughan@Sun.com 		 * (as in snprintf(3C)).  This is unfortunate and annoying.
4761*13096SJordan.Vaughan@Sun.com 		 * Therefore, we'll use snprintf() with INT_MAX to get the
4762*13096SJordan.Vaughan@Sun.com 		 * number of bytes written.  Using INT_MAX is safe because
4763*13096SJordan.Vaughan@Sun.com 		 * the buffer is perfectly sized for the data: we'll never
4764*13096SJordan.Vaughan@Sun.com 		 * overrun the buffer.
4765*13096SJordan.Vaughan@Sun.com 		 */
4766*13096SJordan.Vaughan@Sun.com 		if (zone->zone_subsys_ref[index] != 0)
4767*13096SJordan.Vaughan@Sun.com 			buffer_position += snprintf(buffer_position, INT_MAX,
4768*13096SJordan.Vaughan@Sun.com 			    "%s: %u,", zone_ref_subsys_names[index],
4769*13096SJordan.Vaughan@Sun.com 			    zone->zone_subsys_ref[index]);
4770*13096SJordan.Vaughan@Sun.com 	}
4771*13096SJordan.Vaughan@Sun.com 	mutex_exit(&zone->zone_lock);
4772*13096SJordan.Vaughan@Sun.com 	buffer[0] = '[';
4773*13096SJordan.Vaughan@Sun.com 	ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
4774*13096SJordan.Vaughan@Sun.com 	ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
4775*13096SJordan.Vaughan@Sun.com 	buffer_position[-1] = ']';
4776*13096SJordan.Vaughan@Sun.com 
4777*13096SJordan.Vaughan@Sun.com 	/*
4778*13096SJordan.Vaughan@Sun.com 	 * Log the reference counts and free the message buffer.
4779*13096SJordan.Vaughan@Sun.com 	 */
4780*13096SJordan.Vaughan@Sun.com 	(void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
4781*13096SJordan.Vaughan@Sun.com 	    "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
4782*13096SJordan.Vaughan@Sun.com 	    "%u credential references are still extant %s", zone->zone_name,
4783*13096SJordan.Vaughan@Sun.com 	    zone->zone_id, ref, cred_ref, buffer);
4784*13096SJordan.Vaughan@Sun.com 	kmem_free(buffer, buffer_size);
4785*13096SJordan.Vaughan@Sun.com }
4786*13096SJordan.Vaughan@Sun.com 
4787*13096SJordan.Vaughan@Sun.com /*
47880Sstevel@tonic-gate  * Systemcall entry point to finalize the zone halt process.  The caller
47892677Sml93401  * must have already successfully called zone_shutdown().
47900Sstevel@tonic-gate  *
47910Sstevel@tonic-gate  * Upon successful completion, the zone will have been fully destroyed:
47920Sstevel@tonic-gate  * zsched will have exited, destructor callbacks executed, and the zone
47930Sstevel@tonic-gate  * removed from the list of active zones.
47940Sstevel@tonic-gate  */
47950Sstevel@tonic-gate static int
zone_destroy(zoneid_t zoneid)47960Sstevel@tonic-gate zone_destroy(zoneid_t zoneid)
47970Sstevel@tonic-gate {
47980Sstevel@tonic-gate 	uint64_t uniqid;
47990Sstevel@tonic-gate 	zone_t *zone;
48000Sstevel@tonic-gate 	zone_status_t status;
4801*13096SJordan.Vaughan@Sun.com 	clock_t wait_time;
4802*13096SJordan.Vaughan@Sun.com 	boolean_t log_refcounts;
48030Sstevel@tonic-gate 
48040Sstevel@tonic-gate 	if (secpolicy_zone_config(CRED()) != 0)
48050Sstevel@tonic-gate 		return (set_errno(EPERM));
48060Sstevel@tonic-gate 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
48070Sstevel@tonic-gate 		return (set_errno(EINVAL));
48080Sstevel@tonic-gate 
48090Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
48100Sstevel@tonic-gate 	/*
48110Sstevel@tonic-gate 	 * Look for zone under hash lock to prevent races with other
48120Sstevel@tonic-gate 	 * calls to zone_destroy.
48130Sstevel@tonic-gate 	 */
48140Sstevel@tonic-gate 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
48150Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
48160Sstevel@tonic-gate 		return (set_errno(EINVAL));
48170Sstevel@tonic-gate 	}
48180Sstevel@tonic-gate 
48190Sstevel@tonic-gate 	if (zone_mount_count(zone->zone_rootpath) != 0) {
48200Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
48210Sstevel@tonic-gate 		return (set_errno(EBUSY));
48220Sstevel@tonic-gate 	}
48230Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
48240Sstevel@tonic-gate 	status = zone_status_get(zone);
48250Sstevel@tonic-gate 	if (status < ZONE_IS_DOWN) {
48260Sstevel@tonic-gate 		mutex_exit(&zone_status_lock);
48270Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
48280Sstevel@tonic-gate 		return (set_errno(EBUSY));
48290Sstevel@tonic-gate 	} else if (status == ZONE_IS_DOWN) {
48300Sstevel@tonic-gate 		zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
48310Sstevel@tonic-gate 	}
48320Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
48330Sstevel@tonic-gate 	zone_hold(zone);
48340Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
48350Sstevel@tonic-gate 
48360Sstevel@tonic-gate 	/*
48370Sstevel@tonic-gate 	 * wait for zsched to exit
48380Sstevel@tonic-gate 	 */
48390Sstevel@tonic-gate 	zone_status_wait(zone, ZONE_IS_DEAD);
48400Sstevel@tonic-gate 	zone_zsd_callbacks(zone, ZSD_DESTROY);
48413448Sdh155122 	zone->zone_netstack = NULL;
48420Sstevel@tonic-gate 	uniqid = zone->zone_uniqid;
48430Sstevel@tonic-gate 	zone_rele(zone);
48440Sstevel@tonic-gate 	zone = NULL;	/* potentially free'd */
48450Sstevel@tonic-gate 
4846*13096SJordan.Vaughan@Sun.com 	log_refcounts = B_FALSE;
4847*13096SJordan.Vaughan@Sun.com 	wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
48480Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
48490Sstevel@tonic-gate 	for (; /* ever */; ) {
48500Sstevel@tonic-gate 		boolean_t unref;
4851*13096SJordan.Vaughan@Sun.com 		boolean_t refs_have_been_logged;
48520Sstevel@tonic-gate 
48530Sstevel@tonic-gate 		if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
48540Sstevel@tonic-gate 		    zone->zone_uniqid != uniqid) {
48550Sstevel@tonic-gate 			/*
48560Sstevel@tonic-gate 			 * The zone has gone away.  Necessary conditions
48570Sstevel@tonic-gate 			 * are met, so we return success.
48580Sstevel@tonic-gate 			 */
48590Sstevel@tonic-gate 			mutex_exit(&zonehash_lock);
48600Sstevel@tonic-gate 			return (0);
48610Sstevel@tonic-gate 		}
48620Sstevel@tonic-gate 		mutex_enter(&zone->zone_lock);
48630Sstevel@tonic-gate 		unref = ZONE_IS_UNREF(zone);
4864*13096SJordan.Vaughan@Sun.com 		refs_have_been_logged = (zone->zone_flags &
4865*13096SJordan.Vaughan@Sun.com 		    ZF_REFCOUNTS_LOGGED);
48660Sstevel@tonic-gate 		mutex_exit(&zone->zone_lock);
48670Sstevel@tonic-gate 		if (unref) {
48680Sstevel@tonic-gate 			/*
48690Sstevel@tonic-gate 			 * There is only one reference to the zone -- that
48700Sstevel@tonic-gate 			 * added when the zone was added to the hashtables --
48710Sstevel@tonic-gate 			 * and things will remain this way until we drop
48720Sstevel@tonic-gate 			 * zonehash_lock... we can go ahead and cleanup the
48730Sstevel@tonic-gate 			 * zone.
48740Sstevel@tonic-gate 			 */
48750Sstevel@tonic-gate 			break;
48760Sstevel@tonic-gate 		}
48770Sstevel@tonic-gate 
4878*13096SJordan.Vaughan@Sun.com 		/*
4879*13096SJordan.Vaughan@Sun.com 		 * Wait for zone_rele_common() or zone_cred_rele() to signal
4880*13096SJordan.Vaughan@Sun.com 		 * zone_destroy_cv.  zone_destroy_cv is signaled only when
4881*13096SJordan.Vaughan@Sun.com 		 * some zone's general-purpose reference count reaches one.
4882*13096SJordan.Vaughan@Sun.com 		 * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
4883*13096SJordan.Vaughan@Sun.com 		 * on zone_destroy_cv, then log the zone's reference counts and
4884*13096SJordan.Vaughan@Sun.com 		 * continue to wait for zone_rele() and zone_cred_rele().
4885*13096SJordan.Vaughan@Sun.com 		 */
4886*13096SJordan.Vaughan@Sun.com 		if (!refs_have_been_logged) {
4887*13096SJordan.Vaughan@Sun.com 			if (!log_refcounts) {
4888*13096SJordan.Vaughan@Sun.com 				/*
4889*13096SJordan.Vaughan@Sun.com 				 * This thread hasn't timed out waiting on
4890*13096SJordan.Vaughan@Sun.com 				 * zone_destroy_cv yet.  Wait wait_time clock
4891*13096SJordan.Vaughan@Sun.com 				 * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
4892*13096SJordan.Vaughan@Sun.com 				 * seconds) for the zone's references to clear.
4893*13096SJordan.Vaughan@Sun.com 				 */
4894*13096SJordan.Vaughan@Sun.com 				ASSERT(wait_time > 0);
4895*13096SJordan.Vaughan@Sun.com 				wait_time = cv_reltimedwait_sig(
4896*13096SJordan.Vaughan@Sun.com 				    &zone_destroy_cv, &zonehash_lock, wait_time,
4897*13096SJordan.Vaughan@Sun.com 				    TR_SEC);
4898*13096SJordan.Vaughan@Sun.com 				if (wait_time > 0) {
4899*13096SJordan.Vaughan@Sun.com 					/*
4900*13096SJordan.Vaughan@Sun.com 					 * A thread in zone_rele() or
4901*13096SJordan.Vaughan@Sun.com 					 * zone_cred_rele() signaled
4902*13096SJordan.Vaughan@Sun.com 					 * zone_destroy_cv before this thread's
4903*13096SJordan.Vaughan@Sun.com 					 * wait timed out.  The zone might have
4904*13096SJordan.Vaughan@Sun.com 					 * only one reference left; find out!
4905*13096SJordan.Vaughan@Sun.com 					 */
4906*13096SJordan.Vaughan@Sun.com 					continue;
4907*13096SJordan.Vaughan@Sun.com 				} else if (wait_time == 0) {
4908*13096SJordan.Vaughan@Sun.com 					/* The thread's process was signaled. */
4909*13096SJordan.Vaughan@Sun.com 					mutex_exit(&zonehash_lock);
4910*13096SJordan.Vaughan@Sun.com 					return (set_errno(EINTR));
4911*13096SJordan.Vaughan@Sun.com 				}
4912*13096SJordan.Vaughan@Sun.com 
4913*13096SJordan.Vaughan@Sun.com 				/*
4914*13096SJordan.Vaughan@Sun.com 				 * The thread timed out while waiting on
4915*13096SJordan.Vaughan@Sun.com 				 * zone_destroy_cv.  Even though the thread
4916*13096SJordan.Vaughan@Sun.com 				 * timed out, it has to check whether another
4917*13096SJordan.Vaughan@Sun.com 				 * thread woke up from zone_destroy_cv and
4918*13096SJordan.Vaughan@Sun.com 				 * destroyed the zone.
4919*13096SJordan.Vaughan@Sun.com 				 *
4920*13096SJordan.Vaughan@Sun.com 				 * If the zone still exists and has more than
4921*13096SJordan.Vaughan@Sun.com 				 * one unreleased general-purpose reference,
4922*13096SJordan.Vaughan@Sun.com 				 * then log the zone's reference counts.
4923*13096SJordan.Vaughan@Sun.com 				 */
4924*13096SJordan.Vaughan@Sun.com 				log_refcounts = B_TRUE;
4925*13096SJordan.Vaughan@Sun.com 				continue;
4926*13096SJordan.Vaughan@Sun.com 			}
4927*13096SJordan.Vaughan@Sun.com 
4928*13096SJordan.Vaughan@Sun.com 			/*
4929*13096SJordan.Vaughan@Sun.com 			 * The thread already timed out on zone_destroy_cv while
4930*13096SJordan.Vaughan@Sun.com 			 * waiting for subsystems to release the zone's last
4931*13096SJordan.Vaughan@Sun.com 			 * general-purpose references.  Log the zone's reference
4932*13096SJordan.Vaughan@Sun.com 			 * counts and wait indefinitely on zone_destroy_cv.
4933*13096SJordan.Vaughan@Sun.com 			 */
4934*13096SJordan.Vaughan@Sun.com 			zone_log_refcounts(zone);
4935*13096SJordan.Vaughan@Sun.com 		}
49360Sstevel@tonic-gate 		if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
4937*13096SJordan.Vaughan@Sun.com 			/* The thread's process was signaled. */
49380Sstevel@tonic-gate 			mutex_exit(&zonehash_lock);
49390Sstevel@tonic-gate 			return (set_errno(EINTR));
49400Sstevel@tonic-gate 		}
49410Sstevel@tonic-gate 	}
49420Sstevel@tonic-gate 
49433792Sakolb 	/*
49443792Sakolb 	 * Remove CPU cap for this zone now since we're not going to
49453792Sakolb 	 * fail below this point.
49463792Sakolb 	 */
49473792Sakolb 	cpucaps_zone_remove(zone);
49483792Sakolb 
49493792Sakolb 	/* Get rid of the zone's kstats */
49503247Sgjelinek 	zone_kstat_delete(zone);
49513247Sgjelinek 
495212273SCasper.Dik@Sun.COM 	/* remove the pfexecd doors */
495312273SCasper.Dik@Sun.COM 	if (zone->zone_pfexecd != NULL) {
495412273SCasper.Dik@Sun.COM 		klpd_freelist(&zone->zone_pfexecd);
495512273SCasper.Dik@Sun.COM 		zone->zone_pfexecd = NULL;
495612273SCasper.Dik@Sun.COM 	}
495712273SCasper.Dik@Sun.COM 
49584888Seh208807 	/* free brand specific data */
49594888Seh208807 	if (ZONE_IS_BRANDED(zone))
49604888Seh208807 		ZBROP(zone)->b_free_brand_data(zone);
49614888Seh208807 
49623671Ssl108498 	/* Say goodbye to brand framework. */
49633671Ssl108498 	brand_unregister_zone(zone->zone_brand);
49643671Ssl108498 
49650Sstevel@tonic-gate 	/*
49660Sstevel@tonic-gate 	 * It is now safe to let the zone be recreated; remove it from the
49670Sstevel@tonic-gate 	 * lists.  The memory will not be freed until the last cred
49680Sstevel@tonic-gate 	 * reference goes away.
49690Sstevel@tonic-gate 	 */
49700Sstevel@tonic-gate 	ASSERT(zonecount > 1);	/* must be > 1; can't destroy global zone */
49710Sstevel@tonic-gate 	zonecount--;
49720Sstevel@tonic-gate 	/* remove from active list and hash tables */
49730Sstevel@tonic-gate 	list_remove(&zone_active, zone);
49740Sstevel@tonic-gate 	(void) mod_hash_destroy(zonehashbyname,
49750Sstevel@tonic-gate 	    (mod_hash_key_t)zone->zone_name);
49760Sstevel@tonic-gate 	(void) mod_hash_destroy(zonehashbyid,
49770Sstevel@tonic-gate 	    (mod_hash_key_t)(uintptr_t)zone->zone_id);
49781769Scarlsonj 	if (zone->zone_flags & ZF_HASHED_LABEL)
49791676Sjpk 		(void) mod_hash_destroy(zonehashbylabel,
49801676Sjpk 		    (mod_hash_key_t)zone->zone_slabel);
49810Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
49820Sstevel@tonic-gate 
4983766Scarlsonj 	/*
4984766Scarlsonj 	 * Release the root vnode; we're not using it anymore.  Nor should any
4985766Scarlsonj 	 * other thread that might access it exist.
4986766Scarlsonj 	 */
4987766Scarlsonj 	if (zone->zone_rootvp != NULL) {
4988766Scarlsonj 		VN_RELE(zone->zone_rootvp);
4989766Scarlsonj 		zone->zone_rootvp = NULL;
4990766Scarlsonj 	}
4991766Scarlsonj 
49920Sstevel@tonic-gate 	/* add to deathrow list */
49930Sstevel@tonic-gate 	mutex_enter(&zone_deathrow_lock);
49940Sstevel@tonic-gate 	list_insert_tail(&zone_deathrow, zone);
49950Sstevel@tonic-gate 	mutex_exit(&zone_deathrow_lock);
49960Sstevel@tonic-gate 
49970Sstevel@tonic-gate 	/*
49980Sstevel@tonic-gate 	 * Drop last reference (which was added by zsched()), this will
49990Sstevel@tonic-gate 	 * free the zone unless there are outstanding cred references.
50000Sstevel@tonic-gate 	 */
50010Sstevel@tonic-gate 	zone_rele(zone);
50020Sstevel@tonic-gate 	return (0);
50030Sstevel@tonic-gate }
50040Sstevel@tonic-gate 
50050Sstevel@tonic-gate /*
50060Sstevel@tonic-gate  * Systemcall entry point for zone_getattr(2).
50070Sstevel@tonic-gate  */
50080Sstevel@tonic-gate static ssize_t
zone_getattr(zoneid_t zoneid,int attr,void * buf,size_t bufsize)50090Sstevel@tonic-gate zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
50100Sstevel@tonic-gate {
50110Sstevel@tonic-gate 	size_t size;
50120Sstevel@tonic-gate 	int error = 0, err;
50130Sstevel@tonic-gate 	zone_t *zone;
50140Sstevel@tonic-gate 	char *zonepath;
50152267Sdp 	char *outstr;
50160Sstevel@tonic-gate 	zone_status_t zone_status;
50170Sstevel@tonic-gate 	pid_t initpid;
50183792Sakolb 	boolean_t global = (curzone == global_zone);
50193792Sakolb 	boolean_t inzone = (curzone->zone_id == zoneid);
50203448Sdh155122 	ushort_t flags;
502112748SSowmini.Varadhan@oracle.COM 	zone_net_data_t *zbuf;
50220Sstevel@tonic-gate 
50230Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
50240Sstevel@tonic-gate 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
50250Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
50260Sstevel@tonic-gate 		return (set_errno(EINVAL));
50270Sstevel@tonic-gate 	}
50280Sstevel@tonic-gate 	zone_status = zone_status_get(zone);
50295880Snordmark 	if (zone_status < ZONE_IS_INITIALIZED) {
50300Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
50310Sstevel@tonic-gate 		return (set_errno(EINVAL));
50320Sstevel@tonic-gate 	}
50330Sstevel@tonic-gate 	zone_hold(zone);
50340Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
50350Sstevel@tonic-gate 
50360Sstevel@tonic-gate 	/*
50371676Sjpk 	 * If not in the global zone, don't show information about other zones,
50381676Sjpk 	 * unless the system is labeled and the local zone's label dominates
50391676Sjpk 	 * the other zone.
50400Sstevel@tonic-gate 	 */
50411676Sjpk 	if (!zone_list_access(zone)) {
50420Sstevel@tonic-gate 		zone_rele(zone);
50430Sstevel@tonic-gate 		return (set_errno(EINVAL));
50440Sstevel@tonic-gate 	}
50450Sstevel@tonic-gate 
50460Sstevel@tonic-gate 	switch (attr) {
50470Sstevel@tonic-gate 	case ZONE_ATTR_ROOT:
50480Sstevel@tonic-gate 		if (global) {
50490Sstevel@tonic-gate 			/*
50500Sstevel@tonic-gate 			 * Copy the path to trim the trailing "/" (except for
50510Sstevel@tonic-gate 			 * the global zone).
50520Sstevel@tonic-gate 			 */
50530Sstevel@tonic-gate 			if (zone != global_zone)
50540Sstevel@tonic-gate 				size = zone->zone_rootpathlen - 1;
50550Sstevel@tonic-gate 			else
50560Sstevel@tonic-gate 				size = zone->zone_rootpathlen;
50570Sstevel@tonic-gate 			zonepath = kmem_alloc(size, KM_SLEEP);
50580Sstevel@tonic-gate 			bcopy(zone->zone_rootpath, zonepath, size);
50590Sstevel@tonic-gate 			zonepath[size - 1] = '\0';
50600Sstevel@tonic-gate 		} else {
50613792Sakolb 			if (inzone || !is_system_labeled()) {
50621676Sjpk 				/*
50631676Sjpk 				 * Caller is not in the global zone.
50641676Sjpk 				 * if the query is on the current zone
50651676Sjpk 				 * or the system is not labeled,
50661676Sjpk 				 * just return faked-up path for current zone.
50671676Sjpk 				 */
50681676Sjpk 				zonepath = "/";
50691676Sjpk 				size = 2;
50701676Sjpk 			} else {
50711676Sjpk 				/*
50721676Sjpk 				 * Return related path for current zone.
50731676Sjpk 				 */
50741676Sjpk 				int prefix_len = strlen(zone_prefix);
50751676Sjpk 				int zname_len = strlen(zone->zone_name);
50761676Sjpk 
50771676Sjpk 				size = prefix_len + zname_len + 1;
50781676Sjpk 				zonepath = kmem_alloc(size, KM_SLEEP);
50791676Sjpk 				bcopy(zone_prefix, zonepath, prefix_len);
50801676Sjpk 				bcopy(zone->zone_name, zonepath +
50812267Sdp 				    prefix_len, zname_len);
50821676Sjpk 				zonepath[size - 1] = '\0';
50831676Sjpk 			}
50840Sstevel@tonic-gate 		}
50850Sstevel@tonic-gate 		if (bufsize > size)
50860Sstevel@tonic-gate 			bufsize = size;
50870Sstevel@tonic-gate 		if (buf != NULL) {
50880Sstevel@tonic-gate 			err = copyoutstr(zonepath, buf, bufsize, NULL);
50890Sstevel@tonic-gate 			if (err != 0 && err != ENAMETOOLONG)
50900Sstevel@tonic-gate 				error = EFAULT;
50910Sstevel@tonic-gate 		}
50923792Sakolb 		if (global || (is_system_labeled() && !inzone))
50930Sstevel@tonic-gate 			kmem_free(zonepath, size);
50940Sstevel@tonic-gate 		break;
50950Sstevel@tonic-gate 
50960Sstevel@tonic-gate 	case ZONE_ATTR_NAME:
50970Sstevel@tonic-gate 		size = strlen(zone->zone_name) + 1;
50980Sstevel@tonic-gate 		if (bufsize > size)
50990Sstevel@tonic-gate 			bufsize = size;
51000Sstevel@tonic-gate 		if (buf != NULL) {
51010Sstevel@tonic-gate 			err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
51020Sstevel@tonic-gate 			if (err != 0 && err != ENAMETOOLONG)
51030Sstevel@tonic-gate 				error = EFAULT;
51040Sstevel@tonic-gate 		}
51050Sstevel@tonic-gate 		break;
51060Sstevel@tonic-gate 
51070Sstevel@tonic-gate 	case ZONE_ATTR_STATUS:
51080Sstevel@tonic-gate 		/*
51090Sstevel@tonic-gate 		 * Since we're not holding zonehash_lock, the zone status
51100Sstevel@tonic-gate 		 * may be anything; leave it up to userland to sort it out.
51110Sstevel@tonic-gate 		 */
51120Sstevel@tonic-gate 		size = sizeof (zone_status);
51130Sstevel@tonic-gate 		if (bufsize > size)
51140Sstevel@tonic-gate 			bufsize = size;
51150Sstevel@tonic-gate 		zone_status = zone_status_get(zone);
51160Sstevel@tonic-gate 		if (buf != NULL &&
51170Sstevel@tonic-gate 		    copyout(&zone_status, buf, bufsize) != 0)
51180Sstevel@tonic-gate 			error = EFAULT;
51190Sstevel@tonic-gate 		break;
51203448Sdh155122 	case ZONE_ATTR_FLAGS:
51213448Sdh155122 		size = sizeof (zone->zone_flags);
51223448Sdh155122 		if (bufsize > size)
51233448Sdh155122 			bufsize = size;
51243448Sdh155122 		flags = zone->zone_flags;
51253448Sdh155122 		if (buf != NULL &&
51263448Sdh155122 		    copyout(&flags, buf, bufsize) != 0)
51273448Sdh155122 			error = EFAULT;
51283448Sdh155122 		break;
51290Sstevel@tonic-gate 	case ZONE_ATTR_PRIVSET:
51300Sstevel@tonic-gate 		size = sizeof (priv_set_t);
51310Sstevel@tonic-gate 		if (bufsize > size)
51320Sstevel@tonic-gate 			bufsize = size;
51330Sstevel@tonic-gate 		if (buf != NULL &&
51340Sstevel@tonic-gate 		    copyout(zone->zone_privset, buf, bufsize) != 0)
51350Sstevel@tonic-gate 			error = EFAULT;
51360Sstevel@tonic-gate 		break;
51370Sstevel@tonic-gate 	case ZONE_ATTR_UNIQID:
51380Sstevel@tonic-gate 		size = sizeof (zone->zone_uniqid);
51390Sstevel@tonic-gate 		if (bufsize > size)
51400Sstevel@tonic-gate 			bufsize = size;
51410Sstevel@tonic-gate 		if (buf != NULL &&
51420Sstevel@tonic-gate 		    copyout(&zone->zone_uniqid, buf, bufsize) != 0)
51430Sstevel@tonic-gate 			error = EFAULT;
51440Sstevel@tonic-gate 		break;
51450Sstevel@tonic-gate 	case ZONE_ATTR_POOLID:
51460Sstevel@tonic-gate 		{
51470Sstevel@tonic-gate 			pool_t *pool;
51480Sstevel@tonic-gate 			poolid_t poolid;
51490Sstevel@tonic-gate 
51500Sstevel@tonic-gate 			if (pool_lock_intr() != 0) {
51510Sstevel@tonic-gate 				error = EINTR;
51520Sstevel@tonic-gate 				break;
51530Sstevel@tonic-gate 			}
51540Sstevel@tonic-gate 			pool = zone_pool_get(zone);
51550Sstevel@tonic-gate 			poolid = pool->pool_id;
51560Sstevel@tonic-gate 			pool_unlock();
51570Sstevel@tonic-gate 			size = sizeof (poolid);
51580Sstevel@tonic-gate 			if (bufsize > size)
51590Sstevel@tonic-gate 				bufsize = size;
51600Sstevel@tonic-gate 			if (buf != NULL && copyout(&poolid, buf, size) != 0)
51610Sstevel@tonic-gate 				error = EFAULT;
51620Sstevel@tonic-gate 		}
51630Sstevel@tonic-gate 		break;
51641676Sjpk 	case ZONE_ATTR_SLBL:
51651676Sjpk 		size = sizeof (bslabel_t);
51661676Sjpk 		if (bufsize > size)
51671676Sjpk 			bufsize = size;
51681676Sjpk 		if (zone->zone_slabel == NULL)
51691676Sjpk 			error = EINVAL;
51701676Sjpk 		else if (buf != NULL &&
51711676Sjpk 		    copyout(label2bslabel(zone->zone_slabel), buf,
51721676Sjpk 		    bufsize) != 0)
51731676Sjpk 			error = EFAULT;
51741676Sjpk 		break;
51750Sstevel@tonic-gate 	case ZONE_ATTR_INITPID:
51760Sstevel@tonic-gate 		size = sizeof (initpid);
51770Sstevel@tonic-gate 		if (bufsize > size)
51780Sstevel@tonic-gate 			bufsize = size;
51790Sstevel@tonic-gate 		initpid = zone->zone_proc_initpid;
51800Sstevel@tonic-gate 		if (initpid == -1) {
51810Sstevel@tonic-gate 			error = ESRCH;
51820Sstevel@tonic-gate 			break;
51830Sstevel@tonic-gate 		}
51840Sstevel@tonic-gate 		if (buf != NULL &&
51850Sstevel@tonic-gate 		    copyout(&initpid, buf, bufsize) != 0)
51860Sstevel@tonic-gate 			error = EFAULT;
51870Sstevel@tonic-gate 		break;
51882712Snn35248 	case ZONE_ATTR_BRAND:
51892712Snn35248 		size = strlen(zone->zone_brand->b_name) + 1;
51902712Snn35248 
51912712Snn35248 		if (bufsize > size)
51922712Snn35248 			bufsize = size;
51932712Snn35248 		if (buf != NULL) {
51942712Snn35248 			err = copyoutstr(zone->zone_brand->b_name, buf,
51952712Snn35248 			    bufsize, NULL);
51962712Snn35248 			if (err != 0 && err != ENAMETOOLONG)
51972712Snn35248 				error = EFAULT;
51982712Snn35248 		}
51992712Snn35248 		break;
52002267Sdp 	case ZONE_ATTR_INITNAME:
52012267Sdp 		size = strlen(zone->zone_initname) + 1;
52022267Sdp 		if (bufsize > size)
52032267Sdp 			bufsize = size;
52042267Sdp 		if (buf != NULL) {
52052267Sdp 			err = copyoutstr(zone->zone_initname, buf, bufsize,
52062267Sdp 			    NULL);
52072267Sdp 			if (err != 0 && err != ENAMETOOLONG)
52082267Sdp 				error = EFAULT;
52092267Sdp 		}
52102267Sdp 		break;
52112267Sdp 	case ZONE_ATTR_BOOTARGS:
52122267Sdp 		if (zone->zone_bootargs == NULL)
52132267Sdp 			outstr = "";
52142267Sdp 		else
52152267Sdp 			outstr = zone->zone_bootargs;
52162267Sdp 		size = strlen(outstr) + 1;
52172267Sdp 		if (bufsize > size)
52182267Sdp 			bufsize = size;
52192267Sdp 		if (buf != NULL) {
52202267Sdp 			err = copyoutstr(outstr, buf, bufsize, NULL);
52212267Sdp 			if (err != 0 && err != ENAMETOOLONG)
52222267Sdp 				error = EFAULT;
52232267Sdp 		}
52242267Sdp 		break;
52253247Sgjelinek 	case ZONE_ATTR_PHYS_MCAP:
52263247Sgjelinek 		size = sizeof (zone->zone_phys_mcap);
52273247Sgjelinek 		if (bufsize > size)
52283247Sgjelinek 			bufsize = size;
52293247Sgjelinek 		if (buf != NULL &&
52303247Sgjelinek 		    copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
52313247Sgjelinek 			error = EFAULT;
52323247Sgjelinek 		break;
52333247Sgjelinek 	case ZONE_ATTR_SCHED_CLASS:
52343247Sgjelinek 		mutex_enter(&class_lock);
52353247Sgjelinek 
52363247Sgjelinek 		if (zone->zone_defaultcid >= loaded_classes)
52373247Sgjelinek 			outstr = "";
52383247Sgjelinek 		else
52393247Sgjelinek 			outstr = sclass[zone->zone_defaultcid].cl_name;
52403247Sgjelinek 		size = strlen(outstr) + 1;
52413247Sgjelinek 		if (bufsize > size)
52423247Sgjelinek 			bufsize = size;
52433247Sgjelinek 		if (buf != NULL) {
52443247Sgjelinek 			err = copyoutstr(outstr, buf, bufsize, NULL);
52453247Sgjelinek 			if (err != 0 && err != ENAMETOOLONG)
52463247Sgjelinek 				error = EFAULT;
52473247Sgjelinek 		}
52483247Sgjelinek 
52493247Sgjelinek 		mutex_exit(&class_lock);
52503247Sgjelinek 		break;
52518662SJordan.Vaughan@Sun.com 	case ZONE_ATTR_HOSTID:
52528662SJordan.Vaughan@Sun.com 		if (zone->zone_hostid != HW_INVALID_HOSTID &&
52538662SJordan.Vaughan@Sun.com 		    bufsize == sizeof (zone->zone_hostid)) {
52548662SJordan.Vaughan@Sun.com 			size = sizeof (zone->zone_hostid);
52558662SJordan.Vaughan@Sun.com 			if (buf != NULL && copyout(&zone->zone_hostid, buf,
52568662SJordan.Vaughan@Sun.com 			    bufsize) != 0)
52578662SJordan.Vaughan@Sun.com 				error = EFAULT;
52588662SJordan.Vaughan@Sun.com 		} else {
52598662SJordan.Vaughan@Sun.com 			error = EINVAL;
52608662SJordan.Vaughan@Sun.com 		}
52618662SJordan.Vaughan@Sun.com 		break;
526212633Sjohn.levon@sun.com 	case ZONE_ATTR_FS_ALLOWED:
526312633Sjohn.levon@sun.com 		if (zone->zone_fs_allowed == NULL)
526412633Sjohn.levon@sun.com 			outstr = "";
526512633Sjohn.levon@sun.com 		else
526612633Sjohn.levon@sun.com 			outstr = zone->zone_fs_allowed;
526712633Sjohn.levon@sun.com 		size = strlen(outstr) + 1;
526812633Sjohn.levon@sun.com 		if (bufsize > size)
526912633Sjohn.levon@sun.com 			bufsize = size;
527012633Sjohn.levon@sun.com 		if (buf != NULL) {
527112633Sjohn.levon@sun.com 			err = copyoutstr(outstr, buf, bufsize, NULL);
527212633Sjohn.levon@sun.com 			if (err != 0 && err != ENAMETOOLONG)
527312633Sjohn.levon@sun.com 				error = EFAULT;
527412633Sjohn.levon@sun.com 		}
527512633Sjohn.levon@sun.com 		break;
527612748SSowmini.Varadhan@oracle.COM 	case ZONE_ATTR_NETWORK:
527712748SSowmini.Varadhan@oracle.COM 		zbuf = kmem_alloc(bufsize, KM_SLEEP);
527812748SSowmini.Varadhan@oracle.COM 		if (copyin(buf, zbuf, bufsize) != 0) {
527912748SSowmini.Varadhan@oracle.COM 			error = EFAULT;
528012748SSowmini.Varadhan@oracle.COM 		} else {
528112748SSowmini.Varadhan@oracle.COM 			error = zone_get_network(zoneid, zbuf);
528212748SSowmini.Varadhan@oracle.COM 			if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
528312748SSowmini.Varadhan@oracle.COM 				error = EFAULT;
528412748SSowmini.Varadhan@oracle.COM 		}
528512748SSowmini.Varadhan@oracle.COM 		kmem_free(zbuf, bufsize);
528612748SSowmini.Varadhan@oracle.COM 		break;
52870Sstevel@tonic-gate 	default:
52882712Snn35248 		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
52892712Snn35248 			size = bufsize;
52902712Snn35248 			error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
52912712Snn35248 		} else {
52922712Snn35248 			error = EINVAL;
52932712Snn35248 		}
52940Sstevel@tonic-gate 	}
52950Sstevel@tonic-gate 	zone_rele(zone);
52960Sstevel@tonic-gate 
52970Sstevel@tonic-gate 	if (error)
52980Sstevel@tonic-gate 		return (set_errno(error));
52990Sstevel@tonic-gate 	return ((ssize_t)size);
53000Sstevel@tonic-gate }
53010Sstevel@tonic-gate 
53020Sstevel@tonic-gate /*
53032267Sdp  * Systemcall entry point for zone_setattr(2).
53042267Sdp  */
53052267Sdp /*ARGSUSED*/
53062267Sdp static int
zone_setattr(zoneid_t zoneid,int attr,void * buf,size_t bufsize)53072267Sdp zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
53082267Sdp {
53092267Sdp 	zone_t *zone;
53102267Sdp 	zone_status_t zone_status;
531112820Sdp@eng.sun.com 	int err = -1;
531212748SSowmini.Varadhan@oracle.COM 	zone_net_data_t *zbuf;
53132267Sdp 
53142267Sdp 	if (secpolicy_zone_config(CRED()) != 0)
53152267Sdp 		return (set_errno(EPERM));
53162267Sdp 
53172267Sdp 	/*
53183247Sgjelinek 	 * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
53193247Sgjelinek 	 * global zone.
53202267Sdp 	 */
53213247Sgjelinek 	if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
53222267Sdp 		return (set_errno(EINVAL));
53232267Sdp 	}
53242267Sdp 
53252267Sdp 	mutex_enter(&zonehash_lock);
53262267Sdp 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
53272267Sdp 		mutex_exit(&zonehash_lock);
53282267Sdp 		return (set_errno(EINVAL));
53292267Sdp 	}
53302267Sdp 	zone_hold(zone);
53312267Sdp 	mutex_exit(&zonehash_lock);
53322267Sdp 
53333247Sgjelinek 	/*
53343247Sgjelinek 	 * At present most attributes can only be set on non-running,
53353247Sgjelinek 	 * non-global zones.
53363247Sgjelinek 	 */
53372267Sdp 	zone_status = zone_status_get(zone);
533812820Sdp@eng.sun.com 	if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
533912820Sdp@eng.sun.com 		err = EINVAL;
53402267Sdp 		goto done;
534112820Sdp@eng.sun.com 	}
53422267Sdp 
53432267Sdp 	switch (attr) {
53442267Sdp 	case ZONE_ATTR_INITNAME:
53452267Sdp 		err = zone_set_initname(zone, (const char *)buf);
53462267Sdp 		break;
53472267Sdp 	case ZONE_ATTR_BOOTARGS:
53482267Sdp 		err = zone_set_bootargs(zone, (const char *)buf);
53492267Sdp 		break;
53502712Snn35248 	case ZONE_ATTR_BRAND:
53514141Sedp 		err = zone_set_brand(zone, (const char *)buf);
53522712Snn35248 		break;
535312633Sjohn.levon@sun.com 	case ZONE_ATTR_FS_ALLOWED:
535412633Sjohn.levon@sun.com 		err = zone_set_fs_allowed(zone, (const char *)buf);
535512633Sjohn.levon@sun.com 		break;
53563247Sgjelinek 	case ZONE_ATTR_PHYS_MCAP:
53573247Sgjelinek 		err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
53583247Sgjelinek 		break;
53593247Sgjelinek 	case ZONE_ATTR_SCHED_CLASS:
53603247Sgjelinek 		err = zone_set_sched_class(zone, (const char *)buf);
53613247Sgjelinek 		break;
53628662SJordan.Vaughan@Sun.com 	case ZONE_ATTR_HOSTID:
53638662SJordan.Vaughan@Sun.com 		if (bufsize == sizeof (zone->zone_hostid)) {
53648662SJordan.Vaughan@Sun.com 			if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
53658662SJordan.Vaughan@Sun.com 				err = 0;
53668662SJordan.Vaughan@Sun.com 			else
53678662SJordan.Vaughan@Sun.com 				err = EFAULT;
53688662SJordan.Vaughan@Sun.com 		} else {
53698662SJordan.Vaughan@Sun.com 			err = EINVAL;
53708662SJordan.Vaughan@Sun.com 		}
53718662SJordan.Vaughan@Sun.com 		break;
537212748SSowmini.Varadhan@oracle.COM 	case ZONE_ATTR_NETWORK:
537312748SSowmini.Varadhan@oracle.COM 		if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
537412748SSowmini.Varadhan@oracle.COM 			err = EINVAL;
537512820Sdp@eng.sun.com 			break;
537612748SSowmini.Varadhan@oracle.COM 		}
537712748SSowmini.Varadhan@oracle.COM 		zbuf = kmem_alloc(bufsize, KM_SLEEP);
537812748SSowmini.Varadhan@oracle.COM 		if (copyin(buf, zbuf, bufsize) != 0) {
537912820Sdp@eng.sun.com 			kmem_free(zbuf, bufsize);
538012748SSowmini.Varadhan@oracle.COM 			err = EFAULT;
538112820Sdp@eng.sun.com 			break;
538212748SSowmini.Varadhan@oracle.COM 		}
538312748SSowmini.Varadhan@oracle.COM 		err = zone_set_network(zoneid, zbuf);
538412748SSowmini.Varadhan@oracle.COM 		kmem_free(zbuf, bufsize);
538512748SSowmini.Varadhan@oracle.COM 		break;
53862267Sdp 	default:
53872712Snn35248 		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
53882712Snn35248 			err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
53892712Snn35248 		else
53902712Snn35248 			err = EINVAL;
53912267Sdp 	}
53922267Sdp 
53932267Sdp done:
53942267Sdp 	zone_rele(zone);
539512820Sdp@eng.sun.com 	ASSERT(err != -1);
53962267Sdp 	return (err != 0 ? set_errno(err) : 0);
53972267Sdp }
53982267Sdp 
53992267Sdp /*
54000Sstevel@tonic-gate  * Return zero if the process has at least one vnode mapped in to its
54010Sstevel@tonic-gate  * address space which shouldn't be allowed to change zones.
54023247Sgjelinek  *
54033247Sgjelinek  * Also return zero if the process has any shared mappings which reserve
54043247Sgjelinek  * swap.  This is because the counting for zone.max-swap does not allow swap
54055331Samw  * reservation to be shared between zones.  zone swap reservation is counted
54063247Sgjelinek  * on zone->zone_max_swap.
54070Sstevel@tonic-gate  */
54080Sstevel@tonic-gate static int
as_can_change_zones(void)54090Sstevel@tonic-gate as_can_change_zones(void)
54100Sstevel@tonic-gate {
54110Sstevel@tonic-gate 	proc_t *pp = curproc;
54120Sstevel@tonic-gate 	struct seg *seg;
54130Sstevel@tonic-gate 	struct as *as = pp->p_as;
54140Sstevel@tonic-gate 	vnode_t *vp;
54150Sstevel@tonic-gate 	int allow = 1;
54160Sstevel@tonic-gate 
54170Sstevel@tonic-gate 	ASSERT(pp->p_as != &kas);
54183247Sgjelinek 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
54190Sstevel@tonic-gate 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
54203247Sgjelinek 
54213247Sgjelinek 		/*
54223247Sgjelinek 		 * Cannot enter zone with shared anon memory which
54233247Sgjelinek 		 * reserves swap.  See comment above.
54243247Sgjelinek 		 */
54253247Sgjelinek 		if (seg_can_change_zones(seg) == B_FALSE) {
54263247Sgjelinek 			allow = 0;
54273247Sgjelinek 			break;
54283247Sgjelinek 		}
54290Sstevel@tonic-gate 		/*
54300Sstevel@tonic-gate 		 * if we can't get a backing vnode for this segment then skip
54310Sstevel@tonic-gate 		 * it.
54320Sstevel@tonic-gate 		 */
54330Sstevel@tonic-gate 		vp = NULL;
54340Sstevel@tonic-gate 		if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
54350Sstevel@tonic-gate 			continue;
54360Sstevel@tonic-gate 		if (!vn_can_change_zones(vp)) { /* bail on first match */
54370Sstevel@tonic-gate 			allow = 0;
54380Sstevel@tonic-gate 			break;
54390Sstevel@tonic-gate 		}
54400Sstevel@tonic-gate 	}
54413247Sgjelinek 	AS_LOCK_EXIT(as, &as->a_lock);
54420Sstevel@tonic-gate 	return (allow);
54430Sstevel@tonic-gate }
54440Sstevel@tonic-gate 
54450Sstevel@tonic-gate /*
54463247Sgjelinek  * Count swap reserved by curproc's address space
54473247Sgjelinek  */
54483247Sgjelinek static size_t
as_swresv(void)54493247Sgjelinek as_swresv(void)
54503247Sgjelinek {
54513247Sgjelinek 	proc_t *pp = curproc;
54523247Sgjelinek 	struct seg *seg;
54533247Sgjelinek 	struct as *as = pp->p_as;
54543247Sgjelinek 	size_t swap = 0;
54553247Sgjelinek 
54563247Sgjelinek 	ASSERT(pp->p_as != &kas);
54573247Sgjelinek 	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
54583247Sgjelinek 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
54593247Sgjelinek 		swap += seg_swresv(seg);
54603247Sgjelinek 
54613247Sgjelinek 	return (swap);
54623247Sgjelinek }
54633247Sgjelinek 
54643247Sgjelinek /*
54650Sstevel@tonic-gate  * Systemcall entry point for zone_enter().
54660Sstevel@tonic-gate  *
54670Sstevel@tonic-gate  * The current process is injected into said zone.  In the process
54680Sstevel@tonic-gate  * it will change its project membership, privileges, rootdir/cwd,
54690Sstevel@tonic-gate  * zone-wide rctls, and pool association to match those of the zone.
54700Sstevel@tonic-gate  *
54710Sstevel@tonic-gate  * The first zone_enter() called while the zone is in the ZONE_IS_READY
54720Sstevel@tonic-gate  * state will transition it to ZONE_IS_RUNNING.  Processes may only
54730Sstevel@tonic-gate  * enter a zone that is "ready" or "running".
54740Sstevel@tonic-gate  */
54750Sstevel@tonic-gate static int
zone_enter(zoneid_t zoneid)54760Sstevel@tonic-gate zone_enter(zoneid_t zoneid)
54770Sstevel@tonic-gate {
54780Sstevel@tonic-gate 	zone_t *zone;
54790Sstevel@tonic-gate 	vnode_t *vp;
54800Sstevel@tonic-gate 	proc_t *pp = curproc;
54810Sstevel@tonic-gate 	contract_t *ct;
54820Sstevel@tonic-gate 	cont_process_t *ctp;
54830Sstevel@tonic-gate 	task_t *tk, *oldtk;
54840Sstevel@tonic-gate 	kproject_t *zone_proj0;
54850Sstevel@tonic-gate 	cred_t *cr, *newcr;
54860Sstevel@tonic-gate 	pool_t *oldpool, *newpool;
54870Sstevel@tonic-gate 	sess_t *sp;
54880Sstevel@tonic-gate 	uid_t uid;
54890Sstevel@tonic-gate 	zone_status_t status;
54900Sstevel@tonic-gate 	int err = 0;
54910Sstevel@tonic-gate 	rctl_entity_p_t e;
54923247Sgjelinek 	size_t swap;
54933792Sakolb 	kthread_id_t t;
54940Sstevel@tonic-gate 
54950Sstevel@tonic-gate 	if (secpolicy_zone_config(CRED()) != 0)
54960Sstevel@tonic-gate 		return (set_errno(EPERM));
54970Sstevel@tonic-gate 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
54980Sstevel@tonic-gate 		return (set_errno(EINVAL));
54990Sstevel@tonic-gate 
55000Sstevel@tonic-gate 	/*
55010Sstevel@tonic-gate 	 * Stop all lwps so we don't need to hold a lock to look at
55020Sstevel@tonic-gate 	 * curproc->p_zone.  This needs to happen before we grab any
55030Sstevel@tonic-gate 	 * locks to avoid deadlock (another lwp in the process could
55040Sstevel@tonic-gate 	 * be waiting for the held lock).
55050Sstevel@tonic-gate 	 */
55060Sstevel@tonic-gate 	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
55070Sstevel@tonic-gate 		return (set_errno(EINTR));
55080Sstevel@tonic-gate 
55090Sstevel@tonic-gate 	/*
55100Sstevel@tonic-gate 	 * Make sure we're not changing zones with files open or mapped in
55110Sstevel@tonic-gate 	 * to our address space which shouldn't be changing zones.
55120Sstevel@tonic-gate 	 */
55130Sstevel@tonic-gate 	if (!files_can_change_zones()) {
55140Sstevel@tonic-gate 		err = EBADF;
55150Sstevel@tonic-gate 		goto out;
55160Sstevel@tonic-gate 	}
55170Sstevel@tonic-gate 	if (!as_can_change_zones()) {
55180Sstevel@tonic-gate 		err = EFAULT;
55190Sstevel@tonic-gate 		goto out;
55200Sstevel@tonic-gate 	}
55210Sstevel@tonic-gate 
55220Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
55230Sstevel@tonic-gate 	if (pp->p_zone != global_zone) {
55240Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
55250Sstevel@tonic-gate 		err = EINVAL;
55260Sstevel@tonic-gate 		goto out;
55270Sstevel@tonic-gate 	}
55280Sstevel@tonic-gate 
55290Sstevel@tonic-gate 	zone = zone_find_all_by_id(zoneid);
55300Sstevel@tonic-gate 	if (zone == NULL) {
55310Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
55320Sstevel@tonic-gate 		err = EINVAL;
55330Sstevel@tonic-gate 		goto out;
55340Sstevel@tonic-gate 	}
55350Sstevel@tonic-gate 
55360Sstevel@tonic-gate 	/*
55370Sstevel@tonic-gate 	 * To prevent processes in a zone from holding contracts on
55380Sstevel@tonic-gate 	 * extrazonal resources, and to avoid process contract
55390Sstevel@tonic-gate 	 * memberships which span zones, contract holders and processes
55400Sstevel@tonic-gate 	 * which aren't the sole members of their encapsulating process
55410Sstevel@tonic-gate 	 * contracts are not allowed to zone_enter.
55420Sstevel@tonic-gate 	 */
55430Sstevel@tonic-gate 	ctp = pp->p_ct_process;
55440Sstevel@tonic-gate 	ct = &ctp->conp_contract;
55450Sstevel@tonic-gate 	mutex_enter(&ct->ct_lock);
55460Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
55470Sstevel@tonic-gate 	if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
55480Sstevel@tonic-gate 		mutex_exit(&pp->p_lock);
55490Sstevel@tonic-gate 		mutex_exit(&ct->ct_lock);
55500Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
55510Sstevel@tonic-gate 		err = EINVAL;
55520Sstevel@tonic-gate 		goto out;
55530Sstevel@tonic-gate 	}
55540Sstevel@tonic-gate 
55550Sstevel@tonic-gate 	/*
55560Sstevel@tonic-gate 	 * Moreover, we don't allow processes whose encapsulating
55570Sstevel@tonic-gate 	 * process contracts have inherited extrazonal contracts.
55580Sstevel@tonic-gate 	 * While it would be easier to eliminate all process contracts
55590Sstevel@tonic-gate 	 * with inherited contracts, we need to be able to give a
55600Sstevel@tonic-gate 	 * restarted init (or other zone-penetrating process) its
55610Sstevel@tonic-gate 	 * predecessor's contracts.
55620Sstevel@tonic-gate 	 */
55630Sstevel@tonic-gate 	if (ctp->conp_ninherited != 0) {
55640Sstevel@tonic-gate 		contract_t *next;
55650Sstevel@tonic-gate 		for (next = list_head(&ctp->conp_inherited); next;
55660Sstevel@tonic-gate 		    next = list_next(&ctp->conp_inherited, next)) {
55670Sstevel@tonic-gate 			if (contract_getzuniqid(next) != zone->zone_uniqid) {
55680Sstevel@tonic-gate 				mutex_exit(&pp->p_lock);
55690Sstevel@tonic-gate 				mutex_exit(&ct->ct_lock);
55700Sstevel@tonic-gate 				mutex_exit(&zonehash_lock);
55710Sstevel@tonic-gate 				err = EINVAL;
55720Sstevel@tonic-gate 				goto out;
55730Sstevel@tonic-gate 			}
55740Sstevel@tonic-gate 		}
55750Sstevel@tonic-gate 	}
55766073Sacruz 
55770Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
55780Sstevel@tonic-gate 	mutex_exit(&ct->ct_lock);
55790Sstevel@tonic-gate 
55800Sstevel@tonic-gate 	status = zone_status_get(zone);
55810Sstevel@tonic-gate 	if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
55820Sstevel@tonic-gate 		/*
55830Sstevel@tonic-gate 		 * Can't join
55840Sstevel@tonic-gate 		 */
55850Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
55860Sstevel@tonic-gate 		err = EINVAL;
55870Sstevel@tonic-gate 		goto out;
55880Sstevel@tonic-gate 	}
55890Sstevel@tonic-gate 
55900Sstevel@tonic-gate 	/*
55910Sstevel@tonic-gate 	 * Make sure new priv set is within the permitted set for caller
55920Sstevel@tonic-gate 	 */
55930Sstevel@tonic-gate 	if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
55940Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
55950Sstevel@tonic-gate 		err = EPERM;
55960Sstevel@tonic-gate 		goto out;
55970Sstevel@tonic-gate 	}
55980Sstevel@tonic-gate 	/*
55990Sstevel@tonic-gate 	 * We want to momentarily drop zonehash_lock while we optimistically
56000Sstevel@tonic-gate 	 * bind curproc to the pool it should be running in.  This is safe
56010Sstevel@tonic-gate 	 * since the zone can't disappear (we have a hold on it).
56020Sstevel@tonic-gate 	 */
56030Sstevel@tonic-gate 	zone_hold(zone);
56040Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
56050Sstevel@tonic-gate 
56060Sstevel@tonic-gate 	/*
56070Sstevel@tonic-gate 	 * Grab pool_lock to keep the pools configuration from changing
56080Sstevel@tonic-gate 	 * and to stop ourselves from getting rebound to another pool
56090Sstevel@tonic-gate 	 * until we join the zone.
56100Sstevel@tonic-gate 	 */
56110Sstevel@tonic-gate 	if (pool_lock_intr() != 0) {
56120Sstevel@tonic-gate 		zone_rele(zone);
56130Sstevel@tonic-gate 		err = EINTR;
56140Sstevel@tonic-gate 		goto out;
56150Sstevel@tonic-gate 	}
56160Sstevel@tonic-gate 	ASSERT(secpolicy_pool(CRED()) == 0);
56170Sstevel@tonic-gate 	/*
56180Sstevel@tonic-gate 	 * Bind ourselves to the pool currently associated with the zone.
56190Sstevel@tonic-gate 	 */
56200Sstevel@tonic-gate 	oldpool = curproc->p_pool;
56210Sstevel@tonic-gate 	newpool = zone_pool_get(zone);
56220Sstevel@tonic-gate 	if (pool_state == POOL_ENABLED && newpool != oldpool &&
56230Sstevel@tonic-gate 	    (err = pool_do_bind(newpool, P_PID, P_MYID,
56240Sstevel@tonic-gate 	    POOL_BIND_ALL)) != 0) {
56250Sstevel@tonic-gate 		pool_unlock();
56260Sstevel@tonic-gate 		zone_rele(zone);
56270Sstevel@tonic-gate 		goto out;
56280Sstevel@tonic-gate 	}
56290Sstevel@tonic-gate 
56300Sstevel@tonic-gate 	/*
56310Sstevel@tonic-gate 	 * Grab cpu_lock now; we'll need it later when we call
56320Sstevel@tonic-gate 	 * task_join().
56330Sstevel@tonic-gate 	 */
56340Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
56350Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
56360Sstevel@tonic-gate 	/*
56370Sstevel@tonic-gate 	 * Make sure the zone hasn't moved on since we dropped zonehash_lock.
56380Sstevel@tonic-gate 	 */
56390Sstevel@tonic-gate 	if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
56400Sstevel@tonic-gate 		/*
56410Sstevel@tonic-gate 		 * Can't join anymore.
56420Sstevel@tonic-gate 		 */
56430Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
56440Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
56450Sstevel@tonic-gate 		if (pool_state == POOL_ENABLED &&
56460Sstevel@tonic-gate 		    newpool != oldpool)
56470Sstevel@tonic-gate 			(void) pool_do_bind(oldpool, P_PID, P_MYID,
56480Sstevel@tonic-gate 			    POOL_BIND_ALL);
56490Sstevel@tonic-gate 		pool_unlock();
56500Sstevel@tonic-gate 		zone_rele(zone);
56510Sstevel@tonic-gate 		err = EINVAL;
56520Sstevel@tonic-gate 		goto out;
56530Sstevel@tonic-gate 	}
56540Sstevel@tonic-gate 
56553247Sgjelinek 	/*
56563247Sgjelinek 	 * a_lock must be held while transferring locked memory and swap
56573247Sgjelinek 	 * reservation from the global zone to the non global zone because
56583247Sgjelinek 	 * asynchronous faults on the processes' address space can lock
56593247Sgjelinek 	 * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
56603247Sgjelinek 	 * segments respectively.
56613247Sgjelinek 	 */
56623247Sgjelinek 	AS_LOCK_ENTER(pp->as, &pp->p_as->a_lock, RW_WRITER);
56633247Sgjelinek 	swap = as_swresv();
56640Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
56650Sstevel@tonic-gate 	zone_proj0 = zone->zone_zsched->p_task->tk_proj;
56660Sstevel@tonic-gate 	/* verify that we do not exceed any task or lwp limits */
56670Sstevel@tonic-gate 	mutex_enter(&zone->zone_nlwps_lock);
56680Sstevel@tonic-gate 	/* add new lwps to zone and zone's proj0 */
56690Sstevel@tonic-gate 	zone_proj0->kpj_nlwps += pp->p_lwpcnt;
56700Sstevel@tonic-gate 	zone->zone_nlwps += pp->p_lwpcnt;
56710Sstevel@tonic-gate 	/* add 1 task to zone's proj0 */
56720Sstevel@tonic-gate 	zone_proj0->kpj_ntasks += 1;
567312725SMenno.Lageman@Sun.COM 
567412725SMenno.Lageman@Sun.COM 	zone_proj0->kpj_nprocs++;
567512725SMenno.Lageman@Sun.COM 	zone->zone_nprocs++;
56760Sstevel@tonic-gate 	mutex_exit(&zone->zone_nlwps_lock);
56770Sstevel@tonic-gate 
56783247Sgjelinek 	mutex_enter(&zone->zone_mem_lock);
56792768Ssl108498 	zone->zone_locked_mem += pp->p_locked_mem;
56802768Ssl108498 	zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
56813247Sgjelinek 	zone->zone_max_swap += swap;
56823247Sgjelinek 	mutex_exit(&zone->zone_mem_lock);
56832768Ssl108498 
56843916Skrishna 	mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
56853916Skrishna 	zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
56863916Skrishna 	mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
56873916Skrishna 
568812725SMenno.Lageman@Sun.COM 	/* remove lwps and process from proc's old zone and old project */
56890Sstevel@tonic-gate 	mutex_enter(&pp->p_zone->zone_nlwps_lock);
56900Sstevel@tonic-gate 	pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
56910Sstevel@tonic-gate 	pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
569212725SMenno.Lageman@Sun.COM 	pp->p_task->tk_proj->kpj_nprocs--;
569312725SMenno.Lageman@Sun.COM 	pp->p_zone->zone_nprocs--;
56940Sstevel@tonic-gate 	mutex_exit(&pp->p_zone->zone_nlwps_lock);
56950Sstevel@tonic-gate 
56963247Sgjelinek 	mutex_enter(&pp->p_zone->zone_mem_lock);
56972768Ssl108498 	pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
56982768Ssl108498 	pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
56993247Sgjelinek 	pp->p_zone->zone_max_swap -= swap;
57003247Sgjelinek 	mutex_exit(&pp->p_zone->zone_mem_lock);
57012768Ssl108498 
57023916Skrishna 	mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
57033916Skrishna 	pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
57043916Skrishna 	mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
57053916Skrishna 
57069121SVamsi.Krishna@Sun.COM 	pp->p_flag |= SZONETOP;
57079121SVamsi.Krishna@Sun.COM 	pp->p_zone = zone;
57082768Ssl108498 	mutex_exit(&pp->p_lock);
57093247Sgjelinek 	AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock);
57102768Ssl108498 
57110Sstevel@tonic-gate 	/*
57120Sstevel@tonic-gate 	 * Joining the zone cannot fail from now on.
57130Sstevel@tonic-gate 	 *
57140Sstevel@tonic-gate 	 * This means that a lot of the following code can be commonized and
57150Sstevel@tonic-gate 	 * shared with zsched().
57160Sstevel@tonic-gate 	 */
57170Sstevel@tonic-gate 
57180Sstevel@tonic-gate 	/*
57196073Sacruz 	 * If the process contract fmri was inherited, we need to
57206073Sacruz 	 * flag this so that any contract status will not leak
57216073Sacruz 	 * extra zone information, svc_fmri in this case
57226073Sacruz 	 */
57236073Sacruz 	if (ctp->conp_svc_ctid != ct->ct_id) {
57246073Sacruz 		mutex_enter(&ct->ct_lock);
57256073Sacruz 		ctp->conp_svc_zone_enter = ct->ct_id;
57266073Sacruz 		mutex_exit(&ct->ct_lock);
57276073Sacruz 	}
57286073Sacruz 
57296073Sacruz 	/*
57300Sstevel@tonic-gate 	 * Reset the encapsulating process contract's zone.
57310Sstevel@tonic-gate 	 */
57320Sstevel@tonic-gate 	ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
57330Sstevel@tonic-gate 	contract_setzuniqid(ct, zone->zone_uniqid);
57340Sstevel@tonic-gate 
57350Sstevel@tonic-gate 	/*
57360Sstevel@tonic-gate 	 * Create a new task and associate the process with the project keyed
57370Sstevel@tonic-gate 	 * by (projid,zoneid).
57380Sstevel@tonic-gate 	 *
57390Sstevel@tonic-gate 	 * We might as well be in project 0; the global zone's projid doesn't
57400Sstevel@tonic-gate 	 * make much sense in a zone anyhow.
57410Sstevel@tonic-gate 	 *
57420Sstevel@tonic-gate 	 * This also increments zone_ntasks, and returns with p_lock held.
57430Sstevel@tonic-gate 	 */
57440Sstevel@tonic-gate 	tk = task_create(0, zone);
57450Sstevel@tonic-gate 	oldtk = task_join(tk, 0);
57460Sstevel@tonic-gate 	mutex_exit(&cpu_lock);
57470Sstevel@tonic-gate 
57480Sstevel@tonic-gate 	/*
57490Sstevel@tonic-gate 	 * call RCTLOP_SET functions on this proc
57500Sstevel@tonic-gate 	 */
57510Sstevel@tonic-gate 	e.rcep_p.zone = zone;
57520Sstevel@tonic-gate 	e.rcep_t = RCENTITY_ZONE;
57530Sstevel@tonic-gate 	(void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
57540Sstevel@tonic-gate 	    RCD_CALLBACK);
57550Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
57560Sstevel@tonic-gate 
57570Sstevel@tonic-gate 	/*
57580Sstevel@tonic-gate 	 * We don't need to hold any of zsched's locks here; not only do we know
57590Sstevel@tonic-gate 	 * the process and zone aren't going away, we know its session isn't
57600Sstevel@tonic-gate 	 * changing either.
57610Sstevel@tonic-gate 	 *
57620Sstevel@tonic-gate 	 * By joining zsched's session here, we mimic the behavior in the
57630Sstevel@tonic-gate 	 * global zone of init's sid being the pid of sched.  We extend this
57640Sstevel@tonic-gate 	 * to all zlogin-like zone_enter()'ing processes as well.
57650Sstevel@tonic-gate 	 */
57660Sstevel@tonic-gate 	mutex_enter(&pidlock);
57670Sstevel@tonic-gate 	sp = zone->zone_zsched->p_sessp;
57682712Snn35248 	sess_hold(zone->zone_zsched);
57690Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
57700Sstevel@tonic-gate 	pgexit(pp);
57712712Snn35248 	sess_rele(pp->p_sessp, B_TRUE);
57720Sstevel@tonic-gate 	pp->p_sessp = sp;
57730Sstevel@tonic-gate 	pgjoin(pp, zone->zone_zsched->p_pidp);
57743247Sgjelinek 
57753247Sgjelinek 	/*
57763792Sakolb 	 * If any threads are scheduled to be placed on zone wait queue they
57773792Sakolb 	 * should abandon the idea since the wait queue is changing.
57783792Sakolb 	 * We need to be holding pidlock & p_lock to do this.
57793792Sakolb 	 */
57803792Sakolb 	if ((t = pp->p_tlist) != NULL) {
57813792Sakolb 		do {
57823792Sakolb 			thread_lock(t);
57833792Sakolb 			/*
57843792Sakolb 			 * Kick this thread so that he doesn't sit
57853792Sakolb 			 * on a wrong wait queue.
57863792Sakolb 			 */
57873792Sakolb 			if (ISWAITING(t))
57883792Sakolb 				setrun_locked(t);
57893792Sakolb 
57903792Sakolb 			if (t->t_schedflag & TS_ANYWAITQ)
57913792Sakolb 				t->t_schedflag &= ~ TS_ANYWAITQ;
57923792Sakolb 
57933792Sakolb 			thread_unlock(t);
57943792Sakolb 		} while ((t = t->t_forw) != pp->p_tlist);
57953792Sakolb 	}
57963792Sakolb 
57973792Sakolb 	/*
57983247Sgjelinek 	 * If there is a default scheduling class for the zone and it is not
57993247Sgjelinek 	 * the class we are currently in, change all of the threads in the
58003247Sgjelinek 	 * process to the new class.  We need to be holding pidlock & p_lock
58013247Sgjelinek 	 * when we call parmsset so this is a good place to do it.
58023247Sgjelinek 	 */
58033247Sgjelinek 	if (zone->zone_defaultcid > 0 &&
58043247Sgjelinek 	    zone->zone_defaultcid != curthread->t_cid) {
58053247Sgjelinek 		pcparms_t pcparms;
58063247Sgjelinek 
58073247Sgjelinek 		pcparms.pc_cid = zone->zone_defaultcid;
58083247Sgjelinek 		pcparms.pc_clparms[0] = 0;
58093247Sgjelinek 
58103247Sgjelinek 		/*
58113247Sgjelinek 		 * If setting the class fails, we still want to enter the zone.
58123247Sgjelinek 		 */
58133247Sgjelinek 		if ((t = pp->p_tlist) != NULL) {
58143247Sgjelinek 			do {
58153247Sgjelinek 				(void) parmsset(&pcparms, t);
58163247Sgjelinek 			} while ((t = t->t_forw) != pp->p_tlist);
58173247Sgjelinek 		}
58183247Sgjelinek 	}
58193247Sgjelinek 
58200Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
58210Sstevel@tonic-gate 	mutex_exit(&pidlock);
58220Sstevel@tonic-gate 
58230Sstevel@tonic-gate 	mutex_exit(&zonehash_lock);
58240Sstevel@tonic-gate 	/*
58250Sstevel@tonic-gate 	 * We're firmly in the zone; let pools progress.
58260Sstevel@tonic-gate 	 */
58270Sstevel@tonic-gate 	pool_unlock();
58280Sstevel@tonic-gate 	task_rele(oldtk);
58290Sstevel@tonic-gate 	/*
58300Sstevel@tonic-gate 	 * We don't need to retain a hold on the zone since we already
58310Sstevel@tonic-gate 	 * incremented zone_ntasks, so the zone isn't going anywhere.
58320Sstevel@tonic-gate 	 */
58330Sstevel@tonic-gate 	zone_rele(zone);
58340Sstevel@tonic-gate 
58350Sstevel@tonic-gate 	/*
58360Sstevel@tonic-gate 	 * Chroot
58370Sstevel@tonic-gate 	 */
58380Sstevel@tonic-gate 	vp = zone->zone_rootvp;
58390Sstevel@tonic-gate 	zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
58400Sstevel@tonic-gate 	zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
58410Sstevel@tonic-gate 
58420Sstevel@tonic-gate 	/*
58430Sstevel@tonic-gate 	 * Change process credentials
58440Sstevel@tonic-gate 	 */
58450Sstevel@tonic-gate 	newcr = cralloc();
58460Sstevel@tonic-gate 	mutex_enter(&pp->p_crlock);
58470Sstevel@tonic-gate 	cr = pp->p_cred;
58480Sstevel@tonic-gate 	crcopy_to(cr, newcr);
58490Sstevel@tonic-gate 	crsetzone(newcr, zone);
58500Sstevel@tonic-gate 	pp->p_cred = newcr;
58510Sstevel@tonic-gate 
58520Sstevel@tonic-gate 	/*
58530Sstevel@tonic-gate 	 * Restrict all process privilege sets to zone limit
58540Sstevel@tonic-gate 	 */
58550Sstevel@tonic-gate 	priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
58560Sstevel@tonic-gate 	priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
58570Sstevel@tonic-gate 	priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
58580Sstevel@tonic-gate 	priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
58590Sstevel@tonic-gate 	mutex_exit(&pp->p_crlock);
58600Sstevel@tonic-gate 	crset(pp, newcr);
58610Sstevel@tonic-gate 
58620Sstevel@tonic-gate 	/*
58630Sstevel@tonic-gate 	 * Adjust upcount to reflect zone entry.
58640Sstevel@tonic-gate 	 */
58650Sstevel@tonic-gate 	uid = crgetruid(newcr);
58660Sstevel@tonic-gate 	mutex_enter(&pidlock);
58670Sstevel@tonic-gate 	upcount_dec(uid, GLOBAL_ZONEID);
58680Sstevel@tonic-gate 	upcount_inc(uid, zoneid);
58690Sstevel@tonic-gate 	mutex_exit(&pidlock);
58700Sstevel@tonic-gate 
58710Sstevel@tonic-gate 	/*
58720Sstevel@tonic-gate 	 * Set up core file path and content.
58730Sstevel@tonic-gate 	 */
58740Sstevel@tonic-gate 	set_core_defaults();
58750Sstevel@tonic-gate 
58760Sstevel@tonic-gate out:
58770Sstevel@tonic-gate 	/*
58780Sstevel@tonic-gate 	 * Let the other lwps continue.
58790Sstevel@tonic-gate 	 */
58800Sstevel@tonic-gate 	mutex_enter(&pp->p_lock);
58810Sstevel@tonic-gate 	if (curthread != pp->p_agenttp)
58820Sstevel@tonic-gate 		continuelwps(pp);
58830Sstevel@tonic-gate 	mutex_exit(&pp->p_lock);
58840Sstevel@tonic-gate 
58850Sstevel@tonic-gate 	return (err != 0 ? set_errno(err) : 0);
58860Sstevel@tonic-gate }
58870Sstevel@tonic-gate 
58880Sstevel@tonic-gate /*
58890Sstevel@tonic-gate  * Systemcall entry point for zone_list(2).
58900Sstevel@tonic-gate  *
58910Sstevel@tonic-gate  * Processes running in a (non-global) zone only see themselves.
58921676Sjpk  * On labeled systems, they see all zones whose label they dominate.
58930Sstevel@tonic-gate  */
58940Sstevel@tonic-gate static int
zone_list(zoneid_t * zoneidlist,uint_t * numzones)58950Sstevel@tonic-gate zone_list(zoneid_t *zoneidlist, uint_t *numzones)
58960Sstevel@tonic-gate {
58970Sstevel@tonic-gate 	zoneid_t *zoneids;
58981769Scarlsonj 	zone_t *zone, *myzone;
58990Sstevel@tonic-gate 	uint_t user_nzones, real_nzones;
59001676Sjpk 	uint_t domi_nzones;
59011676Sjpk 	int error;
59020Sstevel@tonic-gate 
59030Sstevel@tonic-gate 	if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
59040Sstevel@tonic-gate 		return (set_errno(EFAULT));
59050Sstevel@tonic-gate 
59061769Scarlsonj 	myzone = curproc->p_zone;
59071769Scarlsonj 	if (myzone != global_zone) {
59081676Sjpk 		bslabel_t *mybslab;
59091676Sjpk 
59101676Sjpk 		if (!is_system_labeled()) {
59111676Sjpk 			/* just return current zone */
59121676Sjpk 			real_nzones = domi_nzones = 1;
59131676Sjpk 			zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
59141769Scarlsonj 			zoneids[0] = myzone->zone_id;
59151676Sjpk 		} else {
59161676Sjpk 			/* return all zones that are dominated */
59171676Sjpk 			mutex_enter(&zonehash_lock);
59181676Sjpk 			real_nzones = zonecount;
59191676Sjpk 			domi_nzones = 0;
59201676Sjpk 			if (real_nzones > 0) {
59211676Sjpk 				zoneids = kmem_alloc(real_nzones *
59221676Sjpk 				    sizeof (zoneid_t), KM_SLEEP);
59231769Scarlsonj 				mybslab = label2bslabel(myzone->zone_slabel);
59241676Sjpk 				for (zone = list_head(&zone_active);
59251676Sjpk 				    zone != NULL;
59261676Sjpk 				    zone = list_next(&zone_active, zone)) {
59271676Sjpk 					if (zone->zone_id == GLOBAL_ZONEID)
59281676Sjpk 						continue;
59291769Scarlsonj 					if (zone != myzone &&
59301769Scarlsonj 					    (zone->zone_flags & ZF_IS_SCRATCH))
59311769Scarlsonj 						continue;
59321769Scarlsonj 					/*
59331769Scarlsonj 					 * Note that a label always dominates
59341769Scarlsonj 					 * itself, so myzone is always included
59351769Scarlsonj 					 * in the list.
59361769Scarlsonj 					 */
59371676Sjpk 					if (bldominates(mybslab,
59381676Sjpk 					    label2bslabel(zone->zone_slabel))) {
59391676Sjpk 						zoneids[domi_nzones++] =
59401676Sjpk 						    zone->zone_id;
59411676Sjpk 					}
59421676Sjpk 				}
59431676Sjpk 			}
59441676Sjpk 			mutex_exit(&zonehash_lock);
59451676Sjpk 		}
59460Sstevel@tonic-gate 	} else {
59470Sstevel@tonic-gate 		mutex_enter(&zonehash_lock);
59480Sstevel@tonic-gate 		real_nzones = zonecount;
59491676Sjpk 		domi_nzones = 0;
59501676Sjpk 		if (real_nzones > 0) {
59510Sstevel@tonic-gate 			zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
59520Sstevel@tonic-gate 			    KM_SLEEP);
59530Sstevel@tonic-gate 			for (zone = list_head(&zone_active); zone != NULL;
59540Sstevel@tonic-gate 			    zone = list_next(&zone_active, zone))
59551676Sjpk 				zoneids[domi_nzones++] = zone->zone_id;
59561676Sjpk 			ASSERT(domi_nzones == real_nzones);
59570Sstevel@tonic-gate 		}
59580Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
59590Sstevel@tonic-gate 	}
59600Sstevel@tonic-gate 
59611676Sjpk 	/*
59621676Sjpk 	 * If user has allocated space for fewer entries than we found, then
59631676Sjpk 	 * return only up to his limit.  Either way, tell him exactly how many
59641676Sjpk 	 * we found.
59651676Sjpk 	 */
59661676Sjpk 	if (domi_nzones < user_nzones)
59671676Sjpk 		user_nzones = domi_nzones;
59681676Sjpk 	error = 0;
59691676Sjpk 	if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
59700Sstevel@tonic-gate 		error = EFAULT;
59711676Sjpk 	} else if (zoneidlist != NULL && user_nzones != 0) {
59720Sstevel@tonic-gate 		if (copyout(zoneids, zoneidlist,
59730Sstevel@tonic-gate 		    user_nzones * sizeof (zoneid_t)) != 0)
59740Sstevel@tonic-gate 			error = EFAULT;
59750Sstevel@tonic-gate 	}
59760Sstevel@tonic-gate 
59771676Sjpk 	if (real_nzones > 0)
59780Sstevel@tonic-gate 		kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
59790Sstevel@tonic-gate 
59801676Sjpk 	if (error != 0)
59810Sstevel@tonic-gate 		return (set_errno(error));
59820Sstevel@tonic-gate 	else
59830Sstevel@tonic-gate 		return (0);
59840Sstevel@tonic-gate }
59850Sstevel@tonic-gate 
59860Sstevel@tonic-gate /*
59870Sstevel@tonic-gate  * Systemcall entry point for zone_lookup(2).
59880Sstevel@tonic-gate  *
59891676Sjpk  * Non-global zones are only able to see themselves and (on labeled systems)
59901676Sjpk  * the zones they dominate.
59910Sstevel@tonic-gate  */
59920Sstevel@tonic-gate static zoneid_t
zone_lookup(const char * zone_name)59930Sstevel@tonic-gate zone_lookup(const char *zone_name)
59940Sstevel@tonic-gate {
59950Sstevel@tonic-gate 	char *kname;
59960Sstevel@tonic-gate 	zone_t *zone;
59970Sstevel@tonic-gate 	zoneid_t zoneid;
59980Sstevel@tonic-gate 	int err;
59990Sstevel@tonic-gate 
60000Sstevel@tonic-gate 	if (zone_name == NULL) {
60010Sstevel@tonic-gate 		/* return caller's zone id */
60020Sstevel@tonic-gate 		return (getzoneid());
60030Sstevel@tonic-gate 	}
60040Sstevel@tonic-gate 
60050Sstevel@tonic-gate 	kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
60060Sstevel@tonic-gate 	if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
60070Sstevel@tonic-gate 		kmem_free(kname, ZONENAME_MAX);
60080Sstevel@tonic-gate 		return (set_errno(err));
60090Sstevel@tonic-gate 	}
60100Sstevel@tonic-gate 
60110Sstevel@tonic-gate 	mutex_enter(&zonehash_lock);
60120Sstevel@tonic-gate 	zone = zone_find_all_by_name(kname);
60130Sstevel@tonic-gate 	kmem_free(kname, ZONENAME_MAX);
60141676Sjpk 	/*
60151676Sjpk 	 * In a non-global zone, can only lookup global and own name.
60161676Sjpk 	 * In Trusted Extensions zone label dominance rules apply.
60171676Sjpk 	 */
60181676Sjpk 	if (zone == NULL ||
60191676Sjpk 	    zone_status_get(zone) < ZONE_IS_READY ||
60201676Sjpk 	    !zone_list_access(zone)) {
60210Sstevel@tonic-gate 		mutex_exit(&zonehash_lock);
60220Sstevel@tonic-gate 		return (set_errno(EINVAL));
60231676Sjpk 	} else {
60241676Sjpk 		zoneid = zone->zone_id;
60251676Sjpk 		mutex_exit(&zonehash_lock);
60261676Sjpk 		return (zoneid);
60270Sstevel@tonic-gate 	}
60280Sstevel@tonic-gate }
60290Sstevel@tonic-gate 
6030813Sdp static int
zone_version(int * version_arg)6031813Sdp zone_version(int *version_arg)
6032813Sdp {
6033813Sdp 	int version = ZONE_SYSCALL_API_VERSION;
6034813Sdp 
6035813Sdp 	if (copyout(&version, version_arg, sizeof (int)) != 0)
6036813Sdp 		return (set_errno(EFAULT));
6037813Sdp 	return (0);
6038813Sdp }
6039813Sdp 
60400Sstevel@tonic-gate /* ARGSUSED */
60410Sstevel@tonic-gate long
zone(int cmd,void * arg1,void * arg2,void * arg3,void * arg4)6042789Sahrens zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
60430Sstevel@tonic-gate {
60440Sstevel@tonic-gate 	zone_def zs;
604510616SSebastien.Roy@Sun.COM 	int err;
60460Sstevel@tonic-gate 
60470Sstevel@tonic-gate 	switch (cmd) {
60480Sstevel@tonic-gate 	case ZONE_CREATE:
60490Sstevel@tonic-gate 		if (get_udatamodel() == DATAMODEL_NATIVE) {
60500Sstevel@tonic-gate 			if (copyin(arg1, &zs, sizeof (zone_def))) {
60510Sstevel@tonic-gate 				return (set_errno(EFAULT));
60520Sstevel@tonic-gate 			}
60530Sstevel@tonic-gate 		} else {
60540Sstevel@tonic-gate #ifdef _SYSCALL32_IMPL
60550Sstevel@tonic-gate 			zone_def32 zs32;
60560Sstevel@tonic-gate 
60570Sstevel@tonic-gate 			if (copyin(arg1, &zs32, sizeof (zone_def32))) {
60580Sstevel@tonic-gate 				return (set_errno(EFAULT));
60590Sstevel@tonic-gate 			}
60600Sstevel@tonic-gate 			zs.zone_name =
60610Sstevel@tonic-gate 			    (const char *)(unsigned long)zs32.zone_name;
60620Sstevel@tonic-gate 			zs.zone_root =
60630Sstevel@tonic-gate 			    (const char *)(unsigned long)zs32.zone_root;
60640Sstevel@tonic-gate 			zs.zone_privs =
60650Sstevel@tonic-gate 			    (const struct priv_set *)
60660Sstevel@tonic-gate 			    (unsigned long)zs32.zone_privs;
60671409Sdp 			zs.zone_privssz = zs32.zone_privssz;
60680Sstevel@tonic-gate 			zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
60690Sstevel@tonic-gate 			zs.rctlbufsz = zs32.rctlbufsz;
6070789Sahrens 			zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6071789Sahrens 			zs.zfsbufsz = zs32.zfsbufsz;
60720Sstevel@tonic-gate 			zs.extended_error =
60730Sstevel@tonic-gate 			    (int *)(unsigned long)zs32.extended_error;
60741676Sjpk 			zs.match = zs32.match;
60751676Sjpk 			zs.doi = zs32.doi;
60761676Sjpk 			zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
60773448Sdh155122 			zs.flags = zs32.flags;
60780Sstevel@tonic-gate #else
60790Sstevel@tonic-gate 			panic("get_udatamodel() returned bogus result\n");
60800Sstevel@tonic-gate #endif
60810Sstevel@tonic-gate 		}
60820Sstevel@tonic-gate 
60830Sstevel@tonic-gate 		return (zone_create(zs.zone_name, zs.zone_root,
6084813Sdp 		    zs.zone_privs, zs.zone_privssz,
6085813Sdp 		    (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6086813Sdp 		    (caddr_t)zs.zfsbuf, zs.zfsbufsz,
60871676Sjpk 		    zs.extended_error, zs.match, zs.doi,
60883448Sdh155122 		    zs.label, zs.flags));
60890Sstevel@tonic-gate 	case ZONE_BOOT:
60902267Sdp 		return (zone_boot((zoneid_t)(uintptr_t)arg1));
60910Sstevel@tonic-gate 	case ZONE_DESTROY:
60920Sstevel@tonic-gate 		return (zone_destroy((zoneid_t)(uintptr_t)arg1));
60930Sstevel@tonic-gate 	case ZONE_GETATTR:
60940Sstevel@tonic-gate 		return (zone_getattr((zoneid_t)(uintptr_t)arg1,
60950Sstevel@tonic-gate 		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
60962267Sdp 	case ZONE_SETATTR:
60972267Sdp 		return (zone_setattr((zoneid_t)(uintptr_t)arg1,
60982267Sdp 		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
60990Sstevel@tonic-gate 	case ZONE_ENTER:
61000Sstevel@tonic-gate 		return (zone_enter((zoneid_t)(uintptr_t)arg1));
61010Sstevel@tonic-gate 	case ZONE_LIST:
61020Sstevel@tonic-gate 		return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
61030Sstevel@tonic-gate 	case ZONE_SHUTDOWN:
61040Sstevel@tonic-gate 		return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
61050Sstevel@tonic-gate 	case ZONE_LOOKUP:
61060Sstevel@tonic-gate 		return (zone_lookup((const char *)arg1));
6107813Sdp 	case ZONE_VERSION:
6108813Sdp 		return (zone_version((int *)arg1));
61093448Sdh155122 	case ZONE_ADD_DATALINK:
61103448Sdh155122 		return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
611110616SSebastien.Roy@Sun.COM 		    (datalink_id_t)(uintptr_t)arg2));
61123448Sdh155122 	case ZONE_DEL_DATALINK:
61133448Sdh155122 		return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
611410616SSebastien.Roy@Sun.COM 		    (datalink_id_t)(uintptr_t)arg2));
611510616SSebastien.Roy@Sun.COM 	case ZONE_CHECK_DATALINK: {
611610616SSebastien.Roy@Sun.COM 		zoneid_t	zoneid;
611710616SSebastien.Roy@Sun.COM 		boolean_t	need_copyout;
611810616SSebastien.Roy@Sun.COM 
611910616SSebastien.Roy@Sun.COM 		if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
612010616SSebastien.Roy@Sun.COM 			return (EFAULT);
612110616SSebastien.Roy@Sun.COM 		need_copyout = (zoneid == ALL_ZONES);
612210616SSebastien.Roy@Sun.COM 		err = zone_check_datalink(&zoneid,
612310616SSebastien.Roy@Sun.COM 		    (datalink_id_t)(uintptr_t)arg2);
612410616SSebastien.Roy@Sun.COM 		if (err == 0 && need_copyout) {
612510616SSebastien.Roy@Sun.COM 			if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
612610616SSebastien.Roy@Sun.COM 				err = EFAULT;
612710616SSebastien.Roy@Sun.COM 		}
612810616SSebastien.Roy@Sun.COM 		return (err == 0 ? 0 : set_errno(err));
612910616SSebastien.Roy@Sun.COM 	}
61303448Sdh155122 	case ZONE_LIST_DATALINK:
61313448Sdh155122 		return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
613210616SSebastien.Roy@Sun.COM 		    (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
61330Sstevel@tonic-gate 	default:
61340Sstevel@tonic-gate 		return (set_errno(EINVAL));
61350Sstevel@tonic-gate 	}
61360Sstevel@tonic-gate }
61370Sstevel@tonic-gate 
/*
 * Hand-off record passed from zone_kadmin() to the zone_ki_call_zoneadmd()
 * kernel thread.  zone_kadmin() allocates it with kmem_zalloc() and the
 * thread frees it after copying both members out.
 */
61380Sstevel@tonic-gate struct zarg {
61390Sstevel@tonic-gate 	zone_t *zone;		/* zone being shut down; held by zone_kadmin() */
61400Sstevel@tonic-gate 	zone_cmd_arg_t arg;	/* request sent to zoneadmd through the door */
61410Sstevel@tonic-gate };
61420Sstevel@tonic-gate 
61430Sstevel@tonic-gate static int
zone_lookup_door(const char * zone_name,door_handle_t * doorp)61440Sstevel@tonic-gate zone_lookup_door(const char *zone_name, door_handle_t *doorp)
61450Sstevel@tonic-gate {
61460Sstevel@tonic-gate 	char *buf;
61470Sstevel@tonic-gate 	size_t buflen;
61480Sstevel@tonic-gate 	int error;
61490Sstevel@tonic-gate 
61500Sstevel@tonic-gate 	buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
61510Sstevel@tonic-gate 	buf = kmem_alloc(buflen, KM_SLEEP);
61520Sstevel@tonic-gate 	(void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
61530Sstevel@tonic-gate 	error = door_ki_open(buf, doorp);
61540Sstevel@tonic-gate 	kmem_free(buf, buflen);
61550Sstevel@tonic-gate 	return (error);
61560Sstevel@tonic-gate }
61570Sstevel@tonic-gate 
61580Sstevel@tonic-gate static void
zone_release_door(door_handle_t * doorp)61590Sstevel@tonic-gate zone_release_door(door_handle_t *doorp)
61600Sstevel@tonic-gate {
61610Sstevel@tonic-gate 	door_ki_rele(*doorp);
61620Sstevel@tonic-gate 	*doorp = NULL;
61630Sstevel@tonic-gate }
61640Sstevel@tonic-gate 
61650Sstevel@tonic-gate static void
zone_ki_call_zoneadmd(struct zarg * zargp)61660Sstevel@tonic-gate zone_ki_call_zoneadmd(struct zarg *zargp)
61670Sstevel@tonic-gate {
61680Sstevel@tonic-gate 	door_handle_t door = NULL;
61690Sstevel@tonic-gate 	door_arg_t darg, save_arg;
61700Sstevel@tonic-gate 	char *zone_name;
61710Sstevel@tonic-gate 	size_t zone_namelen;
61720Sstevel@tonic-gate 	zoneid_t zoneid;
61730Sstevel@tonic-gate 	zone_t *zone;
61740Sstevel@tonic-gate 	zone_cmd_arg_t arg;
61750Sstevel@tonic-gate 	uint64_t uniqid;
61760Sstevel@tonic-gate 	size_t size;
61770Sstevel@tonic-gate 	int error;
61780Sstevel@tonic-gate 	int retry;
61790Sstevel@tonic-gate 
61800Sstevel@tonic-gate 	zone = zargp->zone;
61810Sstevel@tonic-gate 	arg = zargp->arg;
61820Sstevel@tonic-gate 	kmem_free(zargp, sizeof (*zargp));
61830Sstevel@tonic-gate 
61840Sstevel@tonic-gate 	zone_namelen = strlen(zone->zone_name) + 1;
61850Sstevel@tonic-gate 	zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
61860Sstevel@tonic-gate 	bcopy(zone->zone_name, zone_name, zone_namelen);
61870Sstevel@tonic-gate 	zoneid = zone->zone_id;
61880Sstevel@tonic-gate 	uniqid = zone->zone_uniqid;
61890Sstevel@tonic-gate 	/*
61900Sstevel@tonic-gate 	 * zoneadmd may be down, but at least we can empty out the zone.
61910Sstevel@tonic-gate 	 * We can ignore the return value of zone_empty() since we're called
61920Sstevel@tonic-gate 	 * from a kernel thread and know we won't be delivered any signals.
61930Sstevel@tonic-gate 	 */
61940Sstevel@tonic-gate 	ASSERT(curproc == &p0);
61950Sstevel@tonic-gate 	(void) zone_empty(zone);
61960Sstevel@tonic-gate 	ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
61970Sstevel@tonic-gate 	zone_rele(zone);
61980Sstevel@tonic-gate 
61990Sstevel@tonic-gate 	size = sizeof (arg);
62000Sstevel@tonic-gate 	darg.rbuf = (char *)&arg;
62010Sstevel@tonic-gate 	darg.data_ptr = (char *)&arg;
62020Sstevel@tonic-gate 	darg.rsize = size;
62030Sstevel@tonic-gate 	darg.data_size = size;
62040Sstevel@tonic-gate 	darg.desc_ptr = NULL;
62050Sstevel@tonic-gate 	darg.desc_num = 0;
62060Sstevel@tonic-gate 
62070Sstevel@tonic-gate 	save_arg = darg;
62080Sstevel@tonic-gate 	/*
62090Sstevel@tonic-gate 	 * Since we're not holding a reference to the zone, any number of
62100Sstevel@tonic-gate 	 * things can go wrong, including the zone disappearing before we get a
62110Sstevel@tonic-gate 	 * chance to talk to zoneadmd.
62120Sstevel@tonic-gate 	 */
62130Sstevel@tonic-gate 	for (retry = 0; /* forever */; retry++) {
62140Sstevel@tonic-gate 		if (door == NULL &&
62150Sstevel@tonic-gate 		    (error = zone_lookup_door(zone_name, &door)) != 0) {
62160Sstevel@tonic-gate 			goto next;
62170Sstevel@tonic-gate 		}
62180Sstevel@tonic-gate 		ASSERT(door != NULL);
62190Sstevel@tonic-gate 
62206997Sjwadams 		if ((error = door_ki_upcall_limited(door, &darg, NULL,
62216997Sjwadams 		    SIZE_MAX, 0)) == 0) {
62220Sstevel@tonic-gate 			break;
62230Sstevel@tonic-gate 		}
62240Sstevel@tonic-gate 		switch (error) {
62250Sstevel@tonic-gate 		case EINTR:
62260Sstevel@tonic-gate 			/* FALLTHROUGH */
62270Sstevel@tonic-gate 		case EAGAIN:	/* process may be forking */
62280Sstevel@tonic-gate 			/*
62290Sstevel@tonic-gate 			 * Back off for a bit
62300Sstevel@tonic-gate 			 */
62310Sstevel@tonic-gate 			break;
62320Sstevel@tonic-gate 		case EBADF:
62330Sstevel@tonic-gate 			zone_release_door(&door);
62340Sstevel@tonic-gate 			if (zone_lookup_door(zone_name, &door) != 0) {
62350Sstevel@tonic-gate 				/*
62360Sstevel@tonic-gate 				 * zoneadmd may be dead, but it may come back to
62370Sstevel@tonic-gate 				 * life later.
62380Sstevel@tonic-gate 				 */
62390Sstevel@tonic-gate 				break;
62400Sstevel@tonic-gate 			}
62410Sstevel@tonic-gate 			break;
62420Sstevel@tonic-gate 		default:
62430Sstevel@tonic-gate 			cmn_err(CE_WARN,
62440Sstevel@tonic-gate 			    "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
62450Sstevel@tonic-gate 			    error);
62460Sstevel@tonic-gate 			goto out;
62470Sstevel@tonic-gate 		}
62480Sstevel@tonic-gate next:
62490Sstevel@tonic-gate 		/*
62500Sstevel@tonic-gate 		 * If this isn't the same zone_t that we originally had in mind,
62510Sstevel@tonic-gate 		 * then this is the same as if two kadmin requests come in at
62520Sstevel@tonic-gate 		 * the same time: the first one wins.  This means we lose, so we
62530Sstevel@tonic-gate 		 * bail.
62540Sstevel@tonic-gate 		 */
62550Sstevel@tonic-gate 		if ((zone = zone_find_by_id(zoneid)) == NULL) {
62560Sstevel@tonic-gate 			/*
62570Sstevel@tonic-gate 			 * Problem is solved.
62580Sstevel@tonic-gate 			 */
62590Sstevel@tonic-gate 			break;
62600Sstevel@tonic-gate 		}
62610Sstevel@tonic-gate 		if (zone->zone_uniqid != uniqid) {
62620Sstevel@tonic-gate 			/*
62630Sstevel@tonic-gate 			 * zoneid recycled
62640Sstevel@tonic-gate 			 */
62650Sstevel@tonic-gate 			zone_rele(zone);
62660Sstevel@tonic-gate 			break;
62670Sstevel@tonic-gate 		}
62680Sstevel@tonic-gate 		/*
62690Sstevel@tonic-gate 		 * We could zone_status_timedwait(), but there doesn't seem to
62700Sstevel@tonic-gate 		 * be much point in doing that (plus, it would mean that
62710Sstevel@tonic-gate 		 * zone_free() isn't called until this thread exits).
62720Sstevel@tonic-gate 		 */
62730Sstevel@tonic-gate 		zone_rele(zone);
62740Sstevel@tonic-gate 		delay(hz);
62750Sstevel@tonic-gate 		darg = save_arg;
62760Sstevel@tonic-gate 	}
62770Sstevel@tonic-gate out:
62780Sstevel@tonic-gate 	if (door != NULL) {
62790Sstevel@tonic-gate 		zone_release_door(&door);
62800Sstevel@tonic-gate 	}
62810Sstevel@tonic-gate 	kmem_free(zone_name, zone_namelen);
62820Sstevel@tonic-gate 	thread_exit();
62830Sstevel@tonic-gate }
62840Sstevel@tonic-gate 
62850Sstevel@tonic-gate /*
62862267Sdp  * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
62872267Sdp  * kadmin().  The caller is a process in the zone.
62880Sstevel@tonic-gate  *
62890Sstevel@tonic-gate  * In order to shutdown the zone, we will hand off control to zoneadmd
62900Sstevel@tonic-gate  * (running in the global zone) via a door.  We do a half-hearted job at
62910Sstevel@tonic-gate  * killing all processes in the zone, create a kernel thread to contact
62920Sstevel@tonic-gate  * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
62930Sstevel@tonic-gate  * a form of generation number used to let zoneadmd (as well as
62940Sstevel@tonic-gate  * zone_destroy()) know exactly which zone they're re talking about.
62950Sstevel@tonic-gate  */
62960Sstevel@tonic-gate int
zone_kadmin(int cmd,int fcn,const char * mdep,cred_t * credp)62972267Sdp zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
62980Sstevel@tonic-gate {
62990Sstevel@tonic-gate 	struct zarg *zargp;
63000Sstevel@tonic-gate 	zone_cmd_t zcmd;
63010Sstevel@tonic-gate 	zone_t *zone;
63020Sstevel@tonic-gate 
63030Sstevel@tonic-gate 	zone = curproc->p_zone;
63040Sstevel@tonic-gate 	ASSERT(getzoneid() != GLOBAL_ZONEID);
63050Sstevel@tonic-gate 
63060Sstevel@tonic-gate 	switch (cmd) {
63070Sstevel@tonic-gate 	case A_SHUTDOWN:
63080Sstevel@tonic-gate 		switch (fcn) {
63090Sstevel@tonic-gate 		case AD_HALT:
63100Sstevel@tonic-gate 		case AD_POWEROFF:
63110Sstevel@tonic-gate 			zcmd = Z_HALT;
63120Sstevel@tonic-gate 			break;
63130Sstevel@tonic-gate 		case AD_BOOT:
63140Sstevel@tonic-gate 			zcmd = Z_REBOOT;
63150Sstevel@tonic-gate 			break;
63160Sstevel@tonic-gate 		case AD_IBOOT:
63170Sstevel@tonic-gate 		case AD_SBOOT:
63180Sstevel@tonic-gate 		case AD_SIBOOT:
63190Sstevel@tonic-gate 		case AD_NOSYNC:
63200Sstevel@tonic-gate 			return (ENOTSUP);
63210Sstevel@tonic-gate 		default:
63220Sstevel@tonic-gate 			return (EINVAL);
63230Sstevel@tonic-gate 		}
63240Sstevel@tonic-gate 		break;
63250Sstevel@tonic-gate 	case A_REBOOT:
63260Sstevel@tonic-gate 		zcmd = Z_REBOOT;
63270Sstevel@tonic-gate 		break;
63280Sstevel@tonic-gate 	case A_FTRACE:
63290Sstevel@tonic-gate 	case A_REMOUNT:
63300Sstevel@tonic-gate 	case A_FREEZE:
63310Sstevel@tonic-gate 	case A_DUMP:
63329160SSherry.Moore@Sun.COM 	case A_CONFIG:
63330Sstevel@tonic-gate 		return (ENOTSUP);
63340Sstevel@tonic-gate 	default:
63350Sstevel@tonic-gate 		ASSERT(cmd != A_SWAPCTL);	/* handled by uadmin() */
63360Sstevel@tonic-gate 		return (EINVAL);
63370Sstevel@tonic-gate 	}
63380Sstevel@tonic-gate 
63390Sstevel@tonic-gate 	if (secpolicy_zone_admin(credp, B_FALSE))
63400Sstevel@tonic-gate 		return (EPERM);
63410Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
63422267Sdp 
63430Sstevel@tonic-gate 	/*
63440Sstevel@tonic-gate 	 * zone_status can't be ZONE_IS_EMPTY or higher since curproc
63450Sstevel@tonic-gate 	 * is in the zone.
63460Sstevel@tonic-gate 	 */
63470Sstevel@tonic-gate 	ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
63480Sstevel@tonic-gate 	if (zone_status_get(zone) > ZONE_IS_RUNNING) {
63490Sstevel@tonic-gate 		/*
63500Sstevel@tonic-gate 		 * This zone is already on its way down.
63510Sstevel@tonic-gate 		 */
63520Sstevel@tonic-gate 		mutex_exit(&zone_status_lock);
63530Sstevel@tonic-gate 		return (0);
63540Sstevel@tonic-gate 	}
63550Sstevel@tonic-gate 	/*
63560Sstevel@tonic-gate 	 * Prevent future zone_enter()s
63570Sstevel@tonic-gate 	 */
63580Sstevel@tonic-gate 	zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
63590Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
63600Sstevel@tonic-gate 
63610Sstevel@tonic-gate 	/*
63620Sstevel@tonic-gate 	 * Kill everyone now and call zoneadmd later.
63630Sstevel@tonic-gate 	 * zone_ki_call_zoneadmd() will do a more thorough job of this
63640Sstevel@tonic-gate 	 * later.
63650Sstevel@tonic-gate 	 */
63660Sstevel@tonic-gate 	killall(zone->zone_id);
63670Sstevel@tonic-gate 	/*
63680Sstevel@tonic-gate 	 * Now, create the thread to contact zoneadmd and do the rest of the
63690Sstevel@tonic-gate 	 * work.  This thread can't be created in our zone otherwise
63700Sstevel@tonic-gate 	 * zone_destroy() would deadlock.
63710Sstevel@tonic-gate 	 */
63722267Sdp 	zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
63730Sstevel@tonic-gate 	zargp->arg.cmd = zcmd;
63740Sstevel@tonic-gate 	zargp->arg.uniqid = zone->zone_uniqid;
63752267Sdp 	zargp->zone = zone;
63760Sstevel@tonic-gate 	(void) strcpy(zargp->arg.locale, "C");
63772267Sdp 	/* mdep was already copied in for us by uadmin */
63782267Sdp 	if (mdep != NULL)
63792267Sdp 		(void) strlcpy(zargp->arg.bootbuf, mdep,
63802267Sdp 		    sizeof (zargp->arg.bootbuf));
63812267Sdp 	zone_hold(zone);
63820Sstevel@tonic-gate 
63830Sstevel@tonic-gate 	(void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
63840Sstevel@tonic-gate 	    TS_RUN, minclsyspri);
63850Sstevel@tonic-gate 	exit(CLD_EXITED, 0);
63860Sstevel@tonic-gate 
63870Sstevel@tonic-gate 	return (EINVAL);
63880Sstevel@tonic-gate }
63890Sstevel@tonic-gate 
63900Sstevel@tonic-gate /*
63910Sstevel@tonic-gate  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
63920Sstevel@tonic-gate  * status to ZONE_IS_SHUTTING_DOWN.
63938364SJordan.Vaughan@Sun.com  *
63948364SJordan.Vaughan@Sun.com  * This function also shuts down all running zones to ensure that they won't
63958364SJordan.Vaughan@Sun.com  * fork new processes.
63960Sstevel@tonic-gate  */
63970Sstevel@tonic-gate void
zone_shutdown_global(void)63980Sstevel@tonic-gate zone_shutdown_global(void)
63990Sstevel@tonic-gate {
64008364SJordan.Vaughan@Sun.com 	zone_t *current_zonep;
64018364SJordan.Vaughan@Sun.com 
64028364SJordan.Vaughan@Sun.com 	ASSERT(INGLOBALZONE(curproc));
64038364SJordan.Vaughan@Sun.com 	mutex_enter(&zonehash_lock);
64040Sstevel@tonic-gate 	mutex_enter(&zone_status_lock);
64058364SJordan.Vaughan@Sun.com 
64068364SJordan.Vaughan@Sun.com 	/* Modify the global zone's status first. */
64070Sstevel@tonic-gate 	ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
64080Sstevel@tonic-gate 	zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
64098364SJordan.Vaughan@Sun.com 
64108364SJordan.Vaughan@Sun.com 	/*
64118364SJordan.Vaughan@Sun.com 	 * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
64128364SJordan.Vaughan@Sun.com 	 * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
64138364SJordan.Vaughan@Sun.com 	 * could cause assertions to fail (e.g., assertions about a zone's
64148364SJordan.Vaughan@Sun.com 	 * state during initialization, readying, or booting) or produce races.
64158364SJordan.Vaughan@Sun.com 	 * We'll let threads continue to initialize and ready new zones: they'll
64168364SJordan.Vaughan@Sun.com 	 * fail to boot the new zones when they see that the global zone is
64178364SJordan.Vaughan@Sun.com 	 * shutting down.
64188364SJordan.Vaughan@Sun.com 	 */
64198364SJordan.Vaughan@Sun.com 	for (current_zonep = list_head(&zone_active); current_zonep != NULL;
64208364SJordan.Vaughan@Sun.com 	    current_zonep = list_next(&zone_active, current_zonep)) {
64218364SJordan.Vaughan@Sun.com 		if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
64228364SJordan.Vaughan@Sun.com 			zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
64238364SJordan.Vaughan@Sun.com 	}
64240Sstevel@tonic-gate 	mutex_exit(&zone_status_lock);
64258364SJordan.Vaughan@Sun.com 	mutex_exit(&zonehash_lock);
64260Sstevel@tonic-gate }
6427789Sahrens 
6428789Sahrens /*
6429789Sahrens  * Returns true if the named dataset is visible in the current zone.
6430789Sahrens  * The 'write' parameter is set to 1 if the dataset is also writable.
6431789Sahrens  */
6432789Sahrens int
zone_dataset_visible(const char * dataset,int * write)6433789Sahrens zone_dataset_visible(const char *dataset, int *write)
6434789Sahrens {
643511850SSanjeev.Bagewadi@Sun.COM 	static int zfstype = -1;
6436789Sahrens 	zone_dataset_t *zd;
6437789Sahrens 	size_t len;
6438789Sahrens 	zone_t *zone = curproc->p_zone;
643911850SSanjeev.Bagewadi@Sun.COM 	const char *name = NULL;
644011850SSanjeev.Bagewadi@Sun.COM 	vfs_t *vfsp = NULL;
6441789Sahrens 
6442789Sahrens 	if (dataset[0] == '\0')
6443789Sahrens 		return (0);
6444789Sahrens 
6445789Sahrens 	/*
6446789Sahrens 	 * Walk the list once, looking for datasets which match exactly, or
6447789Sahrens 	 * specify a dataset underneath an exported dataset.  If found, return
6448789Sahrens 	 * true and note that it is writable.
6449789Sahrens 	 */
6450789Sahrens 	for (zd = list_head(&zone->zone_datasets); zd != NULL;
6451789Sahrens 	    zd = list_next(&zone->zone_datasets, zd)) {
6452789Sahrens 
6453789Sahrens 		len = strlen(zd->zd_dataset);
6454789Sahrens 		if (strlen(dataset) >= len &&
6455789Sahrens 		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
6456816Smaybee 		    (dataset[len] == '\0' || dataset[len] == '/' ||
6457816Smaybee 		    dataset[len] == '@')) {
6458789Sahrens 			if (write)
6459789Sahrens 				*write = 1;
6460789Sahrens 			return (1);
6461789Sahrens 		}
6462789Sahrens 	}
6463789Sahrens 
6464789Sahrens 	/*
6465789Sahrens 	 * Walk the list a second time, searching for datasets which are parents
6466789Sahrens 	 * of exported datasets.  These should be visible, but read-only.
6467789Sahrens 	 *
6468789Sahrens 	 * Note that we also have to support forms such as 'pool/dataset/', with
6469789Sahrens 	 * a trailing slash.
6470789Sahrens 	 */
6471789Sahrens 	for (zd = list_head(&zone->zone_datasets); zd != NULL;
6472789Sahrens 	    zd = list_next(&zone->zone_datasets, zd)) {
6473789Sahrens 
6474789Sahrens 		len = strlen(dataset);
6475789Sahrens 		if (dataset[len - 1] == '/')
6476789Sahrens 			len--;	/* Ignore trailing slash */
6477789Sahrens 		if (len < strlen(zd->zd_dataset) &&
6478789Sahrens 		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
6479789Sahrens 		    zd->zd_dataset[len] == '/') {
6480789Sahrens 			if (write)
6481789Sahrens 				*write = 0;
6482789Sahrens 			return (1);
6483789Sahrens 		}
6484789Sahrens 	}
6485789Sahrens 
648611850SSanjeev.Bagewadi@Sun.COM 	/*
648711850SSanjeev.Bagewadi@Sun.COM 	 * We reach here if the given dataset is not found in the zone_dataset
648811850SSanjeev.Bagewadi@Sun.COM 	 * list. Check if this dataset was added as a filesystem (ie. "add fs")
648911850SSanjeev.Bagewadi@Sun.COM 	 * instead of delegation. For this we search for the dataset in the
649011850SSanjeev.Bagewadi@Sun.COM 	 * zone_vfslist of this zone. If found, return true and note that it is
649111850SSanjeev.Bagewadi@Sun.COM 	 * not writable.
649211850SSanjeev.Bagewadi@Sun.COM 	 */
649311850SSanjeev.Bagewadi@Sun.COM 
649411850SSanjeev.Bagewadi@Sun.COM 	/*
649511850SSanjeev.Bagewadi@Sun.COM 	 * Initialize zfstype if it is not initialized yet.
649611850SSanjeev.Bagewadi@Sun.COM 	 */
649711850SSanjeev.Bagewadi@Sun.COM 	if (zfstype == -1) {
649811850SSanjeev.Bagewadi@Sun.COM 		struct vfssw *vswp = vfs_getvfssw("zfs");
649911850SSanjeev.Bagewadi@Sun.COM 		zfstype = vswp - vfssw;
650011850SSanjeev.Bagewadi@Sun.COM 		vfs_unrefvfssw(vswp);
650111850SSanjeev.Bagewadi@Sun.COM 	}
650211850SSanjeev.Bagewadi@Sun.COM 
650311850SSanjeev.Bagewadi@Sun.COM 	vfs_list_read_lock();
650411850SSanjeev.Bagewadi@Sun.COM 	vfsp = zone->zone_vfslist;
650511850SSanjeev.Bagewadi@Sun.COM 	do {
650611850SSanjeev.Bagewadi@Sun.COM 		ASSERT(vfsp);
650711850SSanjeev.Bagewadi@Sun.COM 		if (vfsp->vfs_fstype == zfstype) {
650811850SSanjeev.Bagewadi@Sun.COM 			name = refstr_value(vfsp->vfs_resource);
650911850SSanjeev.Bagewadi@Sun.COM 
651011850SSanjeev.Bagewadi@Sun.COM 			/*
651111850SSanjeev.Bagewadi@Sun.COM 			 * Check if we have an exact match.
651211850SSanjeev.Bagewadi@Sun.COM 			 */
651311850SSanjeev.Bagewadi@Sun.COM 			if (strcmp(dataset, name) == 0) {
651411850SSanjeev.Bagewadi@Sun.COM 				vfs_list_unlock();
651511850SSanjeev.Bagewadi@Sun.COM 				if (write)
651611850SSanjeev.Bagewadi@Sun.COM 					*write = 0;
651711850SSanjeev.Bagewadi@Sun.COM 				return (1);
651811850SSanjeev.Bagewadi@Sun.COM 			}
651911850SSanjeev.Bagewadi@Sun.COM 			/*
652011850SSanjeev.Bagewadi@Sun.COM 			 * We need to check if we are looking for parents of
652111850SSanjeev.Bagewadi@Sun.COM 			 * a dataset. These should be visible, but read-only.
652211850SSanjeev.Bagewadi@Sun.COM 			 */
652311850SSanjeev.Bagewadi@Sun.COM 			len = strlen(dataset);
652411850SSanjeev.Bagewadi@Sun.COM 			if (dataset[len - 1] == '/')
652511850SSanjeev.Bagewadi@Sun.COM 				len--;
652611850SSanjeev.Bagewadi@Sun.COM 
652711850SSanjeev.Bagewadi@Sun.COM 			if (len < strlen(name) &&
652811850SSanjeev.Bagewadi@Sun.COM 			    bcmp(dataset, name, len) == 0 && name[len] == '/') {
652911850SSanjeev.Bagewadi@Sun.COM 				vfs_list_unlock();
653011850SSanjeev.Bagewadi@Sun.COM 				if (write)
653111850SSanjeev.Bagewadi@Sun.COM 					*write = 0;
653211850SSanjeev.Bagewadi@Sun.COM 				return (1);
653311850SSanjeev.Bagewadi@Sun.COM 			}
653411850SSanjeev.Bagewadi@Sun.COM 		}
653511850SSanjeev.Bagewadi@Sun.COM 		vfsp = vfsp->vfs_zone_next;
653611850SSanjeev.Bagewadi@Sun.COM 	} while (vfsp != zone->zone_vfslist);
653711850SSanjeev.Bagewadi@Sun.COM 
653811850SSanjeev.Bagewadi@Sun.COM 	vfs_list_unlock();
6539789Sahrens 	return (0);
6540789Sahrens }
65411676Sjpk 
65421676Sjpk /*
65431676Sjpk  * zone_find_by_any_path() -
65441676Sjpk  *
65451676Sjpk  * kernel-private routine similar to zone_find_by_path(), but which
65461676Sjpk  * effectively compares against zone paths rather than zonerootpath
65471676Sjpk  * (i.e., the last component of zonerootpaths, which should be "root/",
65481676Sjpk  * are not compared.)  This is done in order to accurately identify all
65491676Sjpk  * paths, whether zone-visible or not, including those which are parallel
65501676Sjpk  * to /root/, such as /dev/, /home/, etc...
65511676Sjpk  *
65521676Sjpk  * If the specified path does not fall under any zone path then global
65531676Sjpk  * zone is returned.
65541676Sjpk  *
65551676Sjpk  * The treat_abs parameter indicates whether the path should be treated as
65561676Sjpk  * an absolute path although it does not begin with "/".  (This supports
65571676Sjpk  * nfs mount syntax such as host:any/path.)
65581676Sjpk  *
65591676Sjpk  * The caller is responsible for zone_rele of the returned zone.
65601676Sjpk  */
65611676Sjpk zone_t *
zone_find_by_any_path(const char * path,boolean_t treat_abs)65621676Sjpk zone_find_by_any_path(const char *path, boolean_t treat_abs)
65631676Sjpk {
65641676Sjpk 	zone_t *zone;
65651676Sjpk 	int path_offset = 0;
65661676Sjpk 
65671676Sjpk 	if (path == NULL) {
65681676Sjpk 		zone_hold(global_zone);
65691676Sjpk 		return (global_zone);
65701676Sjpk 	}
65711676Sjpk 
65721676Sjpk 	if (*path != '/') {
65731676Sjpk 		ASSERT(treat_abs);
65741676Sjpk 		path_offset = 1;
65751676Sjpk 	}
65761676Sjpk 
65771676Sjpk 	mutex_enter(&zonehash_lock);
65781676Sjpk 	for (zone = list_head(&zone_active); zone != NULL;
65791676Sjpk 	    zone = list_next(&zone_active, zone)) {
65801676Sjpk 		char	*c;
65811676Sjpk 		size_t	pathlen;
65821876Smp46848 		char *rootpath_start;
65831676Sjpk 
65841676Sjpk 		if (zone == global_zone)	/* skip global zone */
65851676Sjpk 			continue;
65861676Sjpk 
65871676Sjpk 		/* scan backwards to find start of last component */
65881676Sjpk 		c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
65891676Sjpk 		do {
65901676Sjpk 			c--;
65911676Sjpk 		} while (*c != '/');
65921676Sjpk 
65931876Smp46848 		pathlen = c - zone->zone_rootpath + 1 - path_offset;
65941876Smp46848 		rootpath_start = (zone->zone_rootpath + path_offset);
65951876Smp46848 		if (strncmp(path, rootpath_start, pathlen) == 0)
65961676Sjpk 			break;
65971676Sjpk 	}
65981676Sjpk 	if (zone == NULL)
65991676Sjpk 		zone = global_zone;
66001676Sjpk 	zone_hold(zone);
66011676Sjpk 	mutex_exit(&zonehash_lock);
66021676Sjpk 	return (zone);
66031676Sjpk }
66043448Sdh155122 
66053448Sdh155122 /*
660610616SSebastien.Roy@Sun.COM  * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
660710616SSebastien.Roy@Sun.COM  * zone_dl_t pointer if found, and NULL otherwise.
66083448Sdh155122  */
660910616SSebastien.Roy@Sun.COM static zone_dl_t *
zone_find_dl(zone_t * zone,datalink_id_t linkid)661010616SSebastien.Roy@Sun.COM zone_find_dl(zone_t *zone, datalink_id_t linkid)
661110616SSebastien.Roy@Sun.COM {
661210616SSebastien.Roy@Sun.COM 	zone_dl_t *zdl;
661310616SSebastien.Roy@Sun.COM 
661410616SSebastien.Roy@Sun.COM 	ASSERT(mutex_owned(&zone->zone_lock));
661510616SSebastien.Roy@Sun.COM 	for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
661610616SSebastien.Roy@Sun.COM 	    zdl = list_next(&zone->zone_dl_list, zdl)) {
661710616SSebastien.Roy@Sun.COM 		if (zdl->zdl_id == linkid)
661810616SSebastien.Roy@Sun.COM 			break;
661910616SSebastien.Roy@Sun.COM 	}
662010616SSebastien.Roy@Sun.COM 	return (zdl);
662110616SSebastien.Roy@Sun.COM }
662210616SSebastien.Roy@Sun.COM 
66233448Sdh155122 static boolean_t
zone_dl_exists(zone_t * zone,datalink_id_t linkid)662410616SSebastien.Roy@Sun.COM zone_dl_exists(zone_t *zone, datalink_id_t linkid)
66253448Sdh155122 {
662610616SSebastien.Roy@Sun.COM 	boolean_t exists;
66273448Sdh155122 
66283448Sdh155122 	mutex_enter(&zone->zone_lock);
662910616SSebastien.Roy@Sun.COM 	exists = (zone_find_dl(zone, linkid) != NULL);
66303448Sdh155122 	mutex_exit(&zone->zone_lock);
663110616SSebastien.Roy@Sun.COM 	return (exists);
66323448Sdh155122 }
66333448Sdh155122 
66343448Sdh155122 /*
663510616SSebastien.Roy@Sun.COM  * Add an data link name for the zone.
66363448Sdh155122  */
66373448Sdh155122 static int
zone_add_datalink(zoneid_t zoneid,datalink_id_t linkid)663810616SSebastien.Roy@Sun.COM zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
66393448Sdh155122 {
664010616SSebastien.Roy@Sun.COM 	zone_dl_t *zdl;
66413448Sdh155122 	zone_t *zone;
66423448Sdh155122 	zone_t *thiszone;
664310616SSebastien.Roy@Sun.COM 
664410616SSebastien.Roy@Sun.COM 	if ((thiszone = zone_find_by_id(zoneid)) == NULL)
66453448Sdh155122 		return (set_errno(ENXIO));
664610616SSebastien.Roy@Sun.COM 
664710616SSebastien.Roy@Sun.COM 	/* Verify that the datalink ID doesn't already belong to a zone. */
66483448Sdh155122 	mutex_enter(&zonehash_lock);
66493448Sdh155122 	for (zone = list_head(&zone_active); zone != NULL;
66503448Sdh155122 	    zone = list_next(&zone_active, zone)) {
665110616SSebastien.Roy@Sun.COM 		if (zone_dl_exists(zone, linkid)) {
66523448Sdh155122 			mutex_exit(&zonehash_lock);
66533448Sdh155122 			zone_rele(thiszone);
665410616SSebastien.Roy@Sun.COM 			return (set_errno((zone == thiszone) ? EEXIST : EPERM));
66553448Sdh155122 		}
66563448Sdh155122 	}
665710616SSebastien.Roy@Sun.COM 
665810616SSebastien.Roy@Sun.COM 	zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
665910616SSebastien.Roy@Sun.COM 	zdl->zdl_id = linkid;
666012748SSowmini.Varadhan@oracle.COM 	zdl->zdl_net = NULL;
66613448Sdh155122 	mutex_enter(&thiszone->zone_lock);
666210616SSebastien.Roy@Sun.COM 	list_insert_head(&thiszone->zone_dl_list, zdl);
66633448Sdh155122 	mutex_exit(&thiszone->zone_lock);
66643448Sdh155122 	mutex_exit(&zonehash_lock);
66653448Sdh155122 	zone_rele(thiszone);
66663448Sdh155122 	return (0);
66673448Sdh155122 }
66683448Sdh155122 
66693448Sdh155122 static int
zone_remove_datalink(zoneid_t zoneid,datalink_id_t linkid)667010616SSebastien.Roy@Sun.COM zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
66713448Sdh155122 {
667210616SSebastien.Roy@Sun.COM 	zone_dl_t *zdl;
66733448Sdh155122 	zone_t *zone;
667410616SSebastien.Roy@Sun.COM 	int err = 0;
667510616SSebastien.Roy@Sun.COM 
667610616SSebastien.Roy@Sun.COM 	if ((zone = zone_find_by_id(zoneid)) == NULL)
66773448Sdh155122 		return (set_errno(EINVAL));
66783448Sdh155122 
66793448Sdh155122 	mutex_enter(&zone->zone_lock);
668010616SSebastien.Roy@Sun.COM 	if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
668110616SSebastien.Roy@Sun.COM 		err = ENXIO;
668210616SSebastien.Roy@Sun.COM 	} else {
668310616SSebastien.Roy@Sun.COM 		list_remove(&zone->zone_dl_list, zdl);
668412748SSowmini.Varadhan@oracle.COM 		if (zdl->zdl_net != NULL)
668512748SSowmini.Varadhan@oracle.COM 			nvlist_free(zdl->zdl_net);
668610616SSebastien.Roy@Sun.COM 		kmem_free(zdl, sizeof (zone_dl_t));
66873448Sdh155122 	}
66883448Sdh155122 	mutex_exit(&zone->zone_lock);
66893448Sdh155122 	zone_rele(zone);
669010616SSebastien.Roy@Sun.COM 	return (err == 0 ? 0 : set_errno(err));
66913448Sdh155122 }
66923448Sdh155122 
66933448Sdh155122 /*
669410616SSebastien.Roy@Sun.COM  * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
669510616SSebastien.Roy@Sun.COM  * the linkid.  Otherwise we just check if the specified zoneidp has been
669610616SSebastien.Roy@Sun.COM  * assigned the supplied linkid.
66973448Sdh155122  */
669810616SSebastien.Roy@Sun.COM int
zone_check_datalink(zoneid_t * zoneidp,datalink_id_t linkid)669910616SSebastien.Roy@Sun.COM zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
67003448Sdh155122 {
67013448Sdh155122 	zone_t *zone;
670210616SSebastien.Roy@Sun.COM 	int err = ENXIO;
670310616SSebastien.Roy@Sun.COM 
670410616SSebastien.Roy@Sun.COM 	if (*zoneidp != ALL_ZONES) {
670510616SSebastien.Roy@Sun.COM 		if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
670610616SSebastien.Roy@Sun.COM 			if (zone_dl_exists(zone, linkid))
670710616SSebastien.Roy@Sun.COM 				err = 0;
670810616SSebastien.Roy@Sun.COM 			zone_rele(zone);
670910616SSebastien.Roy@Sun.COM 		}
671010616SSebastien.Roy@Sun.COM 		return (err);
671110616SSebastien.Roy@Sun.COM 	}
671210616SSebastien.Roy@Sun.COM 
67133448Sdh155122 	mutex_enter(&zonehash_lock);
67143448Sdh155122 	for (zone = list_head(&zone_active); zone != NULL;
67153448Sdh155122 	    zone = list_next(&zone_active, zone)) {
671610616SSebastien.Roy@Sun.COM 		if (zone_dl_exists(zone, linkid)) {
671710616SSebastien.Roy@Sun.COM 			*zoneidp = zone->zone_id;
671810616SSebastien.Roy@Sun.COM 			err = 0;
671910616SSebastien.Roy@Sun.COM 			break;
67203448Sdh155122 		}
67213448Sdh155122 	}
67223448Sdh155122 	mutex_exit(&zonehash_lock);
672310616SSebastien.Roy@Sun.COM 	return (err);
67243448Sdh155122 }
67253448Sdh155122 
67263448Sdh155122 /*
672710616SSebastien.Roy@Sun.COM  * Get the list of datalink IDs assigned to a zone.
672810616SSebastien.Roy@Sun.COM  *
672910616SSebastien.Roy@Sun.COM  * On input, *nump is the number of datalink IDs that can fit in the supplied
673010616SSebastien.Roy@Sun.COM  * idarray.  Upon return, *nump is either set to the number of datalink IDs
673110616SSebastien.Roy@Sun.COM  * that were placed in the array if the array was large enough, or to the
673210616SSebastien.Roy@Sun.COM  * number of datalink IDs that the function needs to place in the array if the
673310616SSebastien.Roy@Sun.COM  * array is too small.
67343448Sdh155122  */
67353448Sdh155122 static int
zone_list_datalink(zoneid_t zoneid,int * nump,datalink_id_t * idarray)673610616SSebastien.Roy@Sun.COM zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
67373448Sdh155122 {
673810616SSebastien.Roy@Sun.COM 	uint_t num, dlcount;
67393448Sdh155122 	zone_t *zone;
674010616SSebastien.Roy@Sun.COM 	zone_dl_t *zdl;
674110616SSebastien.Roy@Sun.COM 	datalink_id_t *idptr = idarray;
67423448Sdh155122 
67433448Sdh155122 	if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
67443448Sdh155122 		return (set_errno(EFAULT));
674510616SSebastien.Roy@Sun.COM 	if ((zone = zone_find_by_id(zoneid)) == NULL)
67463448Sdh155122 		return (set_errno(ENXIO));
67473448Sdh155122 
67483448Sdh155122 	num = 0;
67493448Sdh155122 	mutex_enter(&zone->zone_lock);
675010616SSebastien.Roy@Sun.COM 	for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
675110616SSebastien.Roy@Sun.COM 	    zdl = list_next(&zone->zone_dl_list, zdl)) {
67523448Sdh155122 		/*
675310616SSebastien.Roy@Sun.COM 		 * If the list is bigger than what the caller supplied, just
675410616SSebastien.Roy@Sun.COM 		 * count, don't do copyout.
67553448Sdh155122 		 */
67563448Sdh155122 		if (++num > dlcount)
67573448Sdh155122 			continue;
675810616SSebastien.Roy@Sun.COM 		if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
67593448Sdh155122 			mutex_exit(&zone->zone_lock);
67603448Sdh155122 			zone_rele(zone);
67613448Sdh155122 			return (set_errno(EFAULT));
67623448Sdh155122 		}
676310616SSebastien.Roy@Sun.COM 		idptr++;
67643448Sdh155122 	}
67653448Sdh155122 	mutex_exit(&zone->zone_lock);
67663448Sdh155122 	zone_rele(zone);
67673448Sdh155122 
67683448Sdh155122 	/* Increased or decreased, caller should be notified. */
67693448Sdh155122 	if (num != dlcount) {
677010616SSebastien.Roy@Sun.COM 		if (copyout(&num, nump, sizeof (num)) != 0)
67713448Sdh155122 			return (set_errno(EFAULT));
67723448Sdh155122 	}
67733448Sdh155122 	return (0);
67743448Sdh155122 }
67753448Sdh155122 
67763448Sdh155122 /*
67773448Sdh155122  * Public interface for looking up a zone by zoneid. It's a customized version
67785880Snordmark  * for netstack_zone_create(). It can only be called from the zsd create
67795880Snordmark  * callbacks, since it doesn't have reference on the zone structure hence if
67805880Snordmark  * it is called elsewhere the zone could disappear after the zonehash_lock
67815880Snordmark  * is dropped.
67825880Snordmark  *
67835880Snordmark  * Furthermore it
67845880Snordmark  * 1. Doesn't check the status of the zone.
67855880Snordmark  * 2. It will be called even before zone_init is called, in that case the
67863448Sdh155122  *    address of zone0 is returned directly, and netstack_zone_create()
67873448Sdh155122  *    will only assign a value to zone0.zone_netstack, won't break anything.
67885880Snordmark  * 3. Returns without the zone being held.
67893448Sdh155122  */
67903448Sdh155122 zone_t *
zone_find_by_id_nolock(zoneid_t zoneid)67913448Sdh155122 zone_find_by_id_nolock(zoneid_t zoneid)
67923448Sdh155122 {
67935880Snordmark 	zone_t *zone;
67945880Snordmark 
67955880Snordmark 	mutex_enter(&zonehash_lock);
67963448Sdh155122 	if (zonehashbyid == NULL)
67975880Snordmark 		zone = &zone0;
67983448Sdh155122 	else
67995880Snordmark 		zone = zone_find_all_by_id(zoneid);
68005880Snordmark 	mutex_exit(&zonehash_lock);
68015880Snordmark 	return (zone);
68023448Sdh155122 }
68035895Syz147064 
68045895Syz147064 /*
68055895Syz147064  * Walk the datalinks for a given zone
68065895Syz147064  */
68075895Syz147064 int
zone_datalink_walk(zoneid_t zoneid,int (* cb)(datalink_id_t,void *),void * data)680810616SSebastien.Roy@Sun.COM zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
680910616SSebastien.Roy@Sun.COM     void *data)
68105895Syz147064 {
681110616SSebastien.Roy@Sun.COM 	zone_t		*zone;
681210616SSebastien.Roy@Sun.COM 	zone_dl_t	*zdl;
681310616SSebastien.Roy@Sun.COM 	datalink_id_t	*idarray;
681410616SSebastien.Roy@Sun.COM 	uint_t		idcount = 0;
681510616SSebastien.Roy@Sun.COM 	int		i, ret = 0;
68165895Syz147064 
68175895Syz147064 	if ((zone = zone_find_by_id(zoneid)) == NULL)
68185895Syz147064 		return (ENOENT);
68195895Syz147064 
682010616SSebastien.Roy@Sun.COM 	/*
682110616SSebastien.Roy@Sun.COM 	 * We first build an array of linkid's so that we can walk these and
682210616SSebastien.Roy@Sun.COM 	 * execute the callback with the zone_lock dropped.
682310616SSebastien.Roy@Sun.COM 	 */
68245895Syz147064 	mutex_enter(&zone->zone_lock);
682510616SSebastien.Roy@Sun.COM 	for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
682610616SSebastien.Roy@Sun.COM 	    zdl = list_next(&zone->zone_dl_list, zdl)) {
682710616SSebastien.Roy@Sun.COM 		idcount++;
682810616SSebastien.Roy@Sun.COM 	}
682910616SSebastien.Roy@Sun.COM 
683010616SSebastien.Roy@Sun.COM 	if (idcount == 0) {
683110616SSebastien.Roy@Sun.COM 		mutex_exit(&zone->zone_lock);
683210616SSebastien.Roy@Sun.COM 		zone_rele(zone);
683310616SSebastien.Roy@Sun.COM 		return (0);
683410616SSebastien.Roy@Sun.COM 	}
683510616SSebastien.Roy@Sun.COM 
683610616SSebastien.Roy@Sun.COM 	idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
683710616SSebastien.Roy@Sun.COM 	if (idarray == NULL) {
683810616SSebastien.Roy@Sun.COM 		mutex_exit(&zone->zone_lock);
683910616SSebastien.Roy@Sun.COM 		zone_rele(zone);
684010616SSebastien.Roy@Sun.COM 		return (ENOMEM);
684110616SSebastien.Roy@Sun.COM 	}
684210616SSebastien.Roy@Sun.COM 
684310616SSebastien.Roy@Sun.COM 	for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
684410616SSebastien.Roy@Sun.COM 	    i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
684510616SSebastien.Roy@Sun.COM 		idarray[i] = zdl->zdl_id;
684610616SSebastien.Roy@Sun.COM 	}
684710616SSebastien.Roy@Sun.COM 
684810616SSebastien.Roy@Sun.COM 	mutex_exit(&zone->zone_lock);
684910616SSebastien.Roy@Sun.COM 
685010616SSebastien.Roy@Sun.COM 	for (i = 0; i < idcount && ret == 0; i++) {
685110616SSebastien.Roy@Sun.COM 		if ((ret = (*cb)(idarray[i], data)) != 0)
68525895Syz147064 			break;
68535895Syz147064 	}
685410616SSebastien.Roy@Sun.COM 
68555895Syz147064 	zone_rele(zone);
685610616SSebastien.Roy@Sun.COM 	kmem_free(idarray, sizeof (datalink_id_t) * idcount);
68575895Syz147064 	return (ret);
68585895Syz147064 }
685912748SSowmini.Varadhan@oracle.COM 
686012748SSowmini.Varadhan@oracle.COM static char *
zone_net_type2name(int type)686112748SSowmini.Varadhan@oracle.COM zone_net_type2name(int type)
686212748SSowmini.Varadhan@oracle.COM {
686312748SSowmini.Varadhan@oracle.COM 	switch (type) {
686412748SSowmini.Varadhan@oracle.COM 	case ZONE_NETWORK_ADDRESS:
686512748SSowmini.Varadhan@oracle.COM 		return (ZONE_NET_ADDRNAME);
686612748SSowmini.Varadhan@oracle.COM 	case ZONE_NETWORK_DEFROUTER:
686712748SSowmini.Varadhan@oracle.COM 		return (ZONE_NET_RTRNAME);
686812748SSowmini.Varadhan@oracle.COM 	default:
686912748SSowmini.Varadhan@oracle.COM 		return (NULL);
687012748SSowmini.Varadhan@oracle.COM 	}
687112748SSowmini.Varadhan@oracle.COM }
687212748SSowmini.Varadhan@oracle.COM 
687312748SSowmini.Varadhan@oracle.COM static int
zone_set_network(zoneid_t zoneid,zone_net_data_t * znbuf)687412748SSowmini.Varadhan@oracle.COM zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
687512748SSowmini.Varadhan@oracle.COM {
687612748SSowmini.Varadhan@oracle.COM 	zone_t *zone;
687712748SSowmini.Varadhan@oracle.COM 	zone_dl_t *zdl;
687812748SSowmini.Varadhan@oracle.COM 	nvlist_t *nvl;
687912748SSowmini.Varadhan@oracle.COM 	int err = 0;
688012748SSowmini.Varadhan@oracle.COM 	uint8_t *new = NULL;
688112748SSowmini.Varadhan@oracle.COM 	char *nvname;
688212748SSowmini.Varadhan@oracle.COM 	int bufsize;
688312748SSowmini.Varadhan@oracle.COM 	datalink_id_t linkid = znbuf->zn_linkid;
688412748SSowmini.Varadhan@oracle.COM 
688512748SSowmini.Varadhan@oracle.COM 	if (secpolicy_zone_config(CRED()) != 0)
688612748SSowmini.Varadhan@oracle.COM 		return (set_errno(EPERM));
688712748SSowmini.Varadhan@oracle.COM 
688812748SSowmini.Varadhan@oracle.COM 	if (zoneid == GLOBAL_ZONEID)
688912748SSowmini.Varadhan@oracle.COM 		return (set_errno(EINVAL));
689012748SSowmini.Varadhan@oracle.COM 
689112748SSowmini.Varadhan@oracle.COM 	nvname = zone_net_type2name(znbuf->zn_type);
689212748SSowmini.Varadhan@oracle.COM 	bufsize = znbuf->zn_len;
689312748SSowmini.Varadhan@oracle.COM 	new = znbuf->zn_val;
689412748SSowmini.Varadhan@oracle.COM 	if (nvname == NULL)
689512748SSowmini.Varadhan@oracle.COM 		return (set_errno(EINVAL));
689612748SSowmini.Varadhan@oracle.COM 
689712748SSowmini.Varadhan@oracle.COM 	if ((zone = zone_find_by_id(zoneid)) == NULL) {
689812748SSowmini.Varadhan@oracle.COM 		return (set_errno(EINVAL));
689912748SSowmini.Varadhan@oracle.COM 	}
690012748SSowmini.Varadhan@oracle.COM 
690112748SSowmini.Varadhan@oracle.COM 	mutex_enter(&zone->zone_lock);
690212748SSowmini.Varadhan@oracle.COM 	if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
690312748SSowmini.Varadhan@oracle.COM 		err = ENXIO;
690412748SSowmini.Varadhan@oracle.COM 		goto done;
690512748SSowmini.Varadhan@oracle.COM 	}
690612748SSowmini.Varadhan@oracle.COM 	if ((nvl = zdl->zdl_net) == NULL) {
690712748SSowmini.Varadhan@oracle.COM 		if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
690812748SSowmini.Varadhan@oracle.COM 			err = ENOMEM;
690912748SSowmini.Varadhan@oracle.COM 			goto done;
691012748SSowmini.Varadhan@oracle.COM 		} else {
691112748SSowmini.Varadhan@oracle.COM 			zdl->zdl_net = nvl;
691212748SSowmini.Varadhan@oracle.COM 		}
691312748SSowmini.Varadhan@oracle.COM 	}
691412748SSowmini.Varadhan@oracle.COM 	if (nvlist_exists(nvl, nvname)) {
691512748SSowmini.Varadhan@oracle.COM 		err = EINVAL;
691612748SSowmini.Varadhan@oracle.COM 		goto done;
691712748SSowmini.Varadhan@oracle.COM 	}
691812748SSowmini.Varadhan@oracle.COM 	err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
691912748SSowmini.Varadhan@oracle.COM 	ASSERT(err == 0);
692012748SSowmini.Varadhan@oracle.COM done:
692112748SSowmini.Varadhan@oracle.COM 	mutex_exit(&zone->zone_lock);
692212748SSowmini.Varadhan@oracle.COM 	zone_rele(zone);
692312748SSowmini.Varadhan@oracle.COM 	if (err != 0)
692412748SSowmini.Varadhan@oracle.COM 		return (set_errno(err));
692512748SSowmini.Varadhan@oracle.COM 	else
692612748SSowmini.Varadhan@oracle.COM 		return (0);
692712748SSowmini.Varadhan@oracle.COM }
692812748SSowmini.Varadhan@oracle.COM 
692912748SSowmini.Varadhan@oracle.COM static int
zone_get_network(zoneid_t zoneid,zone_net_data_t * znbuf)693012748SSowmini.Varadhan@oracle.COM zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
693112748SSowmini.Varadhan@oracle.COM {
693212748SSowmini.Varadhan@oracle.COM 	zone_t *zone;
693312748SSowmini.Varadhan@oracle.COM 	zone_dl_t *zdl;
693412748SSowmini.Varadhan@oracle.COM 	nvlist_t *nvl;
693512748SSowmini.Varadhan@oracle.COM 	uint8_t *ptr;
693612748SSowmini.Varadhan@oracle.COM 	uint_t psize;
693712748SSowmini.Varadhan@oracle.COM 	int err = 0;
693812748SSowmini.Varadhan@oracle.COM 	char *nvname;
693912748SSowmini.Varadhan@oracle.COM 	int bufsize;
694012748SSowmini.Varadhan@oracle.COM 	void *buf;
694112748SSowmini.Varadhan@oracle.COM 	datalink_id_t linkid = znbuf->zn_linkid;
694212748SSowmini.Varadhan@oracle.COM 
694312748SSowmini.Varadhan@oracle.COM 	if (zoneid == GLOBAL_ZONEID)
694412748SSowmini.Varadhan@oracle.COM 		return (set_errno(EINVAL));
694512748SSowmini.Varadhan@oracle.COM 
694612748SSowmini.Varadhan@oracle.COM 	nvname = zone_net_type2name(znbuf->zn_type);
694712748SSowmini.Varadhan@oracle.COM 	bufsize = znbuf->zn_len;
694812748SSowmini.Varadhan@oracle.COM 	buf = znbuf->zn_val;
694912748SSowmini.Varadhan@oracle.COM 
695012748SSowmini.Varadhan@oracle.COM 	if (nvname == NULL)
695112748SSowmini.Varadhan@oracle.COM 		return (set_errno(EINVAL));
695212748SSowmini.Varadhan@oracle.COM 	if ((zone = zone_find_by_id(zoneid)) == NULL)
695312748SSowmini.Varadhan@oracle.COM 		return (set_errno(EINVAL));
695412748SSowmini.Varadhan@oracle.COM 
695512748SSowmini.Varadhan@oracle.COM 	mutex_enter(&zone->zone_lock);
695612748SSowmini.Varadhan@oracle.COM 	if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
695712748SSowmini.Varadhan@oracle.COM 		err = ENXIO;
695812748SSowmini.Varadhan@oracle.COM 		goto done;
695912748SSowmini.Varadhan@oracle.COM 	}
696012748SSowmini.Varadhan@oracle.COM 	if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
696112748SSowmini.Varadhan@oracle.COM 		err = ENOENT;
696212748SSowmini.Varadhan@oracle.COM 		goto done;
696312748SSowmini.Varadhan@oracle.COM 	}
696412748SSowmini.Varadhan@oracle.COM 	err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
696512748SSowmini.Varadhan@oracle.COM 	ASSERT(err == 0);
696612748SSowmini.Varadhan@oracle.COM 
696712748SSowmini.Varadhan@oracle.COM 	if (psize > bufsize) {
696812748SSowmini.Varadhan@oracle.COM 		err = ENOBUFS;
696912748SSowmini.Varadhan@oracle.COM 		goto done;
697012748SSowmini.Varadhan@oracle.COM 	}
697112748SSowmini.Varadhan@oracle.COM 	znbuf->zn_len = psize;
697212748SSowmini.Varadhan@oracle.COM 	bcopy(ptr, buf, psize);
697312748SSowmini.Varadhan@oracle.COM done:
697412748SSowmini.Varadhan@oracle.COM 	mutex_exit(&zone->zone_lock);
697512748SSowmini.Varadhan@oracle.COM 	zone_rele(zone);
697612748SSowmini.Varadhan@oracle.COM 	if (err != 0)
697712748SSowmini.Varadhan@oracle.COM 		return (set_errno(err));
697812748SSowmini.Varadhan@oracle.COM 	else
697912748SSowmini.Varadhan@oracle.COM 		return (0);
698012748SSowmini.Varadhan@oracle.COM }
6981