xref: /onnv-gate/usr/src/uts/i86xpv/os/xen_machdep.c (revision 11120:fe619717975a)
15084Sjohnlev /*
25084Sjohnlev  * CDDL HEADER START
35084Sjohnlev  *
45084Sjohnlev  * The contents of this file are subject to the terms of the
55084Sjohnlev  * Common Development and Distribution License (the "License").
65084Sjohnlev  * You may not use this file except in compliance with the License.
75084Sjohnlev  *
85084Sjohnlev  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
95084Sjohnlev  * or http://www.opensolaris.org/os/licensing.
105084Sjohnlev  * See the License for the specific language governing permissions
115084Sjohnlev  * and limitations under the License.
125084Sjohnlev  *
135084Sjohnlev  * When distributing Covered Code, include this CDDL HEADER in each
145084Sjohnlev  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
155084Sjohnlev  * If applicable, add the following below this CDDL HEADER, with the
165084Sjohnlev  * fields enclosed by brackets "[]" replaced with your own identifying
175084Sjohnlev  * information: Portions Copyright [yyyy] [name of copyright owner]
185084Sjohnlev  *
195084Sjohnlev  * CDDL HEADER END
205084Sjohnlev  */
215084Sjohnlev 
225084Sjohnlev /*
2310175SStuart.Maybee@Sun.COM  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
245084Sjohnlev  * Use is subject to license terms.
255084Sjohnlev  */
265084Sjohnlev 
275084Sjohnlev /* derived from netbsd's xen_machdep.c 1.1.2.1 */
285084Sjohnlev 
295084Sjohnlev /*
305084Sjohnlev  *
315084Sjohnlev  * Copyright (c) 2004 Christian Limpach.
325084Sjohnlev  * All rights reserved.
335084Sjohnlev  *
345084Sjohnlev  * Redistribution and use in source and binary forms, with or without
355084Sjohnlev  * modification, are permitted provided that the following conditions
365084Sjohnlev  * are met:
375084Sjohnlev  * 1. Redistributions of source code must retain the above copyright
385084Sjohnlev  *    notice, this list of conditions and the following disclaimer.
395084Sjohnlev  * 2. Redistributions in binary form must reproduce the above copyright
405084Sjohnlev  *    notice, this list of conditions and the following disclaimer in the
415084Sjohnlev  *    documentation and/or other materials provided with the distribution.
425084Sjohnlev  * 3. This section intentionally left blank.
435084Sjohnlev  * 4. The name of the author may not be used to endorse or promote products
445084Sjohnlev  *    derived from this software without specific prior written permission.
455084Sjohnlev  *
465084Sjohnlev  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
475084Sjohnlev  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
485084Sjohnlev  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
495084Sjohnlev  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
505084Sjohnlev  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
515084Sjohnlev  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
525084Sjohnlev  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
535084Sjohnlev  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
545084Sjohnlev  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
555084Sjohnlev  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
565084Sjohnlev  */
575084Sjohnlev /*
585084Sjohnlev  * Section 3 of the above license was updated in response to bug 6379571.
595084Sjohnlev  */
605084Sjohnlev 
6110175SStuart.Maybee@Sun.COM #include <sys/xpv_user.h>
6210175SStuart.Maybee@Sun.COM 
6310175SStuart.Maybee@Sun.COM /* XXX 3.3. TODO remove this include */
6410175SStuart.Maybee@Sun.COM #include <xen/public/arch-x86/xen-mca.h>
6510175SStuart.Maybee@Sun.COM 
665199Sgarypen #include <sys/ctype.h>
675084Sjohnlev #include <sys/types.h>
685084Sjohnlev #include <sys/cmn_err.h>
695084Sjohnlev #include <sys/trap.h>
705084Sjohnlev #include <sys/segments.h>
715084Sjohnlev #include <sys/hypervisor.h>
725084Sjohnlev #include <sys/xen_mmu.h>
735084Sjohnlev #include <sys/machsystm.h>
745084Sjohnlev #include <sys/promif.h>
755084Sjohnlev #include <sys/bootconf.h>
765084Sjohnlev #include <sys/bootinfo.h>
775084Sjohnlev #include <sys/cpr.h>
785084Sjohnlev #include <sys/taskq.h>
795084Sjohnlev #include <sys/uadmin.h>
805084Sjohnlev #include <sys/evtchn_impl.h>
815084Sjohnlev #include <sys/archsystm.h>
825084Sjohnlev #include <xen/sys/xenbus_impl.h>
835084Sjohnlev #include <sys/mach_mmu.h>
845084Sjohnlev #include <vm/hat_i86.h>
855084Sjohnlev #include <sys/gnttab.h>
865084Sjohnlev #include <sys/reboot.h>
875084Sjohnlev #include <sys/stack.h>
885084Sjohnlev #include <sys/clock.h>
895084Sjohnlev #include <sys/bitmap.h>
905084Sjohnlev #include <sys/processor.h>
915084Sjohnlev #include <sys/xen_errno.h>
925084Sjohnlev #include <sys/xpv_panic.h>
935084Sjohnlev #include <sys/smp_impldefs.h>
945084Sjohnlev #include <sys/cpu.h>
955084Sjohnlev #include <sys/balloon_impl.h>
965084Sjohnlev #include <sys/ddi.h>
975084Sjohnlev 
/*
 * Debug tracing for the suspend/resume path.  Output is produced only on
 * DEBUG kernels, and then only when the global xen_suspend_debug flag is
 * set.  The do/while (0) wrapper makes the macro a single statement, so
 * it is safe in unbraced if/else constructs (the original bare-`if' form
 * had a dangling-else hazard).
 */
#ifdef DEBUG
#define	SUSPEND_DEBUG(...)					\
	do {							\
		if (xen_suspend_debug)				\
			xen_printf(__VA_ARGS__);		\
	} while (0)
#else
#define	SUSPEND_DEBUG(...)
#endif
1035084Sjohnlev 
/* Verbosity flag for the CPR (suspend/resume) messages. */
int cpr_debug;

/*
 * CPUs that were powered off going into a suspend; they are recorded in
 * xen_suspend_domain() and skipped by suspend_cpus()/resume_cpus().
 */
cpuset_t cpu_suspend_lost_set;

/*
 * When set, enables SUSPEND_DEBUG output and makes
 * xen_hypervisor_supports_solaris() succeed unconditionally (testing aid).
 */
static int xen_suspend_debug;

/*
 * Physical CPU data from the hypervisor's MCA interface (xen-mca.h);
 * presumably populated/consumed by code outside this chunk — not visible
 * here.
 */
uint_t xen_phys_ncpus;
xen_mc_logical_cpu_t *xen_phys_cpus;
int xen_physinfo_debug = 0;
1117532SSean.Ye@Sun.COM 
/*
 * Determine helpful version information.
 *
 * (And leave copies in the data segment so we can look at them later
 * with e.g. kmdb.)
 */

/* Index into the xenver[] array below. */
typedef enum xen_version {
	XENVER_BOOT_IDX,	/* hypervisor version seen at boot */
	XENVER_CURRENT_IDX	/* current version (may change on migrate) */
} xen_version_t;
1235199Sgarypen 
/*
 * Cached hypervisor version data, kept in two copies: the version we
 * booted under and the current one (refreshed after a resume, since a
 * migration may land us on a different hypervisor).
 */
struct xenver {
	ulong_t xv_major;		/* major version number */
	ulong_t xv_minor;		/* minor version number */
	ulong_t xv_revision;		/* micro rev parsed out of xv_ver */
	xen_extraversion_t xv_ver;	/* raw extraversion string */
	ulong_t xv_is_xvm;		/* 1 if a "-xvm" (Solaris xVM) build */
	xen_changeset_info_t xv_chgset;	/* source changeset identifier */
	xen_compile_info_t xv_build;	/* compiler/build information */
	xen_capabilities_info_t xv_caps; /* space-separated capabilities */
} xenver[2];

/* Convenience accessors for the boot-time and current copies. */
#define	XENVER_BOOT(m)	(xenver[XENVER_BOOT_IDX].m)
#define	XENVER_CURRENT(m)	(xenver[XENVER_CURRENT_IDX].m)
1375199Sgarypen 
1385199Sgarypen /*
1395199Sgarypen  * Update the xenver data. We maintain two copies, boot and
1405199Sgarypen  * current. If we are setting the boot, then also set current.
1415199Sgarypen  */
1425199Sgarypen static void
xen_set_version(xen_version_t idx)1435199Sgarypen xen_set_version(xen_version_t idx)
1445199Sgarypen {
1455199Sgarypen 	ulong_t ver;
1465199Sgarypen 
1475199Sgarypen 	bzero(&xenver[idx], sizeof (xenver[idx]));
1485199Sgarypen 
1495199Sgarypen 	ver = HYPERVISOR_xen_version(XENVER_version, 0);
1505199Sgarypen 
1515199Sgarypen 	xenver[idx].xv_major = BITX(ver, 31, 16);
1525199Sgarypen 	xenver[idx].xv_minor = BITX(ver, 15, 0);
1535199Sgarypen 
1545199Sgarypen 	(void) HYPERVISOR_xen_version(XENVER_extraversion, &xenver[idx].xv_ver);
1555199Sgarypen 
1565199Sgarypen 	/*
1575199Sgarypen 	 * The revision is buried in the extraversion information that is
1585199Sgarypen 	 * maintained by the hypervisor. For our purposes we expect that
1595199Sgarypen 	 * the revision number is:
1605199Sgarypen 	 * 	- the second character in the extraversion information
1615199Sgarypen 	 *	- one character long
1625199Sgarypen 	 *	- numeric digit
1635199Sgarypen 	 * If it isn't then we can't extract the revision and we leave it
1645199Sgarypen 	 * set to 0.
1655199Sgarypen 	 */
1665199Sgarypen 	if (strlen(xenver[idx].xv_ver) > 1 && isdigit(xenver[idx].xv_ver[1]))
1675199Sgarypen 		xenver[idx].xv_revision = xenver[idx].xv_ver[1] - '0';
1685199Sgarypen 	else
1695199Sgarypen 		cmn_err(CE_WARN, "Cannot extract revision on this hypervisor "
1705199Sgarypen 		    "version: v%s, unexpected version format",
1715199Sgarypen 		    xenver[idx].xv_ver);
1725199Sgarypen 
1735316Sjohnlev 	xenver[idx].xv_is_xvm = 0;
1745316Sjohnlev 
175*11120SMark.Johnson@Sun.COM 	if (strstr(xenver[idx].xv_ver, "-xvm") != NULL)
1765316Sjohnlev 		xenver[idx].xv_is_xvm = 1;
1775316Sjohnlev 
1785199Sgarypen 	(void) HYPERVISOR_xen_version(XENVER_changeset,
1795199Sgarypen 	    &xenver[idx].xv_chgset);
1805199Sgarypen 
1815199Sgarypen 	(void) HYPERVISOR_xen_version(XENVER_compile_info,
1825199Sgarypen 	    &xenver[idx].xv_build);
1835199Sgarypen 	/*
1845199Sgarypen 	 * Capabilities are a set of space separated ascii strings
1855199Sgarypen 	 * e.g. 'xen-3.1-x86_32p' or 'hvm-3.2-x86_64'
1865199Sgarypen 	 */
1875199Sgarypen 	(void) HYPERVISOR_xen_version(XENVER_capabilities,
1885199Sgarypen 	    &xenver[idx].xv_caps);
1895199Sgarypen 
1905199Sgarypen 	cmn_err(CE_CONT, "?v%lu.%lu%s chgset '%s'\n", xenver[idx].xv_major,
1915199Sgarypen 	    xenver[idx].xv_minor, xenver[idx].xv_ver, xenver[idx].xv_chgset);
1925199Sgarypen 
1935199Sgarypen 	if (idx == XENVER_BOOT_IDX)
1945199Sgarypen 		bcopy(&xenver[XENVER_BOOT_IDX], &xenver[XENVER_CURRENT_IDX],
1955199Sgarypen 		    sizeof (xenver[XENVER_BOOT_IDX]));
1965199Sgarypen }
1975199Sgarypen 
/* Which hypervisor capability is being interrogated. */
typedef enum xen_hypervisor_check {
	XEN_RUN_CHECK,		/* can Solaris run at all? */
	XEN_SUSPEND_CHECK	/* can Solaris suspend/resume (migrate)? */
} xen_hypervisor_check_t;
2025199Sgarypen 
2035199Sgarypen /*
2045199Sgarypen  * To run the hypervisor must be 3.0.4 or better. To suspend/resume
2055199Sgarypen  * we need 3.0.4 or better and if it is 3.0.4. then it must be provided
2065199Sgarypen  * by the Solaris xVM project.
2075199Sgarypen  * Checking can be disabled for testing purposes by setting the
2085199Sgarypen  * xen_suspend_debug variable.
2095199Sgarypen  */
2105199Sgarypen static int
xen_hypervisor_supports_solaris(xen_hypervisor_check_t check)2115199Sgarypen xen_hypervisor_supports_solaris(xen_hypervisor_check_t check)
2125199Sgarypen {
2135199Sgarypen 	if (xen_suspend_debug == 1)
2145199Sgarypen 		return (1);
2155199Sgarypen 	if (XENVER_CURRENT(xv_major) < 3)
2165199Sgarypen 		return (0);
2175199Sgarypen 	if (XENVER_CURRENT(xv_major) > 3)
2185199Sgarypen 		return (1);
2195199Sgarypen 	if (XENVER_CURRENT(xv_minor) > 0)
2205199Sgarypen 		return (1);
2215199Sgarypen 	if (XENVER_CURRENT(xv_revision) < 4)
2225199Sgarypen 		return (0);
2235316Sjohnlev 	if (check == XEN_SUSPEND_CHECK && XENVER_CURRENT(xv_revision) == 4 &&
2245316Sjohnlev 	    !XENVER_CURRENT(xv_is_xvm))
2255316Sjohnlev 		return (0);
2265316Sjohnlev 
2275199Sgarypen 	return (1);
2285199Sgarypen }
2295199Sgarypen 
2305316Sjohnlev /*
2315316Sjohnlev  * If the hypervisor is -xvm, or 3.1.2 or higher, we don't need the
2325316Sjohnlev  * workaround.
2335316Sjohnlev  */
2345316Sjohnlev static void
xen_pte_workaround(void)2355316Sjohnlev xen_pte_workaround(void)
2365316Sjohnlev {
2375316Sjohnlev #if defined(__amd64)
2385316Sjohnlev 	extern int pt_kern;
2395316Sjohnlev 
2405316Sjohnlev 	if (XENVER_CURRENT(xv_major) != 3)
2415316Sjohnlev 		return;
2425316Sjohnlev 	if (XENVER_CURRENT(xv_minor) > 1)
2435316Sjohnlev 		return;
2445316Sjohnlev 	if (XENVER_CURRENT(xv_minor) == 1 &&
2455316Sjohnlev 	    XENVER_CURRENT(xv_revision) > 1)
2465316Sjohnlev 		return;
2475316Sjohnlev 	if (XENVER_CURRENT(xv_is_xvm))
2485316Sjohnlev 		return;
2495316Sjohnlev 
2505316Sjohnlev 	pt_kern = PT_USER;
2515316Sjohnlev #endif
2525316Sjohnlev }
2535316Sjohnlev 
2545084Sjohnlev void
xen_set_callback(void (* func)(void),uint_t type,uint_t flags)2555084Sjohnlev xen_set_callback(void (*func)(void), uint_t type, uint_t flags)
2565084Sjohnlev {
2575084Sjohnlev 	struct callback_register cb;
2585084Sjohnlev 
2595084Sjohnlev 	bzero(&cb, sizeof (cb));
2605084Sjohnlev #if defined(__amd64)
2615084Sjohnlev 	cb.address = (ulong_t)func;
2625084Sjohnlev #elif defined(__i386)
2635084Sjohnlev 	cb.address.cs = KCS_SEL;
2645084Sjohnlev 	cb.address.eip = (ulong_t)func;
2655084Sjohnlev #endif
2665084Sjohnlev 	cb.type = type;
2675084Sjohnlev 	cb.flags = flags;
2685084Sjohnlev 
2695084Sjohnlev 	/*
2705084Sjohnlev 	 * XXPV always ignore return value for NMI
2715084Sjohnlev 	 */
2725084Sjohnlev 	if (HYPERVISOR_callback_op(CALLBACKOP_register, &cb) != 0 &&
2735084Sjohnlev 	    type != CALLBACKTYPE_nmi)
2745084Sjohnlev 		panic("HYPERVISOR_callback_op failed");
2755084Sjohnlev }
2765084Sjohnlev 
2775084Sjohnlev void
xen_init_callbacks(void)2785084Sjohnlev xen_init_callbacks(void)
2795084Sjohnlev {
2805084Sjohnlev 	/*
2815084Sjohnlev 	 * register event (interrupt) handler.
2825084Sjohnlev 	 */
2835084Sjohnlev 	xen_set_callback(xen_callback, CALLBACKTYPE_event, 0);
2845084Sjohnlev 
2855084Sjohnlev 	/*
2865084Sjohnlev 	 * failsafe handler.
2875084Sjohnlev 	 */
2885084Sjohnlev 	xen_set_callback(xen_failsafe_callback, CALLBACKTYPE_failsafe,
2895084Sjohnlev 	    CALLBACKF_mask_events);
2905084Sjohnlev 
2915084Sjohnlev 	/*
2925084Sjohnlev 	 * NMI handler.
2935084Sjohnlev 	 */
2945084Sjohnlev 	xen_set_callback(nmiint, CALLBACKTYPE_nmi, 0);
2955084Sjohnlev 
2965084Sjohnlev 	/*
2975084Sjohnlev 	 * system call handler
2985084Sjohnlev 	 * XXPV move to init_cpu_syscall?
2995084Sjohnlev 	 */
3005084Sjohnlev #if defined(__amd64)
3015084Sjohnlev 	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
3025084Sjohnlev 	    CALLBACKF_mask_events);
3035084Sjohnlev #endif	/* __amd64 */
3045084Sjohnlev }
3055084Sjohnlev 
3065084Sjohnlev 
3075084Sjohnlev /*
3085084Sjohnlev  * cmn_err() followed by a 1/4 second delay; this gives the
3095084Sjohnlev  * logging service a chance to flush messages and helps avoid
3105084Sjohnlev  * intermixing output from prom_printf().
3115084Sjohnlev  * XXPV: doesn't exactly help us on UP though.
3125084Sjohnlev  */
3135084Sjohnlev /*PRINTFLIKE2*/
3145084Sjohnlev void
cpr_err(int ce,const char * fmt,...)3155084Sjohnlev cpr_err(int ce, const char *fmt, ...)
3165084Sjohnlev {
3175084Sjohnlev 	va_list adx;
3185084Sjohnlev 
3195084Sjohnlev 	va_start(adx, fmt);
3205084Sjohnlev 	vcmn_err(ce, fmt, adx);
3215084Sjohnlev 	va_end(adx);
3225084Sjohnlev 	drv_usecwait(MICROSEC >> 2);
3235084Sjohnlev }
3245084Sjohnlev 
/*
 * Suspend the entire device tree; any failure is fatal since we
 * cannot safely continue into a domain suspend with live devices.
 */
void
xen_suspend_devices(void)
{
	int err;

	SUSPEND_DEBUG("xen_suspend_devices\n");

	err = cpr_suspend_devices(ddi_root_node());
	if (err != 0)
		panic("failed to suspend devices: %d", err);
}
3355084Sjohnlev 
/*
 * Resume the entire device tree after a domain restore/migrate;
 * failure is fatal.
 */
void
xen_resume_devices(void)
{
	int err;

	SUSPEND_DEBUG("xen_resume_devices\n");

	err = cpr_resume_devices(ddi_root_node(), 0);
	if (err != 0)
		panic("failed to resume devices: %d", err);
}
3465084Sjohnlev 
/*
 * The list of mfn pages is out of date.  Recompute it.
 */
static void
rebuild_mfn_list(void)
{
	int i = 0;
	size_t sz;
	size_t off;
	pfn_t pfn;

	SUSPEND_DEBUG("rebuild_mfn_list\n");

	/* Size of the PFN->MFN table, rounded up to whole MMU pages. */
	sz = ((mfn_count * sizeof (mfn_t)) + MMU_PAGEOFFSET) & MMU_PAGEMASK;

	for (off = 0; off < sz; off += MMU_PAGESIZE) {
		size_t j = mmu_btop(off);	/* page index into the table */

		/*
		 * Each time j crosses into a new page of the
		 * mfn_list_pages array itself, record that page's new
		 * MFN in the top-level frame-list-list page.
		 */
		if (((j * sizeof (mfn_t)) & MMU_PAGEOFFSET) == 0) {
			pfn = hat_getpfnum(kas.a_hat,
			    (caddr_t)&mfn_list_pages[j]);
			mfn_list_pages_page[i++] = pfn_to_mfn(pfn);
		}

		/* Record the new MFN of this page of mfn_list. */
		pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list + off);
		mfn_list_pages[j] = pfn_to_mfn(pfn);
	}

	/* Hand the hypervisor the new location of the frame-list-list. */
	pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list_pages_page);
	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list
	    = pfn_to_mfn(pfn);
}
3785084Sjohnlev 
3795084Sjohnlev static void
suspend_cpus(void)3805084Sjohnlev suspend_cpus(void)
3815084Sjohnlev {
3825084Sjohnlev 	int i;
3835084Sjohnlev 
3845084Sjohnlev 	SUSPEND_DEBUG("suspend_cpus\n");
3855084Sjohnlev 
3865159Sjohnlev 	mp_enter_barrier();
3875084Sjohnlev 
3885084Sjohnlev 	for (i = 1; i < ncpus; i++) {
3895084Sjohnlev 		if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
3905084Sjohnlev 			SUSPEND_DEBUG("xen_vcpu_down %d\n", i);
3915084Sjohnlev 			(void) xen_vcpu_down(i);
3925084Sjohnlev 		}
3935084Sjohnlev 
3945084Sjohnlev 		mach_cpucontext_reset(cpu[i]);
3955084Sjohnlev 	}
3965084Sjohnlev }
3975084Sjohnlev 
3985084Sjohnlev static void
resume_cpus(void)3995084Sjohnlev resume_cpus(void)
4005084Sjohnlev {
4015084Sjohnlev 	int i;
4025084Sjohnlev 
4035084Sjohnlev 	for (i = 1; i < ncpus; i++) {
4045084Sjohnlev 		if (cpu[i] == NULL)
4055084Sjohnlev 			continue;
4065084Sjohnlev 
4075084Sjohnlev 		if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
4085084Sjohnlev 			SUSPEND_DEBUG("xen_vcpu_up %d\n", i);
4095084Sjohnlev 			mach_cpucontext_restore(cpu[i]);
4105084Sjohnlev 			(void) xen_vcpu_up(i);
4115084Sjohnlev 		}
4125084Sjohnlev 	}
4135084Sjohnlev 
4145159Sjohnlev 	mp_leave_barrier();
4155084Sjohnlev }
4165084Sjohnlev 
/*
 * Top level routine to direct suspend/resume of a domain.
 *
 * Runs on vcpu 0 with other CPUs quiesced; after ec_suspend() and until
 * ec_resume(), no locks may be taken.  Failures in the critical window
 * crash the domain via HYPERVISOR_shutdown(SHUTDOWN_crash), since no
 * graceful recovery is possible there.
 */
void
xen_suspend_domain(void)
{
	extern void rtcsync(void);
	extern hrtime_t hres_last_tick;
	mfn_t start_info_mfn;
	ulong_t flags;
	pfn_t pfn;
	int i;

	/*
	 * Check that we are happy to suspend on this hypervisor.
	 */
	if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0) {
		cpr_err(CE_WARN, "Cannot suspend on this hypervisor "
		    "version: v%lu.%lu%s, need at least version v3.0.4 or "
		    "-xvm based hypervisor", XENVER_CURRENT(xv_major),
		    XENVER_CURRENT(xv_minor), XENVER_CURRENT(xv_ver));
		return;
	}

	/*
	 * XXPV - Are we definitely OK to suspend by the time we've connected
	 * the handler?
	 */

	cpr_err(CE_NOTE, "Domain suspending for save/migrate");

	SUSPEND_DEBUG("xen_suspend_domain\n");

	/*
	 * suspend interrupts and devices
	 * XXPV - we use suspend/resume for both save/restore domains (like sun
	 * cpr) and for migration.  Would be nice to know the difference if
	 * possible.  For save/restore where down time may be a long time, we
	 * may want to do more of the things that cpr does.  (i.e. notify user
	 * processes, shrink memory footprint for faster restore, etc.)
	 */
	xen_suspend_devices();
	SUSPEND_DEBUG("xenbus_suspend\n");
	xenbus_suspend();

	/* HYPERVISOR_suspend() below needs the MFN of the start_info page. */
	pfn = hat_getpfnum(kas.a_hat, (caddr_t)xen_info);
	start_info_mfn = pfn_to_mfn(pfn);

	/*
	 * XXPV: cpu hotplug can hold this under a xenbus watch. Are we safe
	 * wrt xenbus being suspended here?
	 */
	mutex_enter(&cpu_lock);

	/*
	 * Suspend must be done on vcpu 0, as no context for other CPUs is
	 * saved.
	 *
	 * XXPV - add to taskq API ?
	 */
	thread_affinity_set(curthread, 0);
	kpreempt_disable();

	SUSPEND_DEBUG("xen_start_migrate\n");
	xen_start_migrate();
	if (ncpus > 1)
		suspend_cpus();

	/*
	 * We can grab the ec_lock as it's a spinlock with a high SPL. Hence
	 * any holder would have dropped it to get through suspend_cpus().
	 */
	mutex_enter(&ec_lock);

	/*
	 * From here on in, we can't take locks.
	 */
	SUSPEND_DEBUG("ec_suspend\n");
	ec_suspend();
	SUSPEND_DEBUG("gnttab_suspend\n");
	gnttab_suspend();

	flags = intr_clear();

	xpv_time_suspend();

	/*
	 * Currently, the hypervisor incorrectly fails to bring back
	 * powered-down VCPUs.  Thus we need to record any powered-down VCPUs
	 * to prevent any attempts to operate on them.  But we have to do this
	 * *after* the very first time we do ec_suspend().
	 */
	for (i = 1; i < ncpus; i++) {
		if (cpu[i] == NULL)
			continue;

		if (cpu_get_state(cpu[i]) == P_POWEROFF)
			CPUSET_ATOMIC_ADD(cpu_suspend_lost_set, i);
	}

	/*
	 * The dom0 save/migrate code doesn't automatically translate
	 * these into PFNs, but expects them to be, so we do it here.
	 * We don't use mfn_to_pfn() because so many OS services have
	 * been disabled at this point.
	 */
	xen_info->store_mfn = mfn_to_pfn_mapping[xen_info->store_mfn];
	xen_info->console.domU.mfn =
	    mfn_to_pfn_mapping[xen_info->console.domU.mfn];

	/* Sanity: event upcalls must be masked before we go down. */
	if (CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0) {
		prom_printf("xen_suspend_domain(): "
		    "CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask not set\n");
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	/* Unmap the shared info page; it is remapped after resume. */
	if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
	    0, UVMF_INVLPG)) {
		prom_printf("xen_suspend_domain(): "
		    "HYPERVISOR_update_va_mapping() failed\n");
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	SUSPEND_DEBUG("HYPERVISOR_suspend\n");

	/*
	 * At this point we suspend and sometime later resume.
	 */
	if (HYPERVISOR_suspend(start_info_mfn)) {
		prom_printf("xen_suspend_domain(): "
		    "HYPERVISOR_suspend() failed\n");
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	/*
	 * Point HYPERVISOR_shared_info to its new value.
	 */
	if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
	    xen_info->shared_info | PT_NOCONSIST | PT_VALID | PT_WRITABLE,
	    UVMF_INVLPG))
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);

	if (xen_info->nr_pages != mfn_count) {
		prom_printf("xen_suspend_domain(): number of pages"
		    " changed, was 0x%lx, now 0x%lx\n", mfn_count,
		    xen_info->nr_pages);
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	xpv_time_resume();

	/* Invalidate the cached maximum MFN; it may differ after migrate. */
	cached_max_mfn = 0;

	SUSPEND_DEBUG("gnttab_resume\n");
	gnttab_resume();

	/* XXPV: add a note that this must be lockless. */
	SUSPEND_DEBUG("ec_resume\n");
	ec_resume();

	intr_restore(flags);

	if (ncpus > 1)
		resume_cpus();

	mutex_exit(&ec_lock);
	xen_end_migrate();
	mutex_exit(&cpu_lock);

	/*
	 * Now we can take locks again.
	 */

	/*
	 * Force the tick value used for tv_nsec in hres_tick() to be up to
	 * date. rtcsync() will reset the hrestime value appropriately.
	 */
	hres_last_tick = xpv_gethrtime();

	/*
	 * XXPV: we need to have resumed the CPUs since this takes locks, but
	 * can remote CPUs see bad state? Presumably yes. Should probably nest
	 * taking of todlock inside of cpu_lock, or vice versa, then provide an
	 * unlocked version.  Probably need to call clkinitf to reset cpu freq
	 * and re-calibrate if we migrated to a different speed cpu.  Also need
	 * to make a (re)init_cpu_info call to update processor info structs
	 * and device tree info.  That remains to be written at the moment.
	 */
	rtcsync();

	rebuild_mfn_list();

	SUSPEND_DEBUG("xenbus_resume\n");
	xenbus_resume();
	SUSPEND_DEBUG("xenbus_resume_devices\n");
	xen_resume_devices();

	thread_affinity_clear(curthread);
	kpreempt_enable();

	SUSPEND_DEBUG("finished xen_suspend_domain\n");

	/*
	 * We have restarted our suspended domain, update the hypervisor
	 * details. NB: This must be done at the end of this function,
	 * since we need the domain to be completely resumed before
	 * these functions will work correctly.
	 */
	xen_set_version(XENVER_CURRENT_IDX);

	/*
	 * We can check and report a warning, but we don't stop the
	 * process.
	 */
	if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0)
		cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s "
		    "but need at least version v3.0.4",
		    XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor),
		    XENVER_CURRENT(xv_ver));

	cmn_err(CE_NOTE, "domain restore/migrate completed");
}
6395084Sjohnlev 
6405084Sjohnlev /*ARGSUSED*/
6415084Sjohnlev int
xen_debug_handler(void * arg)6425084Sjohnlev xen_debug_handler(void *arg)
6435084Sjohnlev {
6445084Sjohnlev 	debug_enter("External debug event received");
6455084Sjohnlev 
6465084Sjohnlev 	/*
6475084Sjohnlev 	 * If we've not got KMDB loaded, output some stuff difficult to capture
6485084Sjohnlev 	 * from a domain core.
6495084Sjohnlev 	 */
6505084Sjohnlev 	if (!(boothowto & RB_DEBUG)) {
6515084Sjohnlev 		shared_info_t *si = HYPERVISOR_shared_info;
6525084Sjohnlev 		int i;
6535084Sjohnlev 
6545084Sjohnlev 		prom_printf("evtchn_pending [ ");
6555084Sjohnlev 		for (i = 0; i < 8; i++)
6565084Sjohnlev 			prom_printf("%lx ", si->evtchn_pending[i]);
6575084Sjohnlev 		prom_printf("]\nevtchn_mask [ ");
6585084Sjohnlev 		for (i = 0; i < 8; i++)
6595084Sjohnlev 			prom_printf("%lx ", si->evtchn_mask[i]);
6605084Sjohnlev 		prom_printf("]\n");
6615084Sjohnlev 
6625084Sjohnlev 		for (i = 0; i < ncpus; i++) {
6635084Sjohnlev 			vcpu_info_t *vcpu = &si->vcpu_info[i];
6645084Sjohnlev 			if (cpu[i] == NULL)
6655084Sjohnlev 				continue;
6665084Sjohnlev 			prom_printf("CPU%d pending %d mask %d sel %lx\n",
6675084Sjohnlev 			    i, vcpu->evtchn_upcall_pending,
6685084Sjohnlev 			    vcpu->evtchn_upcall_mask,
6695084Sjohnlev 			    vcpu->evtchn_pending_sel);
6705084Sjohnlev 		}
6715084Sjohnlev 	}
6725084Sjohnlev 
6735084Sjohnlev 	return (0);
6745084Sjohnlev }
6755084Sjohnlev 
/*
 * xenbus watch callback: a sysrq key was written to the
 * "control/sysrq" xenstore node.  Read and clear the node within a
 * transaction (retrying on EAGAIN), then act on the key.
 */
/*ARGSUSED*/
static void
xen_sysrq_handler(struct xenbus_watch *watch, const char **vec,
    unsigned int len)
{
	xenbus_transaction_t xbt;
	char key = '\0';
	int ret;

retry:
	if (xenbus_transaction_start(&xbt)) {
		cmn_err(CE_WARN, "failed to start sysrq transaction");
		return;
	}

	if ((ret = xenbus_scanf(xbt, "control", "sysrq", "%c", &key)) != 0) {
		/*
		 * ENOENT happens in response to our own xenbus_rm.
		 * XXPV - this happens spuriously on boot?
		 */
		if (ret != ENOENT)
			cmn_err(CE_WARN, "failed to read sysrq: %d", ret);
		goto out;
	}

	/* Clear the node so the watch doesn't re-fire on the same key. */
	if ((ret = xenbus_rm(xbt, "control", "sysrq")) != 0) {
		cmn_err(CE_WARN, "failed to reset sysrq: %d", ret);
		goto out;
	}

	if (xenbus_transaction_end(xbt, 0) == EAGAIN)
		goto retry;

	/*
	 * Somewhat arbitrary - on Linux this means 'reboot'. We could just
	 * accept any key, but this might increase the risk of sending a
	 * harmless sysrq to the wrong domain...
	 */
	if (key == 'b')
		(void) xen_debug_handler(NULL);
	else
		cmn_err(CE_WARN, "Ignored sysrq %c", key);
	return;

out:
	/* Abort the transaction on any failure above. */
	(void) xenbus_transaction_end(xbt, 1);
}
7235084Sjohnlev 
/* taskq on which externally requested shutdown actions are run */
taskq_t *xen_shutdown_tq;

/*
 * Shutdown commands.  The SHUTDOWN_* values index cmd_strings[] below;
 * presumably they are matched against values written to the xenstore
 * control node by the toolstack (the handler is below this chunk).
 */
#define	SHUTDOWN_INVALID	-1
#define	SHUTDOWN_POWEROFF	0
#define	SHUTDOWN_REBOOT		1
#define	SHUTDOWN_SUSPEND	2
#define	SHUTDOWN_HALT		3
#define	SHUTDOWN_MAX		4

/* How long to give a graceful shutdown before forcing the issue. */
#define	SHUTDOWN_TIMEOUT_SECS (60 * 5)

/* Keep in sync with the SHUTDOWN_* indices above. */
static const char *cmd_strings[SHUTDOWN_MAX] = {
	"poweroff",
	"reboot",
	"suspend",
	"halt"
};
7415084Sjohnlev 
7425084Sjohnlev static void
xen_dirty_shutdown(void * arg)7435084Sjohnlev xen_dirty_shutdown(void *arg)
7445084Sjohnlev {
7455084Sjohnlev 	int cmd = (uintptr_t)arg;
7465084Sjohnlev 
7475084Sjohnlev 	cmn_err(CE_WARN, "Externally requested shutdown failed or "
7485084Sjohnlev 	    "timed out.\nShutting down.\n");
7495084Sjohnlev 
7505084Sjohnlev 	switch (cmd) {
7515084Sjohnlev 	case SHUTDOWN_HALT:
7525084Sjohnlev 	case SHUTDOWN_POWEROFF:
7535084Sjohnlev 		(void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred);
7545084Sjohnlev 		break;
7555084Sjohnlev 	case SHUTDOWN_REBOOT:
7565084Sjohnlev 		(void) kadmin(A_REBOOT, AD_BOOT, NULL, kcred);
7575084Sjohnlev 		break;
7585084Sjohnlev 	}
7595084Sjohnlev }
7605084Sjohnlev 
/*
 * Carry out an externally requested shutdown.  Runs on xen_shutdown_tq,
 * dispatched from xen_shutdown_handler().  arg carries one of the
 * SHUTDOWN_* codes cast to a pointer.
 */
static void
xen_shutdown(void *arg)
{
	int cmd = (uintptr_t)arg;
	proc_t *initpp;

	ASSERT(cmd > SHUTDOWN_INVALID && cmd < SHUTDOWN_MAX);

	/* Suspend is handled synchronously here; no process involvement. */
	if (cmd == SHUTDOWN_SUSPEND) {
		xen_suspend_domain();
		return;
	}

	/*
	 * Record the requested method so the eventual shutdown path uses
	 * the action that was actually asked for.
	 */
	switch (cmd) {
	case SHUTDOWN_POWEROFF:
		force_shutdown_method = AD_POWEROFF;
		break;
	case SHUTDOWN_HALT:
		force_shutdown_method = AD_HALT;
		break;
	case SHUTDOWN_REBOOT:
		force_shutdown_method = AD_BOOT;
		break;
	}

	/*
	 * If we're still booting and init(1) isn't set up yet, simply halt.
	 * NOTE(review): the code below assumes halt() does not return; if
	 * it did, psignal() would be reached with initpp == NULL.
	 */
	mutex_enter(&pidlock);
	initpp = prfind(P_INITPID);
	mutex_exit(&pidlock);
	if (initpp == NULL) {
		extern void halt(char *);
		halt("Power off the System");   /* just in case */
	}

	/*
	 * else, graceful shutdown with inittab and all getting involved
	 */
	psignal(initpp, SIGPWR);

	/* Forced fallback in case init never completes the shutdown. */
	(void) timeout(xen_dirty_shutdown, arg,
	    SHUTDOWN_TIMEOUT_SECS * drv_usectohz(MICROSEC));
}
8055084Sjohnlev 
/*
 * Xenbus watch callback for "control/shutdown".  Reads the requested
 * action, clears the node (which re-fires this watch; the empty-string
 * check below terminates that recursion), and dispatches the work to
 * xen_shutdown_tq.
 */
/*ARGSUSED*/
static void
xen_shutdown_handler(struct xenbus_watch *watch, const char **vec,
	unsigned int len)
{
	char *str;
	xenbus_transaction_t xbt;
	int err, shutdown_code = SHUTDOWN_INVALID;
	unsigned int slen;

again:
	err = xenbus_transaction_start(&xbt);
	if (err)
		return;
	/* str is allocated by xenbus_read(); slen is its size for kmem_free. */
	if (xenbus_read(xbt, "control", "shutdown", (void *)&str, &slen)) {
		(void) xenbus_transaction_end(xbt, 1);
		return;
	}

	SUSPEND_DEBUG("%d: xen_shutdown_handler: \"%s\"\n", CPU->cpu_id, str);

	/*
	 * If this is a watch fired from our write below, check out early to
	 * avoid an infinite loop.
	 */
	if (strcmp(str, "") == 0) {
		(void) xenbus_transaction_end(xbt, 0);
		kmem_free(str, slen);
		return;
	} else if (strcmp(str, "poweroff") == 0) {
		shutdown_code = SHUTDOWN_POWEROFF;
	} else if (strcmp(str, "reboot") == 0) {
		shutdown_code = SHUTDOWN_REBOOT;
	} else if (strcmp(str, "suspend") == 0) {
		shutdown_code = SHUTDOWN_SUSPEND;
	} else if (strcmp(str, "halt") == 0) {
		shutdown_code = SHUTDOWN_HALT;
	} else {
		printf("Ignoring shutdown request: %s\n", str);
	}

	/*
	 * XXPV	Should we check the value of xenbus_write() too, or are all
	 *	errors automatically folded into xenbus_transaction_end() ??
	 */
	(void) xenbus_write(xbt, "control", "shutdown", "");
	err = xenbus_transaction_end(xbt, 0);
	if (err == EAGAIN) {
		/* Transaction raced with another xenstore update; retry. */
		SUSPEND_DEBUG("%d: trying again\n", CPU->cpu_id);
		kmem_free(str, slen);
		goto again;
	}

	kmem_free(str, slen);
	if (shutdown_code != SHUTDOWN_INVALID) {
		(void) taskq_dispatch(xen_shutdown_tq, xen_shutdown,
		    (void *)(intptr_t)shutdown_code, 0);
	}
}
8655084Sjohnlev 
static struct xenbus_watch shutdown_watch;
static struct xenbus_watch sysrq_watch;

/*
 * Late (post-VM) xen startup.  For domU, create the shutdown taskq and
 * register xenstore watches for externally requested shutdown and sysrq
 * events; the taskq is created before the shutdown watch is registered
 * because the watch callback dispatches onto it.  In all domains,
 * initialize the balloon driver with our initial page count.
 */
void
xen_late_startup(void)
{
	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
		xen_shutdown_tq = taskq_create("shutdown_taskq", 1,
		    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
		shutdown_watch.node = "control/shutdown";
		shutdown_watch.callback = xen_shutdown_handler;
		if (register_xenbus_watch(&shutdown_watch))
			cmn_err(CE_WARN, "Failed to set shutdown watcher");

		sysrq_watch.node = "control/sysrq";
		sysrq_watch.callback = xen_sysrq_handler;
		if (register_xenbus_watch(&sysrq_watch))
			cmn_err(CE_WARN, "Failed to set sysrq watcher");
	}
	balloon_init(xen_info->nr_pages);
}
8875084Sjohnlev 
#ifdef DEBUG
#define	XEN_PRINTF_BUFSIZE	1024

/* Scratch buffer shared by all callers; see reentrancy note below. */
char xen_printf_buffer[XEN_PRINTF_BUFSIZE];

/*
 * Printf function that calls hypervisor directly.  For DomU it only
 * works when running on a xen hypervisor built with debug on.  Works
 * always since no I/O ring interaction is needed.
 *
 * NOTE(review): xen_printf_buffer is a single shared buffer with no
 * locking, so concurrent callers may interleave or clobber each other's
 * output — presumably acceptable for a DEBUG-only diagnostic; confirm.
 */
/*PRINTFLIKE1*/
void
xen_printf(const char *fmt, ...)
{
	va_list	ap;

	va_start(ap, fmt);
	(void) vsnprintf(xen_printf_buffer, XEN_PRINTF_BUFSIZE, fmt, ap);
	va_end(ap);

	(void) HYPERVISOR_console_io(CONSOLEIO_write,
	    strlen(xen_printf_buffer), xen_printf_buffer);
}
#else
/* Non-DEBUG kernels: xen_printf() compiles to a no-op. */
void
xen_printf(const char *fmt, ...)
{
}
#endif	/* DEBUG */
9175084Sjohnlev 
/*
 * Record the running hypervisor's version and warn (but continue) if it
 * is older than the minimum supported (v3.0.4), then call
 * xen_pte_workaround() for any version-specific PTE handling.
 */
void
startup_xen_version(void)
{
	xen_set_version(XENVER_BOOT_IDX);
	if (xen_hypervisor_supports_solaris(XEN_RUN_CHECK) == 0)
		cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s "
		    "but need at least version v3.0.4",
		    XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor),
		    XENVER_CURRENT(xv_ver));
	xen_pte_workaround();
}
9295084Sjohnlev 
/* Set non-zero (e.g. via /etc/system) to exercise the failure path below. */
int xen_mca_simulate_mc_physinfo_failure = 0;

/*
 * Gather the physical-cpu information (xen_phys_cpus / xen_phys_ncpus)
 * used by machine check support.  Dom0 only.  On any failure the cached
 * state is left empty and machine check support is simply unavailable.
 */
void
startup_xen_mca(void)
{
	if (!DOMAIN_IS_INITDOMAIN(xen_info))
		return;

	xen_phys_ncpus = 0;
	xen_phys_cpus = NULL;

	/*
	 * First call with a NULL buffer — presumably a count-only query
	 * that fills in xen_phys_ncpus; confirm against the MCA hypercall
	 * interface.
	 */
	if (xen_mca_simulate_mc_physinfo_failure ||
	    xen_get_mc_physcpuinfo(NULL, &xen_phys_ncpus) != 0) {
		cmn_err(CE_WARN,
		    "%sxen_get_mc_physinfo failure during xen MCA startup: "
		    "there will be no machine check support",
		    xen_mca_simulate_mc_physinfo_failure ? "(simulated) " : "");
		return;
	}

	xen_phys_cpus = kmem_alloc(xen_phys_ncpus *
	    sizeof (xen_mc_logical_cpu_t), KM_NOSLEEP);

	if (xen_phys_cpus == NULL) {
		cmn_err(CE_WARN,
		    "xen_get_mc_physinfo failure: can't allocate CPU array");
		return;
	}

	/* Second call fills the array just allocated. */
	if (xen_get_mc_physcpuinfo(xen_phys_cpus, &xen_phys_ncpus) != 0) {
		cmn_err(CE_WARN, "xen_get_mc_physinfo failure: no "
		    "physical CPU info");
		kmem_free(xen_phys_cpus,
		    xen_phys_ncpus * sizeof (xen_mc_logical_cpu_t));
		xen_phys_ncpus = 0;
		xen_phys_cpus = NULL;
	}

	if (xen_physinfo_debug) {
		xen_mc_logical_cpu_t *xcp;
		unsigned i;

		cmn_err(CE_NOTE, "xvm mca: %u physical cpus:\n",
		    xen_phys_ncpus);
		for (i = 0; i < xen_phys_ncpus; i++) {
			xcp = &xen_phys_cpus[i];
			cmn_err(CE_NOTE, "cpu%u: (%u, %u, %u) apid %u",
			    xcp->mc_cpunr, xcp->mc_chipid, xcp->mc_coreid,
			    xcp->mc_threadid, xcp->mc_apicid);
		}
	}
}
9827532SSean.Ye@Sun.COM 
9835084Sjohnlev /*
9845084Sjohnlev  * Miscellaneous hypercall wrappers with slightly more verbose diagnostics.
9855084Sjohnlev  */
9865084Sjohnlev 
9875084Sjohnlev void
xen_set_gdt(ulong_t * frame_list,int entries)9885084Sjohnlev xen_set_gdt(ulong_t *frame_list, int entries)
9895084Sjohnlev {
9905084Sjohnlev 	int err;
9915084Sjohnlev 	if ((err = HYPERVISOR_set_gdt(frame_list, entries)) != 0) {
9925084Sjohnlev 		/*
9935084Sjohnlev 		 * X_EINVAL:	reserved entry or bad frames
9945084Sjohnlev 		 * X_EFAULT:	bad address
9955084Sjohnlev 		 */
9965084Sjohnlev 		panic("xen_set_gdt(%p, %d): error %d",
9975084Sjohnlev 		    (void *)frame_list, entries, -(int)err);
9985084Sjohnlev 	}
9995084Sjohnlev }
10005084Sjohnlev 
10015084Sjohnlev void
xen_set_ldt(user_desc_t * ldt,uint_t nsels)10025084Sjohnlev xen_set_ldt(user_desc_t *ldt, uint_t nsels)
10035084Sjohnlev {
10045084Sjohnlev 	struct mmuext_op	op;
10055084Sjohnlev 	long			err;
10065084Sjohnlev 
10075084Sjohnlev 	op.cmd = MMUEXT_SET_LDT;
10085084Sjohnlev 	op.arg1.linear_addr = (uintptr_t)ldt;
10095084Sjohnlev 	op.arg2.nr_ents = nsels;
10105084Sjohnlev 
10115084Sjohnlev 	if ((err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) != 0) {
10125084Sjohnlev 		panic("xen_set_ldt(%p, %d): error %d",
10135084Sjohnlev 		    (void *)ldt, nsels, -(int)err);
10145084Sjohnlev 	}
10155084Sjohnlev }
10165084Sjohnlev 
10175084Sjohnlev void
xen_stack_switch(ulong_t ss,ulong_t esp)10185084Sjohnlev xen_stack_switch(ulong_t ss, ulong_t esp)
10195084Sjohnlev {
10205084Sjohnlev 	long err;
10215084Sjohnlev 
10225084Sjohnlev 	if ((err = HYPERVISOR_stack_switch(ss, esp)) != 0) {
10235084Sjohnlev 		/*
10245084Sjohnlev 		 * X_EPERM:	bad selector
10255084Sjohnlev 		 */
10265084Sjohnlev 		panic("xen_stack_switch(%lx, %lx): error %d", ss, esp,
10275084Sjohnlev 		    -(int)err);
10285084Sjohnlev 	}
10295084Sjohnlev }
10305084Sjohnlev 
10315084Sjohnlev long
xen_set_trap_table(trap_info_t * table)10325084Sjohnlev xen_set_trap_table(trap_info_t *table)
10335084Sjohnlev {
10345084Sjohnlev 	long err;
10355084Sjohnlev 
10365084Sjohnlev 	if ((err = HYPERVISOR_set_trap_table(table)) != 0) {
10375084Sjohnlev 		/*
10385084Sjohnlev 		 * X_EFAULT:	bad address
10395084Sjohnlev 		 * X_EPERM:	bad selector
10405084Sjohnlev 		 */
10415084Sjohnlev 		panic("xen_set_trap_table(%p): error %d", (void *)table,
10425084Sjohnlev 		    -(int)err);
10435084Sjohnlev 	}
10445084Sjohnlev 	return (err);
10455084Sjohnlev }
10465084Sjohnlev 
10475084Sjohnlev #if defined(__amd64)
10485084Sjohnlev void
xen_set_segment_base(int reg,ulong_t value)10495084Sjohnlev xen_set_segment_base(int reg, ulong_t value)
10505084Sjohnlev {
10515084Sjohnlev 	long err;
10525084Sjohnlev 
10535084Sjohnlev 	if ((err = HYPERVISOR_set_segment_base(reg, value)) != 0) {
10545084Sjohnlev 		/*
10555084Sjohnlev 		 * X_EFAULT:	bad address
10565084Sjohnlev 		 * X_EINVAL:	bad type
10575084Sjohnlev 		 */
10585084Sjohnlev 		panic("xen_set_segment_base(%d, %lx): error %d",
10595084Sjohnlev 		    reg, value, -(int)err);
10605084Sjohnlev 	}
10615084Sjohnlev }
10625084Sjohnlev #endif	/* __amd64 */
10635084Sjohnlev 
10645084Sjohnlev /*
10655084Sjohnlev  * Translate a hypervisor errcode to a Solaris error code.
10665084Sjohnlev  */
10675084Sjohnlev int
xen_xlate_errcode(int error)10685084Sjohnlev xen_xlate_errcode(int error)
10695084Sjohnlev {
10705084Sjohnlev 	switch (-error) {
10715084Sjohnlev 
10725084Sjohnlev 	/*
10735084Sjohnlev 	 * Translate hypervisor errno's into native errno's
10745084Sjohnlev 	 */
10755084Sjohnlev 
10765084Sjohnlev #define	CASE(num)	case X_##num: error = num; break
10775084Sjohnlev 
10785084Sjohnlev 	CASE(EPERM);	CASE(ENOENT);	CASE(ESRCH);
10795084Sjohnlev 	CASE(EINTR);	CASE(EIO);	CASE(ENXIO);
10805084Sjohnlev 	CASE(E2BIG);	CASE(ENOMEM);	CASE(EACCES);
10815084Sjohnlev 	CASE(EFAULT);	CASE(EBUSY);	CASE(EEXIST);
10825084Sjohnlev 	CASE(ENODEV);	CASE(EISDIR);	CASE(EINVAL);
10835084Sjohnlev 	CASE(ENOSPC);	CASE(ESPIPE);	CASE(EROFS);
10845084Sjohnlev 	CASE(ENOSYS);	CASE(ENOTEMPTY); CASE(EISCONN);
108510175SStuart.Maybee@Sun.COM 	CASE(ENODATA);	CASE(EAGAIN);
10865084Sjohnlev 
10875084Sjohnlev #undef CASE
10885084Sjohnlev 
10895084Sjohnlev 	default:
10905084Sjohnlev 		panic("xen_xlate_errcode: unknown error %d", error);
10915084Sjohnlev 	}
10925084Sjohnlev 
10935084Sjohnlev 	return (error);
10945084Sjohnlev }
10955084Sjohnlev 
10965084Sjohnlev /*
10975084Sjohnlev  * Raise PS_IOPL on current vcpu to user level.
10985084Sjohnlev  * Caller responsible for preventing kernel preemption.
10995084Sjohnlev  */
11005084Sjohnlev void
xen_enable_user_iopl(void)11015084Sjohnlev xen_enable_user_iopl(void)
11025084Sjohnlev {
11035084Sjohnlev 	physdev_set_iopl_t set_iopl;
11045084Sjohnlev 	set_iopl.iopl = 3;		/* user ring 3 */
11055084Sjohnlev 	(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
11065084Sjohnlev }
11075084Sjohnlev 
11085084Sjohnlev /*
11095084Sjohnlev  * Drop PS_IOPL on current vcpu to kernel level
11105084Sjohnlev  */
11115084Sjohnlev void
xen_disable_user_iopl(void)11125084Sjohnlev xen_disable_user_iopl(void)
11135084Sjohnlev {
11145084Sjohnlev 	physdev_set_iopl_t set_iopl;
11155084Sjohnlev 	set_iopl.iopl = 1;		/* kernel pseudo ring 1 */
11165084Sjohnlev 	(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
11175084Sjohnlev }
11185084Sjohnlev 
/*
 * Change protections on the page backing cp's GDT.  Updates the kas
 * mapping; on __amd64 also pushes matching PTE bits for the GDT's
 * physical page via xen_kpm_page() (presumably to keep the kpm alias
 * consistent — confirm).  Returns 0 on success or an errno, warning
 * on failure.
 */
int
xen_gdt_setprot(cpu_t *cp, uint_t prot)
{
	int err;
#if defined(__amd64)
	int pt_bits = PT_VALID;
	if (prot & PROT_WRITE)
		pt_bits |= PT_WRITABLE;
#endif

	if ((err = as_setprot(&kas, (caddr_t)cp->cpu_gdt,
	    MMU_PAGESIZE, prot)) != 0)
		goto done;

#if defined(__amd64)
	err = xen_kpm_page(mmu_btop(cp->cpu_m.mcpu_gdtpa), pt_bits);
#endif

done:
	if (err) {
		cmn_err(CE_WARN, "cpu%d: xen_gdt_setprot(%s) failed: error %d",
		    cp->cpu_id, (prot & PROT_WRITE) ? "writable" : "read-only",
		    err);
	}

	return (err);
}
11465084Sjohnlev 
/*
 * Change protections on the (page-aligned) range of pages backing an
 * LDT.  Updates the kas mapping; on __amd64 also walks each page and
 * updates PTE bits via xen_kpm_page().  Returns 0 on success or an
 * errno, warning on failure.
 */
int
xen_ldt_setprot(user_desc_t *ldt, size_t lsize, uint_t prot)
{
	int err;
	caddr_t	lva = (caddr_t)ldt;
#if defined(__amd64)
	int pt_bits = PT_VALID;
	pgcnt_t npgs;
	if (prot & PROT_WRITE)
		pt_bits |= PT_WRITABLE;
#endif	/* __amd64 */

	if ((err = as_setprot(&kas, (caddr_t)ldt, lsize, prot)) != 0)
		goto done;

#if defined(__amd64)

	ASSERT(IS_P2ALIGNED(lsize, PAGESIZE));
	npgs = mmu_btop(lsize);
	while (npgs--) {
		if ((err = xen_kpm_page(hat_getpfnum(kas.a_hat, lva),
		    pt_bits)) != 0)
			break;
		lva += PAGESIZE;
	}
#endif	/* __amd64 */

done:
	/*
	 * NOTE(review): if the per-page loop failed, lva has been advanced
	 * and the warning below reports the failing page, not the LDT base.
	 */
	if (err) {
		cmn_err(CE_WARN, "xen_ldt_setprot(%p, %s) failed: error %d",
		    (void *)lva,
		    (prot & PROT_WRITE) ? "writable" : "read-only", err);
	}

	return (err);
}
11837532SSean.Ye@Sun.COM 
11847532SSean.Ye@Sun.COM int
xen_get_mc_physcpuinfo(xen_mc_logical_cpu_t * log_cpus,uint_t * ncpus)11857532SSean.Ye@Sun.COM xen_get_mc_physcpuinfo(xen_mc_logical_cpu_t *log_cpus, uint_t *ncpus)
11867532SSean.Ye@Sun.COM {
1187*11120SMark.Johnson@Sun.COM 	xen_mc_t xmc;
1188*11120SMark.Johnson@Sun.COM 	struct xen_mc_physcpuinfo *cpi = &xmc.u.mc_physcpuinfo;
11897532SSean.Ye@Sun.COM 
1190*11120SMark.Johnson@Sun.COM 	cpi->ncpus = *ncpus;
11917532SSean.Ye@Sun.COM 	/*LINTED: constant in conditional context*/
1192*11120SMark.Johnson@Sun.COM 	set_xen_guest_handle(cpi->info, log_cpus);
11937532SSean.Ye@Sun.COM 
1194*11120SMark.Johnson@Sun.COM 	if (HYPERVISOR_mca(XEN_MC_physcpuinfo, &xmc) != 0)
11957532SSean.Ye@Sun.COM 		return (-1);
11967532SSean.Ye@Sun.COM 
1197*11120SMark.Johnson@Sun.COM 	*ncpus = cpi->ncpus;
11987532SSean.Ye@Sun.COM 	return (0);
11997532SSean.Ye@Sun.COM }
12007532SSean.Ye@Sun.COM 
/*
 * Emit a panic string on the hypervisor console.  The string is
 * arbitrary text, so it must be passed as an argument to a constant
 * format rather than used as the format itself: xen_printf() is
 * /*PRINTFLIKE1*/ /* and a '%' in a panic message would otherwise make
 * vsnprintf() consume nonexistent varargs. */
void
print_panic(const char *str)
{
	xen_printf("%s", str);
}
12067532SSean.Ye@Sun.COM 
12077532SSean.Ye@Sun.COM /*
12087532SSean.Ye@Sun.COM  * Interfaces to iterate over real cpu information, but only that info
12097532SSean.Ye@Sun.COM  * which we choose to expose here.  These are of interest to dom0
12107532SSean.Ye@Sun.COM  * only (and the backing hypercall should not work for domu).
12117532SSean.Ye@Sun.COM  */
12127532SSean.Ye@Sun.COM 
12137532SSean.Ye@Sun.COM xen_mc_lcpu_cookie_t
xen_physcpu_next(xen_mc_lcpu_cookie_t cookie)12147532SSean.Ye@Sun.COM xen_physcpu_next(xen_mc_lcpu_cookie_t cookie)
12157532SSean.Ye@Sun.COM {
12167532SSean.Ye@Sun.COM 	xen_mc_logical_cpu_t *xcp = (xen_mc_logical_cpu_t *)cookie;
12177532SSean.Ye@Sun.COM 
12187532SSean.Ye@Sun.COM 	if (!DOMAIN_IS_INITDOMAIN(xen_info))
12197532SSean.Ye@Sun.COM 		return (NULL);
12207532SSean.Ye@Sun.COM 
12217532SSean.Ye@Sun.COM 	if (cookie == NULL)
12227532SSean.Ye@Sun.COM 		return ((xen_mc_lcpu_cookie_t)xen_phys_cpus);
12237532SSean.Ye@Sun.COM 
12247532SSean.Ye@Sun.COM 	if (xcp == xen_phys_cpus + xen_phys_ncpus - 1)
12257532SSean.Ye@Sun.COM 		return (NULL);
12267532SSean.Ye@Sun.COM 	else
12277532SSean.Ye@Sun.COM 		return ((xen_mc_lcpu_cookie_t)++xcp);
12287532SSean.Ye@Sun.COM }
12297532SSean.Ye@Sun.COM 
/* Convert an opaque iteration cookie back to its xen_mc_logical_cpu_t. */
#define	COOKIE2XCP(c) ((xen_mc_logical_cpu_t *)(c))
12317532SSean.Ye@Sun.COM 
12327532SSean.Ye@Sun.COM const char *
xen_physcpu_vendorstr(xen_mc_lcpu_cookie_t cookie)12337532SSean.Ye@Sun.COM xen_physcpu_vendorstr(xen_mc_lcpu_cookie_t cookie)
12347532SSean.Ye@Sun.COM {
12357532SSean.Ye@Sun.COM 	xen_mc_logical_cpu_t *xcp = COOKIE2XCP(cookie);
12367532SSean.Ye@Sun.COM 
12377532SSean.Ye@Sun.COM 	return ((const char *)&xcp->mc_vendorid[0]);
12387532SSean.Ye@Sun.COM }
12397532SSean.Ye@Sun.COM 
12407532SSean.Ye@Sun.COM int
xen_physcpu_family(xen_mc_lcpu_cookie_t cookie)12417532SSean.Ye@Sun.COM xen_physcpu_family(xen_mc_lcpu_cookie_t cookie)
12427532SSean.Ye@Sun.COM {
12437532SSean.Ye@Sun.COM 	return (COOKIE2XCP(cookie)->mc_family);
12447532SSean.Ye@Sun.COM }
12457532SSean.Ye@Sun.COM 
12467532SSean.Ye@Sun.COM int
xen_physcpu_model(xen_mc_lcpu_cookie_t cookie)12477532SSean.Ye@Sun.COM xen_physcpu_model(xen_mc_lcpu_cookie_t cookie)
12487532SSean.Ye@Sun.COM {
12497532SSean.Ye@Sun.COM 	return (COOKIE2XCP(cookie)->mc_model);
12507532SSean.Ye@Sun.COM }
12517532SSean.Ye@Sun.COM 
12527532SSean.Ye@Sun.COM int
xen_physcpu_stepping(xen_mc_lcpu_cookie_t cookie)12537532SSean.Ye@Sun.COM xen_physcpu_stepping(xen_mc_lcpu_cookie_t cookie)
12547532SSean.Ye@Sun.COM {
12557532SSean.Ye@Sun.COM 	return (COOKIE2XCP(cookie)->mc_step);
12567532SSean.Ye@Sun.COM }
12577532SSean.Ye@Sun.COM 
12587532SSean.Ye@Sun.COM id_t
xen_physcpu_chipid(xen_mc_lcpu_cookie_t cookie)12597532SSean.Ye@Sun.COM xen_physcpu_chipid(xen_mc_lcpu_cookie_t cookie)
12607532SSean.Ye@Sun.COM {
12617532SSean.Ye@Sun.COM 	return (COOKIE2XCP(cookie)->mc_chipid);
12627532SSean.Ye@Sun.COM }
12637532SSean.Ye@Sun.COM 
12647532SSean.Ye@Sun.COM id_t
xen_physcpu_coreid(xen_mc_lcpu_cookie_t cookie)12657532SSean.Ye@Sun.COM xen_physcpu_coreid(xen_mc_lcpu_cookie_t cookie)
12667532SSean.Ye@Sun.COM {
12677532SSean.Ye@Sun.COM 	return (COOKIE2XCP(cookie)->mc_coreid);
12687532SSean.Ye@Sun.COM }
12697532SSean.Ye@Sun.COM 
12707532SSean.Ye@Sun.COM id_t
xen_physcpu_strandid(xen_mc_lcpu_cookie_t cookie)12717532SSean.Ye@Sun.COM xen_physcpu_strandid(xen_mc_lcpu_cookie_t cookie)
12727532SSean.Ye@Sun.COM {
12737532SSean.Ye@Sun.COM 	return (COOKIE2XCP(cookie)->mc_threadid);
12747532SSean.Ye@Sun.COM }
12757532SSean.Ye@Sun.COM 
id_t
xen_physcpu_initial_apicid(xen_mc_lcpu_cookie_t cookie)
{
	/*
	 * NOTE(review): this returns mc_clusterid, not mc_apicid —
	 * presumably the hypervisor reports the *initial* APIC ID in
	 * mc_clusterid; confirm against the xen_mc_logical_cpu_t
	 * definition before relying on it.
	 */
	return (COOKIE2XCP(cookie)->mc_clusterid);
}
128110942STom.Pothier@Sun.COM 
128210942STom.Pothier@Sun.COM id_t
xen_physcpu_logical_id(xen_mc_lcpu_cookie_t cookie)12837532SSean.Ye@Sun.COM xen_physcpu_logical_id(xen_mc_lcpu_cookie_t cookie)
12847532SSean.Ye@Sun.COM {
12857532SSean.Ye@Sun.COM 	return (COOKIE2XCP(cookie)->mc_cpunr);
12867532SSean.Ye@Sun.COM }
12877532SSean.Ye@Sun.COM 
12887532SSean.Ye@Sun.COM boolean_t
xen_physcpu_is_cmt(xen_mc_lcpu_cookie_t cookie)12897532SSean.Ye@Sun.COM xen_physcpu_is_cmt(xen_mc_lcpu_cookie_t cookie)
12907532SSean.Ye@Sun.COM {
12917532SSean.Ye@Sun.COM 	return (COOKIE2XCP(cookie)->mc_nthreads > 1);
12927532SSean.Ye@Sun.COM }
12937532SSean.Ye@Sun.COM 
uint64_t
xen_physcpu_mcg_cap(xen_mc_lcpu_cookie_t cookie)
{
	xen_mc_logical_cpu_t *xcp = COOKIE2XCP(cookie);

	/*
	 * Need to #define the indices, or search through the array.
	 * (Index 0 is assumed to be the MCG_CAP MSR entry — confirm
	 * against how the hypervisor populates mc_msrvalues.)
	 */
	return (xcp->mc_msrvalues[0].value);
}
13047756SMark.Johnson@Sun.COM 
13057756SMark.Johnson@Sun.COM int
xen_map_gref(uint_t cmd,gnttab_map_grant_ref_t * mapop,uint_t count,boolean_t uvaddr)13067756SMark.Johnson@Sun.COM xen_map_gref(uint_t cmd, gnttab_map_grant_ref_t *mapop, uint_t count,
13077756SMark.Johnson@Sun.COM     boolean_t uvaddr)
13087756SMark.Johnson@Sun.COM {
13097756SMark.Johnson@Sun.COM 	long rc;
131010175SStuart.Maybee@Sun.COM 	uint_t i;
13117756SMark.Johnson@Sun.COM 
13127756SMark.Johnson@Sun.COM 	ASSERT(cmd == GNTTABOP_map_grant_ref);
13137756SMark.Johnson@Sun.COM 
13147756SMark.Johnson@Sun.COM #if !defined(_BOOT)
131510175SStuart.Maybee@Sun.COM 	if (uvaddr == B_FALSE) {
13167756SMark.Johnson@Sun.COM 		for (i = 0; i < count; ++i) {
131710175SStuart.Maybee@Sun.COM 			mapop[i].flags |= (PT_FOREIGN <<_GNTMAP_guest_avail0);
13187756SMark.Johnson@Sun.COM 		}
13197756SMark.Johnson@Sun.COM 	}
13207756SMark.Johnson@Sun.COM #endif
13217756SMark.Johnson@Sun.COM 
132210175SStuart.Maybee@Sun.COM 	rc = HYPERVISOR_grant_table_op(cmd, mapop, count);
132310175SStuart.Maybee@Sun.COM 
13247756SMark.Johnson@Sun.COM 	return (rc);
13257756SMark.Johnson@Sun.COM }
132610175SStuart.Maybee@Sun.COM 
/*
 * Fetch XEN_SYSCTL_physinfo from the hypervisor into *pi.
 * Returns 0 on success or a translated Solaris errno on failure.
 */
static int
xpv_get_physinfo(xen_sysctl_physinfo_t *pi)
{
	xen_sysctl_t op;
	/*
	 * Aliasing trick: set_xen_guest_handle() needs an lvalue of the
	 * handle type; this one-pointer overlay lets us null the
	 * cpu_to_node handle without lint complaints.  NOTE(review):
	 * relies on the guest handle being exactly one pointer wide —
	 * confirm against the handle definition.
	 */
	struct sp { void *p; } *sp = (struct sp *)&op.u.physinfo.cpu_to_node;
	int ret;

	bzero(&op, sizeof (op));
	op.cmd = XEN_SYSCTL_physinfo;
	op.interface_version = XEN_SYSCTL_INTERFACE_VERSION;
	/*LINTED: constant in conditional context*/
	set_xen_guest_handle(*sp, NULL);

	ret = HYPERVISOR_sysctl(&op);

	if (ret != 0)
		return (xen_xlate_errcode(ret));

	bcopy(&op.u.physinfo, pi, sizeof (op.u.physinfo));
	return (0);
}
134810175SStuart.Maybee@Sun.COM 
134910175SStuart.Maybee@Sun.COM /*
135010175SStuart.Maybee@Sun.COM  * On dom0, we can determine the number of physical cpus on the machine.
135110175SStuart.Maybee@Sun.COM  * This number is important when figuring out what workarounds are
135210175SStuart.Maybee@Sun.COM  * appropriate, so compute it now.
135310175SStuart.Maybee@Sun.COM  */
135410175SStuart.Maybee@Sun.COM uint_t
xpv_nr_phys_cpus(void)135510175SStuart.Maybee@Sun.COM xpv_nr_phys_cpus(void)
135610175SStuart.Maybee@Sun.COM {
135710175SStuart.Maybee@Sun.COM 	static uint_t nphyscpus = 0;
135810175SStuart.Maybee@Sun.COM 
135910175SStuart.Maybee@Sun.COM 	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
136010175SStuart.Maybee@Sun.COM 
136110175SStuart.Maybee@Sun.COM 	if (nphyscpus == 0) {
136210175SStuart.Maybee@Sun.COM 		xen_sysctl_physinfo_t pi;
136310175SStuart.Maybee@Sun.COM 		int ret;
136410175SStuart.Maybee@Sun.COM 
136510175SStuart.Maybee@Sun.COM 		if ((ret = xpv_get_physinfo(&pi)) != 0)
136610175SStuart.Maybee@Sun.COM 			panic("xpv_get_physinfo() failed: %d\n", ret);
136710175SStuart.Maybee@Sun.COM 		nphyscpus = pi.nr_cpus;
136810175SStuart.Maybee@Sun.COM 	}
136910175SStuart.Maybee@Sun.COM 	return (nphyscpus);
137010175SStuart.Maybee@Sun.COM }
137110175SStuart.Maybee@Sun.COM 
137210175SStuart.Maybee@Sun.COM pgcnt_t
xpv_nr_phys_pages(void)137310175SStuart.Maybee@Sun.COM xpv_nr_phys_pages(void)
137410175SStuart.Maybee@Sun.COM {
137510175SStuart.Maybee@Sun.COM 	xen_sysctl_physinfo_t pi;
137610175SStuart.Maybee@Sun.COM 	int ret;
137710175SStuart.Maybee@Sun.COM 
137810175SStuart.Maybee@Sun.COM 	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
137910175SStuart.Maybee@Sun.COM 
138010175SStuart.Maybee@Sun.COM 	if ((ret = xpv_get_physinfo(&pi)) != 0)
138110175SStuart.Maybee@Sun.COM 		panic("xpv_get_physinfo() failed: %d\n", ret);
138210175SStuart.Maybee@Sun.COM 
138310175SStuart.Maybee@Sun.COM 	return ((pgcnt_t)pi.total_pages);
138410175SStuart.Maybee@Sun.COM }
138510175SStuart.Maybee@Sun.COM 
138610175SStuart.Maybee@Sun.COM uint64_t
xpv_cpu_khz(void)138710175SStuart.Maybee@Sun.COM xpv_cpu_khz(void)
138810175SStuart.Maybee@Sun.COM {
138910175SStuart.Maybee@Sun.COM 	xen_sysctl_physinfo_t pi;
139010175SStuart.Maybee@Sun.COM 	int ret;
139110175SStuart.Maybee@Sun.COM 
139210175SStuart.Maybee@Sun.COM 	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
139310175SStuart.Maybee@Sun.COM 
139410175SStuart.Maybee@Sun.COM 	if ((ret = xpv_get_physinfo(&pi)) != 0)
139510175SStuart.Maybee@Sun.COM 		panic("xpv_get_physinfo() failed: %d\n", ret);
139610175SStuart.Maybee@Sun.COM 	return ((uint64_t)pi.cpu_khz);
139710175SStuart.Maybee@Sun.COM }
1398