xref: /onnv-gate/usr/src/uts/i86xpv/os/mp_xen.c (revision 9489:7aad39a516b4)
15084Sjohnlev /*
25084Sjohnlev  * CDDL HEADER START
35084Sjohnlev  *
45084Sjohnlev  * The contents of this file are subject to the terms of the
55084Sjohnlev  * Common Development and Distribution License (the "License").
65084Sjohnlev  * You may not use this file except in compliance with the License.
75084Sjohnlev  *
85084Sjohnlev  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
95084Sjohnlev  * or http://www.opensolaris.org/os/licensing.
105084Sjohnlev  * See the License for the specific language governing permissions
115084Sjohnlev  * and limitations under the License.
125084Sjohnlev  *
135084Sjohnlev  * When distributing Covered Code, include this CDDL HEADER in each
145084Sjohnlev  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
155084Sjohnlev  * If applicable, add the following below this CDDL HEADER, with the
165084Sjohnlev  * fields enclosed by brackets "[]" replaced with your own identifying
175084Sjohnlev  * information: Portions Copyright [yyyy] [name of copyright owner]
185084Sjohnlev  *
195084Sjohnlev  * CDDL HEADER END
205084Sjohnlev  */
215084Sjohnlev 
225084Sjohnlev /*
23*9489SJoe.Bonasera@sun.com  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
245084Sjohnlev  * Use is subject to license terms.
255084Sjohnlev  */
265084Sjohnlev 
275159Sjohnlev /*
285159Sjohnlev  * Virtual CPU management.
295159Sjohnlev  *
305159Sjohnlev  * VCPUs can be controlled in one of two ways: through the domain itself
315159Sjohnlev  * (psradm, p_online(), etc.), or via changes in xenstore (vcpu_config()).
325159Sjohnlev  * Unfortunately, the two interfaces use the terminology differently; the
335159Sjohnlev  * states work out as follows:
345159Sjohnlev  *
355159Sjohnlev  * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
365159Sjohnlev  *
375159Sjohnlev  * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
385159Sjohnlev  * hypervisor on the idle thread).  It must be up since a downed VCPU cannot
395159Sjohnlev  * receive interrupts, and we require this for offline CPUs in Solaris.
405159Sjohnlev  *
415159Sjohnlev  * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
425159Sjohnlev  * xen_vcpu_down() for it).  It can't take interrupts or run anything, though
435159Sjohnlev  * if it has run previously, its software state (cpu_t, machcpu structures, IPI
445159Sjohnlev  * event channels, etc.) will still exist.
455159Sjohnlev  *
465159Sjohnlev  * The hypervisor has two notions of CPU states as represented in the store:
475159Sjohnlev  *
485159Sjohnlev  * "offline": the VCPU is down.  Corresponds to P_POWEROFF.
495159Sjohnlev  *
505159Sjohnlev  * "online": the VCPU is running.  Corresponds to a CPU state other than
515159Sjohnlev  * P_POWEROFF.
525159Sjohnlev  *
535159Sjohnlev  * Currently, only a notification via xenstore can bring a CPU into a
545159Sjohnlev  * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
555159Sjohnlev  * P_OFFLINE, etc.  We need to be careful to treat xenstore notifications
565159Sjohnlev  * idempotently, as we'll get 'duplicate' entries when we resume a domain.
575159Sjohnlev  *
585159Sjohnlev  * Note that the xenstore configuration is strictly advisory, in that a domain
595159Sjohnlev  * can choose to ignore it and still power up a VCPU in the offline state. To
605159Sjohnlev  * play nice, we don't allow it. Thus, any attempt to power a CPU on or off
615159Sjohnlev  * from within Solaris returns ENOTSUP.
625159Sjohnlev  *
635159Sjohnlev  * Powering off a VCPU and suspending the domain use similar code. The
645159Sjohnlev  * difficulty here is that we must ensure that each VCPU is in a stable
655159Sjohnlev  * state: it must have a saved PCB, and not be responding to interrupts
665159Sjohnlev  * (since we are just about to remove its ability to run on a real CPU,
675159Sjohnlev  * possibly forever).  However, an offline CPU in Solaris can take
685159Sjohnlev  * cross-call interrupts, as mentioned, so we must go through a
695159Sjohnlev  * two-stage process.  First, we use the standard Solaris pause_cpus().
705159Sjohnlev  * This ensures that all CPUs are either in mach_cpu_pause() or
715159Sjohnlev  * mach_cpu_idle(), and nothing will cross-call them.
725159Sjohnlev  *
735159Sjohnlev  * Powered-off CPUs are already safe, as we own the cpu_lock needed to
745159Sjohnlev  * bring them back up, and they are in state CPU_PHASE_POWERED_OFF.
755159Sjohnlev  *
765159Sjohnlev  * Running CPUs are spinning in mach_cpu_pause() waiting for either
775159Sjohnlev  * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
785159Sjohnlev  *
795159Sjohnlev  * Offline CPUs are either running the idle thread and periodically
805159Sjohnlev  * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
815159Sjohnlev  *
825159Sjohnlev  * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as
835159Sjohnlev  * poking them to make sure they're not blocked[1]. When every CPU has
845159Sjohnlev  * responded by reaching a safe state and setting CPU_PHASE_SAFE, we
855159Sjohnlev  * know we can suspend, or power-off a CPU, without problems.
865159Sjohnlev  *
875159Sjohnlev  * [1] note that we have to repeatedly poke offline CPUs: it's the only
885159Sjohnlev  * way to ensure that the CPU doesn't miss the state change before
895159Sjohnlev  * dropping into HYPERVISOR_block().
905159Sjohnlev  */
915159Sjohnlev 
925084Sjohnlev #include <sys/types.h>
935084Sjohnlev #include <sys/systm.h>
945084Sjohnlev #include <sys/param.h>
955084Sjohnlev #include <sys/taskq.h>
965084Sjohnlev #include <sys/cmn_err.h>
975084Sjohnlev #include <sys/archsystm.h>
985084Sjohnlev #include <sys/machsystm.h>
995084Sjohnlev #include <sys/segments.h>
1005084Sjohnlev #include <sys/cpuvar.h>
1015084Sjohnlev #include <sys/x86_archext.h>
1025084Sjohnlev #include <sys/controlregs.h>
1035159Sjohnlev #include <sys/hypervisor.h>
1045159Sjohnlev #include <sys/xpv_panic.h>
1055084Sjohnlev #include <sys/mman.h>
1065159Sjohnlev #include <sys/psw.h>
1075159Sjohnlev #include <sys/cpu.h>
1085159Sjohnlev #include <sys/sunddi.h>
1095084Sjohnlev #include <util/sscanf.h>
1105159Sjohnlev #include <vm/hat_i86.h>
1115159Sjohnlev #include <vm/hat.h>
1125159Sjohnlev #include <vm/as.h>
1135084Sjohnlev 
1145159Sjohnlev #include <xen/public/io/xs_wire.h>
1155159Sjohnlev #include <xen/sys/xenbus_impl.h>
1165084Sjohnlev #include <xen/public/vcpu.h>
1175159Sjohnlev 
118*9489SJoe.Bonasera@sun.com extern cpuset_t cpu_ready_set;
119*9489SJoe.Bonasera@sun.com 
1205159Sjohnlev #define	CPU_PHASE_NONE 0
1215159Sjohnlev #define	CPU_PHASE_WAIT_SAFE 1
1225159Sjohnlev #define	CPU_PHASE_SAFE 2
1235159Sjohnlev #define	CPU_PHASE_POWERED_OFF 3
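
/*
 * Quick reference for the phases defined above (summarized from
 * mp_enter_barrier(), mp_leave_barrier(), poweroff_vcpu() and
 * poweron_vcpu() below):
 *
 *	CPU_PHASE_NONE -> CPU_PHASE_WAIT_SAFE	(mp_enter_barrier())
 *	CPU_PHASE_WAIT_SAFE -> CPU_PHASE_SAFE	(target runs enter_safe_phase())
 *	CPU_PHASE_SAFE -> CPU_PHASE_NONE	(mp_leave_barrier())
 *	CPU_PHASE_SAFE -> CPU_PHASE_POWERED_OFF	(poweroff_vcpu())
 *	CPU_PHASE_POWERED_OFF -> CPU_PHASE_NONE	(poweron_vcpu())
 */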
1245084Sjohnlev 
1255159Sjohnlev /*
1265159Sjohnlev  * During barrier entry, we poke CPUs at most 256 times a second (i.e.
1275159Sjohnlev  * at most once every NANOSEC / 256, about 3.9ms).
1285159Sjohnlev  */
1295159Sjohnlev #define	POKE_TIMEOUT (NANOSEC / 256)
1305084Sjohnlev 
1315084Sjohnlev static taskq_t *cpu_config_tq;
1325159Sjohnlev static int cpu_phase[NCPU];
1335159Sjohnlev 
1345084Sjohnlev static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
1355084Sjohnlev static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);
1365084Sjohnlev 
1375084Sjohnlev /*
1385529Ssmaybe  * Return whether or not the vcpu is actually running on a pcpu
1395529Ssmaybe  */
1405529Ssmaybe int
1415529Ssmaybe vcpu_on_pcpu(processorid_t cpu)
1425529Ssmaybe {
1435529Ssmaybe 	struct vcpu_runstate_info runstate;
1445529Ssmaybe 	int	ret = VCPU_STATE_UNKNOWN;
1455529Ssmaybe 
1465529Ssmaybe 	ASSERT(cpu < NCPU);
1475529Ssmaybe 	/*
1485529Ssmaybe 	 * Don't bother with hypercall if we are asking about ourself
1495529Ssmaybe 	 */
1505529Ssmaybe 	if (cpu == CPU->cpu_id)
1515529Ssmaybe 		return (VCPU_ON_PCPU);
1525529Ssmaybe 	if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, cpu, &runstate) != 0)
1535529Ssmaybe 		goto out;
1545529Ssmaybe 
1555529Ssmaybe 	switch (runstate.state) {
1565529Ssmaybe 	case RUNSTATE_running:
1575529Ssmaybe 		ret = VCPU_ON_PCPU;
1585529Ssmaybe 		break;
1595529Ssmaybe 
1605529Ssmaybe 	case RUNSTATE_runnable:
1615529Ssmaybe 	case RUNSTATE_offline:
1625529Ssmaybe 	case RUNSTATE_blocked:
1635529Ssmaybe 		ret = VCPU_NOT_ON_PCPU;
1645529Ssmaybe 		break;
1655529Ssmaybe 
1665529Ssmaybe 	default:
1675529Ssmaybe 		break;
1685529Ssmaybe 	}
1695529Ssmaybe 
1705529Ssmaybe out:
1715529Ssmaybe 	return (ret);
1725529Ssmaybe }
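
/*
 * A hypothetical caller (not shown in this file) could use the result to
 * avoid spinning hard on a vcpu the hypervisor isn't currently running,
 * e.g.:
 *
 *	if (vcpu_on_pcpu(cpuid) == VCPU_NOT_ON_PCPU)
 *		... back off (e.g. SMT_PAUSE()) rather than busy-wait ...
 */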
1735529Ssmaybe 
1745529Ssmaybe /*
1755084Sjohnlev  * These routines allocate any global state that might be needed
1765084Sjohnlev  * while starting cpus.  For virtual cpus, there is no such state.
1775084Sjohnlev  */
1785084Sjohnlev int
1795084Sjohnlev mach_cpucontext_init(void)
1805084Sjohnlev {
1815084Sjohnlev 	return (0);
1825084Sjohnlev }
1835084Sjohnlev 
1845084Sjohnlev void
1855084Sjohnlev do_cpu_config_watch(int state)
1865084Sjohnlev {
1875084Sjohnlev 	static struct xenbus_watch cpu_config_watch;
1885084Sjohnlev 
1895084Sjohnlev 	if (state != XENSTORE_UP)
1905084Sjohnlev 		return;
1915084Sjohnlev 	cpu_config_watch.node = "cpu";
1925084Sjohnlev 	cpu_config_watch.callback = vcpu_config_event;
1935084Sjohnlev 	if (register_xenbus_watch(&cpu_config_watch)) {
1945084Sjohnlev 		taskq_destroy(cpu_config_tq);
1955084Sjohnlev 		cmn_err(CE_WARN, "do_cpu_config_watch: "
1965084Sjohnlev 		    "failed to set vcpu config watch");
1975084Sjohnlev 	}
1985084Sjohnlev 
1995084Sjohnlev }
2005084Sjohnlev 
2015084Sjohnlev /*
2025084Sjohnlev  * This routine is called after all the "normal" MP startup has
2035084Sjohnlev  * been done; a good place to start watching xen store for virtual
2045084Sjohnlev  * cpu hot plug events.
2055084Sjohnlev  */
2065084Sjohnlev void
2075084Sjohnlev mach_cpucontext_fini(void)
2085084Sjohnlev {
2095084Sjohnlev 
2105084Sjohnlev 	cpu_config_tq = taskq_create("vcpu config taskq", 1,
2115084Sjohnlev 	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
2125084Sjohnlev 
2135084Sjohnlev 	(void) xs_register_xenbus_callback(do_cpu_config_watch);
2145084Sjohnlev }
2155084Sjohnlev 
2165084Sjohnlev /*
2175084Sjohnlev  * Fill in the remaining CPU context and initialize it.
2185084Sjohnlev  */
2195084Sjohnlev static int
2205084Sjohnlev mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
2215084Sjohnlev {
2225084Sjohnlev 	uint_t vec, iopl;
2235084Sjohnlev 
2245084Sjohnlev 	vgc->flags = VGCF_IN_KERNEL;
2255084Sjohnlev 
2265084Sjohnlev 	/*
2275084Sjohnlev 	 * fpu_ctx we leave as zero; on first fault we'll store
2285084Sjohnlev 	 * sse_initial into it anyway.
2295084Sjohnlev 	 */
2305084Sjohnlev 
2315084Sjohnlev #if defined(__amd64)
2325084Sjohnlev 	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
2335084Sjohnlev #else
2345084Sjohnlev 	vgc->user_regs.cs = KCS_SEL;
2355084Sjohnlev #endif
2365084Sjohnlev 	vgc->user_regs.ds = KDS_SEL;
2375084Sjohnlev 	vgc->user_regs.es = KDS_SEL;
2385084Sjohnlev 	vgc->user_regs.ss = KDS_SEL;
2395084Sjohnlev 	vgc->kernel_ss = KDS_SEL;
2405084Sjohnlev 
2415084Sjohnlev 	/*
2425084Sjohnlev 	 * Allow I/O privilege level for Dom0 kernel.
2435084Sjohnlev 	 */
2445084Sjohnlev 	if (DOMAIN_IS_INITDOMAIN(xen_info))
2455084Sjohnlev 		iopl = (PS_IOPL & 0x1000); /* ring 1 */
2465084Sjohnlev 	else
2475084Sjohnlev 		iopl = 0;
2485084Sjohnlev 
2495084Sjohnlev #if defined(__amd64)
2505084Sjohnlev 	vgc->user_regs.fs = 0;
2515084Sjohnlev 	vgc->user_regs.gs = 0;
2525084Sjohnlev 	vgc->user_regs.rflags = F_OFF | iopl;
2535084Sjohnlev #elif defined(__i386)
2545084Sjohnlev 	vgc->user_regs.fs = KFS_SEL;
2555084Sjohnlev 	vgc->user_regs.gs = KGS_SEL;
2565084Sjohnlev 	vgc->user_regs.eflags = F_OFF | iopl;
2575084Sjohnlev 	vgc->event_callback_cs = vgc->user_regs.cs;
2585084Sjohnlev 	vgc->failsafe_callback_cs = vgc->user_regs.cs;
2595084Sjohnlev #endif
2605084Sjohnlev 
2615084Sjohnlev 	/*
2625084Sjohnlev 	 * Initialize the trap_info_t from the IDT
2635084Sjohnlev 	 */
2645084Sjohnlev #if !defined(__lint)
2655084Sjohnlev 	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
2665084Sjohnlev #endif
2675084Sjohnlev 	for (vec = 0; vec < NIDT; vec++) {
2685084Sjohnlev 		trap_info_t *ti = &vgc->trap_ctxt[vec];
2695084Sjohnlev 
2705084Sjohnlev 		if (xen_idt_to_trap_info(vec,
2715084Sjohnlev 		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
2725084Sjohnlev 			ti->cs = KCS_SEL;
2735084Sjohnlev 			ti->vector = vec;
2745084Sjohnlev 		}
2755084Sjohnlev 	}
2765084Sjohnlev 
2775084Sjohnlev 	/*
2785084Sjohnlev 	 * No LDT
2795084Sjohnlev 	 */
2805084Sjohnlev 
2815084Sjohnlev 	/*
2825084Sjohnlev 	 * (We assert in various places that the GDT is (a) aligned on a
2835084Sjohnlev 	 * page boundary and (b) one page long, so this really should fit..)
2845084Sjohnlev 	 */
2855084Sjohnlev #ifdef CRASH_XEN
2865084Sjohnlev 	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
2875084Sjohnlev #else
2885084Sjohnlev 	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
2895084Sjohnlev #endif
2905084Sjohnlev 	vgc->gdt_ents = NGDT;
2915084Sjohnlev 
2925084Sjohnlev 	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());
2935084Sjohnlev 
2945084Sjohnlev #if defined(__i386)
2955084Sjohnlev 	if (mmu.pae_hat)
2965084Sjohnlev 		vgc->ctrlreg[3] =
2975084Sjohnlev 		    xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
2985084Sjohnlev 	else
2995084Sjohnlev #endif
3005084Sjohnlev 		vgc->ctrlreg[3] =
3015084Sjohnlev 		    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));
3025084Sjohnlev 
3035084Sjohnlev 	vgc->ctrlreg[4] = getcr4();
3045084Sjohnlev 
3055084Sjohnlev 	vgc->event_callback_eip = (uintptr_t)xen_callback;
3065084Sjohnlev 	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
3075084Sjohnlev 	vgc->flags |= VGCF_failsafe_disables_events;
3085084Sjohnlev 
3095084Sjohnlev #if defined(__amd64)
3105084Sjohnlev 	/*
3115084Sjohnlev 	 * XXPV should this be moved to init_cpu_syscall?
3125084Sjohnlev 	 */
3135084Sjohnlev 	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
3145084Sjohnlev 	vgc->flags |= VGCF_syscall_disables_events;
3155084Sjohnlev 
3165084Sjohnlev 	ASSERT(vgc->user_regs.gs == 0);
3175084Sjohnlev 	vgc->gs_base_kernel = (uintptr_t)cp;
3185084Sjohnlev #endif
3195084Sjohnlev 
3205084Sjohnlev 	return (xen_vcpu_initialize(cp->cpu_id, vgc));
3215084Sjohnlev }
3225084Sjohnlev 
3235084Sjohnlev /*
3245084Sjohnlev  * Create a guest virtual cpu context so that the virtual cpu
3255084Sjohnlev  * springs into life in the domain just about to call mp_startup()
3265084Sjohnlev  *
3275084Sjohnlev  * Virtual CPUs must be initialized once in the lifetime of the domain;
3285084Sjohnlev  * after that subsequent attempts to start them will fail with X_EEXIST.
3295084Sjohnlev  *
3305084Sjohnlev  * Thus 'alloc' -really- creates and initializes the virtual
3315084Sjohnlev  * CPU context just once. Once the initialisation succeeds, we never
3325084Sjohnlev  * free it, nor the regular cpu_t to which it refers.
3335084Sjohnlev  */
3345084Sjohnlev void *
3355084Sjohnlev mach_cpucontext_alloc(struct cpu *cp)
3365084Sjohnlev {
3375084Sjohnlev 	kthread_t *tp = cp->cpu_thread;
3385084Sjohnlev 	vcpu_guest_context_t vgc;
3395084Sjohnlev 
3405084Sjohnlev 	int err = 1;
3415084Sjohnlev 
3425084Sjohnlev 	/*
3435084Sjohnlev 	 * First, augment the incoming cpu structure
3445084Sjohnlev 	 * - vcpu pointer reference
3455084Sjohnlev 	 * - pending event storage area
3465084Sjohnlev 	 * - physical address of GDT
3475084Sjohnlev 	 */
3485084Sjohnlev 	cp->cpu_m.mcpu_vcpu_info =
3495084Sjohnlev 	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
3505084Sjohnlev 	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
3515084Sjohnlev 	    sizeof (struct xen_evt_data), KM_SLEEP);
3525084Sjohnlev 	cp->cpu_m.mcpu_gdtpa =
3535084Sjohnlev 	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));
3545084Sjohnlev 
3555084Sjohnlev 	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
3565084Sjohnlev 		goto done;
3575084Sjohnlev 
3585084Sjohnlev 	/*
3595084Sjohnlev 	 * Now set up the vcpu context so that we can start this vcpu
3605084Sjohnlev 	 * in the kernel at tp->t_pc (mp_startup).  Note that the
3615084Sjohnlev 	 * thread will thread_exit() shortly after performing the
3625084Sjohnlev 	 * initialization; in particular, we will *never* take a
3635084Sjohnlev 	 * privilege transition on this thread.
3645084Sjohnlev 	 */
3655084Sjohnlev 
3665084Sjohnlev 	bzero(&vgc, sizeof (vgc));
3675084Sjohnlev 
3685084Sjohnlev #ifdef __amd64
3695084Sjohnlev 	vgc.user_regs.rip = tp->t_pc;
3705084Sjohnlev 	vgc.user_regs.rsp = tp->t_sp;
3715084Sjohnlev 	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
3725084Sjohnlev #else
3735084Sjohnlev 	vgc.user_regs.eip = tp->t_pc;
3745084Sjohnlev 	vgc.user_regs.esp = tp->t_sp;
3755084Sjohnlev 	vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
3765084Sjohnlev #endif
3775084Sjohnlev 	/*
3785084Sjohnlev 	 * XXPV	Fix resume, if Russ didn't already fix it.
3795084Sjohnlev 	 *
3805084Sjohnlev 	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
3815084Sjohnlev 	 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
3825084Sjohnlev 	 * that only lwps take traps that switch to the kernel stack;
3835084Sjohnlev 	 * part of creating an lwp adjusts the stack by subtracting
3845084Sjohnlev 	 * sizeof (struct regs) off t_stk.
3855084Sjohnlev 	 *
3865084Sjohnlev 	 * The more interesting question is, why do we do all the work
3875084Sjohnlev 	 * of a fully fledged lwp for a plain thread?  In particular
3885084Sjohnlev 	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
3895084Sjohnlev 	 * or futz with the LDT.  This should probably all be done with
3905084Sjohnlev 	 * an lwp context operator to keep pure thread context switch fast.
3915084Sjohnlev 	 */
3925084Sjohnlev 	vgc.kernel_sp = (ulong_t)tp->t_stk;
3935084Sjohnlev 
3945084Sjohnlev 	err = mp_set_cpu_context(&vgc, cp);
3955084Sjohnlev 
3965084Sjohnlev done:
3975084Sjohnlev 	if (err) {
3985084Sjohnlev 		mach_cpucontext_free(cp, NULL, err);
3995084Sjohnlev 		return (NULL);
4005084Sjohnlev 	}
4015084Sjohnlev 	return (cp);
4025084Sjohnlev }
4035084Sjohnlev 
4045084Sjohnlev /*
4055084Sjohnlev  * By the time we are called either we have successfully started
4065084Sjohnlev  * the cpu, or our attempt to start it has failed.
4075084Sjohnlev  */
4085084Sjohnlev 
4095084Sjohnlev /*ARGSUSED*/
4105084Sjohnlev void
4115084Sjohnlev mach_cpucontext_free(struct cpu *cp, void *arg, int err)
4125084Sjohnlev {
4135084Sjohnlev 	switch (err) {
4145084Sjohnlev 	case 0:
4155084Sjohnlev 		break;
4165084Sjohnlev 	case ETIMEDOUT:
4175084Sjohnlev 		/*
4185084Sjohnlev 		 * The vcpu context is loaded into the hypervisor, and
4195084Sjohnlev 		 * we've tried to start it, but the vcpu has not been set
4205084Sjohnlev 		 * running yet, for whatever reason.  We arrange to -not-
4215084Sjohnlev 		 * free any data structures it may be referencing.  In
4225084Sjohnlev 		 * particular, we've already told the hypervisor about
4235084Sjohnlev 		 * the GDT, and so we can't map it read-write again.
4245084Sjohnlev 		 */
4255084Sjohnlev 		break;
4265084Sjohnlev 	default:
4275084Sjohnlev 		(void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
4285084Sjohnlev 		kmem_free(cp->cpu_m.mcpu_evt_pend,
4295084Sjohnlev 		    sizeof (struct xen_evt_data));
4305084Sjohnlev 		break;
4315084Sjohnlev 	}
4325084Sjohnlev }
4335084Sjohnlev 
4345084Sjohnlev /*
4355084Sjohnlev  * Reset this CPU's context.  Clear out any pending evtchn data, since event
4365084Sjohnlev  * channel numbers will all change when we resume.
4375084Sjohnlev  */
4385084Sjohnlev void
4395084Sjohnlev mach_cpucontext_reset(cpu_t *cp)
4405084Sjohnlev {
4415084Sjohnlev 	bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
4425084Sjohnlev 	/* mcpu_intr_pending ? */
4435084Sjohnlev }
4445084Sjohnlev 
4455084Sjohnlev static void
4465084Sjohnlev pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
4475084Sjohnlev {
4485084Sjohnlev #ifdef __amd64
4495084Sjohnlev 	vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
4505084Sjohnlev 	vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
4515084Sjohnlev 	vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
4525084Sjohnlev 	vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
4535084Sjohnlev 	vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
4545084Sjohnlev 	vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
4555084Sjohnlev 	vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
4565084Sjohnlev 	vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
4575084Sjohnlev #else /* __amd64 */
4585084Sjohnlev 	vgc->user_regs.eip = pcb->val[REG_LABEL_PC];
4595084Sjohnlev 	vgc->user_regs.esp = pcb->val[REG_LABEL_SP];
4605084Sjohnlev 	vgc->user_regs.ebp = pcb->val[REG_LABEL_BP];
4615084Sjohnlev 	vgc->user_regs.ebx = pcb->val[REG_LABEL_EBX];
4625084Sjohnlev 	vgc->user_regs.esi = pcb->val[REG_LABEL_ESI];
4635084Sjohnlev 	vgc->user_regs.edi = pcb->val[REG_LABEL_EDI];
4645084Sjohnlev #endif /* __amd64 */
4655084Sjohnlev }
4665084Sjohnlev 
4675084Sjohnlev /*
4685159Sjohnlev  * Restore the context of a CPU during resume.  This context is always
4695159Sjohnlev  * inside enter_safe_phase(), below.
4705084Sjohnlev  */
4715084Sjohnlev void
4725084Sjohnlev mach_cpucontext_restore(cpu_t *cp)
4735084Sjohnlev {
4745084Sjohnlev 	vcpu_guest_context_t vgc;
4755084Sjohnlev 	int err;
4765084Sjohnlev 
4775084Sjohnlev 	ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
4785084Sjohnlev 	    cp->cpu_thread == cp->cpu_idle_thread);
4795084Sjohnlev 
4805084Sjohnlev 	bzero(&vgc, sizeof (vgc));
4815084Sjohnlev 
4825084Sjohnlev 	pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);
4835084Sjohnlev 
4845084Sjohnlev 	/*
4855084Sjohnlev 	 * We're emulating a longjmp() here: in particular, we need to bump the
4865084Sjohnlev 	 * stack pointer to account for the pop of xIP that returning from
4875084Sjohnlev 	 * longjmp() normally would do, and set the return value in xAX to 1.
4885084Sjohnlev 	 */
4895084Sjohnlev #ifdef __amd64
4905084Sjohnlev 	vgc.user_regs.rax = 1;
4915084Sjohnlev 	vgc.user_regs.rsp += sizeof (ulong_t);
4925084Sjohnlev #else
4935084Sjohnlev 	vgc.user_regs.eax = 1;
4945084Sjohnlev 	vgc.user_regs.esp += sizeof (ulong_t);
4955084Sjohnlev #endif
4965084Sjohnlev 
4975084Sjohnlev 	vgc.kernel_sp = cp->cpu_thread->t_sp;
4985084Sjohnlev 
4995084Sjohnlev 	err = mp_set_cpu_context(&vgc, cp);
5005084Sjohnlev 
5015084Sjohnlev 	ASSERT(err == 0);
5025084Sjohnlev }
5035084Sjohnlev 
5045159Sjohnlev /*
5055159Sjohnlev  * Reach a point at which the CPU can be safely powered-off or
5065159Sjohnlev  * suspended.  Nothing can wake this CPU out of the loop.
5075159Sjohnlev  */
5085159Sjohnlev static void
5095159Sjohnlev enter_safe_phase(void)
5105159Sjohnlev {
5115159Sjohnlev 	ulong_t flags = intr_clear();
5125159Sjohnlev 
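	/*
	 * setjmp() stashes our register state in t_pcb.  There are two ways
	 * out of the loop below: mp_leave_barrier() resets our phase to
	 * CPU_PHASE_NONE, or (after a power-off or suspend)
	 * mach_cpucontext_restore() rebuilds our context from t_pcb so that
	 * setjmp() appears to return non-zero and the loop is skipped.
	 */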
5135159Sjohnlev 	if (setjmp(&curthread->t_pcb) == 0) {
5145159Sjohnlev 		cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
5155159Sjohnlev 		while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
5165159Sjohnlev 			SMT_PAUSE();
5175159Sjohnlev 	}
5185159Sjohnlev 
5195159Sjohnlev 	ASSERT(!interrupts_enabled());
5205159Sjohnlev 
5215159Sjohnlev 	intr_restore(flags);
5225159Sjohnlev }
5235159Sjohnlev 
5245159Sjohnlev /*
5255159Sjohnlev  * Offline CPUs run this code even under a pause_cpus(), so we must
5265159Sjohnlev  * check if we need to enter the safe phase.
5275159Sjohnlev  */
5285084Sjohnlev void
5295084Sjohnlev mach_cpu_idle(void)
5305084Sjohnlev {
5315084Sjohnlev 	if (IN_XPV_PANIC()) {
5325084Sjohnlev 		xpv_panic_halt();
5335084Sjohnlev 	} else  {
5345084Sjohnlev 		(void) HYPERVISOR_block();
5355159Sjohnlev 		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
5365159Sjohnlev 			enter_safe_phase();
5375159Sjohnlev 	}
5385159Sjohnlev }
5395159Sjohnlev 
5405159Sjohnlev /*
5415159Sjohnlev  * Spin until either start_cpus() wakes us up, or we get a request to
5425159Sjohnlev  * enter the safe phase (followed by a later start_cpus()).
5435159Sjohnlev  */
5445159Sjohnlev void
5455159Sjohnlev mach_cpu_pause(volatile char *safe)
5465159Sjohnlev {
5475159Sjohnlev 	*safe = PAUSE_WAIT;
5485159Sjohnlev 	membar_enter();
5495159Sjohnlev 
5505159Sjohnlev 	while (*safe != PAUSE_IDLE) {
5515159Sjohnlev 		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
5525159Sjohnlev 			enter_safe_phase();
5535159Sjohnlev 		SMT_PAUSE();
5545084Sjohnlev 	}
5555084Sjohnlev }
5565084Sjohnlev 
5575084Sjohnlev void
5585084Sjohnlev mach_cpu_halt(char *msg)
5595084Sjohnlev {
5605084Sjohnlev 	if (msg)
5615084Sjohnlev 		prom_printf("%s\n", msg);
5625084Sjohnlev 	(void) xen_vcpu_down(CPU->cpu_id);
5635084Sjohnlev }
5645084Sjohnlev 
5655084Sjohnlev /*ARGSUSED*/
5665084Sjohnlev int
5675084Sjohnlev mp_cpu_poweron(struct cpu *cp)
5685084Sjohnlev {
5695084Sjohnlev 	return (ENOTSUP);
5705084Sjohnlev }
5715084Sjohnlev 
5725084Sjohnlev /*ARGSUSED*/
5735084Sjohnlev int
5745084Sjohnlev mp_cpu_poweroff(struct cpu *cp)
5755084Sjohnlev {
5765084Sjohnlev 	return (ENOTSUP);
5775084Sjohnlev }
5785084Sjohnlev 
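/*
 * Bring every other powered-on CPU to CPU_PHASE_SAFE, using the two-stage
 * pause/poke protocol described in the block comment at the top of this
 * file.  Returns with all other CPUs paused; the caller must eventually
 * call mp_leave_barrier().
 */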
5795159Sjohnlev void
5805159Sjohnlev mp_enter_barrier(void)
5815084Sjohnlev {
5825159Sjohnlev 	hrtime_t last_poke_time = 0;
5835159Sjohnlev 	int poke_allowed = 0;
5845159Sjohnlev 	int done = 0;
5855159Sjohnlev 	int i;
5865084Sjohnlev 
5875084Sjohnlev 	ASSERT(MUTEX_HELD(&cpu_lock));
5885084Sjohnlev 
5895159Sjohnlev 	pause_cpus(NULL);
5905159Sjohnlev 
5915159Sjohnlev 	while (!done) {
5925159Sjohnlev 		done = 1;
5935159Sjohnlev 		poke_allowed = 0;
5945159Sjohnlev 
5955159Sjohnlev 		if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
5965159Sjohnlev 			last_poke_time = xpv_gethrtime();
5975159Sjohnlev 			poke_allowed = 1;
5985159Sjohnlev 		}
5995159Sjohnlev 
6005159Sjohnlev 		for (i = 0; i < NCPU; i++) {
6015159Sjohnlev 			cpu_t *cp = cpu_get(i);
6025159Sjohnlev 
6035159Sjohnlev 			if (cp == NULL || cp == CPU)
6045159Sjohnlev 				continue;
6055159Sjohnlev 
6065159Sjohnlev 			switch (cpu_phase[i]) {
6075159Sjohnlev 			case CPU_PHASE_NONE:
6085159Sjohnlev 				cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
6095159Sjohnlev 				poke_cpu(i);
6105159Sjohnlev 				done = 0;
6115159Sjohnlev 				break;
6125159Sjohnlev 
6135159Sjohnlev 			case CPU_PHASE_WAIT_SAFE:
6145159Sjohnlev 				if (poke_allowed)
6155159Sjohnlev 					poke_cpu(i);
6165159Sjohnlev 				done = 0;
6175159Sjohnlev 				break;
6185159Sjohnlev 
6195159Sjohnlev 			case CPU_PHASE_SAFE:
6205159Sjohnlev 			case CPU_PHASE_POWERED_OFF:
6215159Sjohnlev 				break;
6225159Sjohnlev 			}
6235159Sjohnlev 		}
6245159Sjohnlev 
6255159Sjohnlev 		SMT_PAUSE();
6265159Sjohnlev 	}
6275159Sjohnlev }
6285159Sjohnlev 
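/*
 * Release the CPUs gathered by mp_enter_barrier(): check that each one is
 * either safe or powered off (anything else is a fatal protocol violation),
 * move safe CPUs back to CPU_PHASE_NONE, and restart the paused CPUs.
 */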
6295159Sjohnlev void
6305159Sjohnlev mp_leave_barrier(void)
6315159Sjohnlev {
6325159Sjohnlev 	int i;
6335159Sjohnlev 
6345159Sjohnlev 	ASSERT(MUTEX_HELD(&cpu_lock));
6355159Sjohnlev 
6365159Sjohnlev 	for (i = 0; i < NCPU; i++) {
6375159Sjohnlev 		cpu_t *cp = cpu_get(i);
6385159Sjohnlev 
6395159Sjohnlev 		if (cp == NULL || cp == CPU)
6405159Sjohnlev 			continue;
6415159Sjohnlev 
6425159Sjohnlev 		switch (cpu_phase[i]) {
6435159Sjohnlev 		/*
6445159Sjohnlev 		 * If we see a CPU in one of these phases, something has
6455159Sjohnlev 		 * gone badly wrong with the guarantees
6465159Sjohnlev 		 * mp_enter_barrier() is supposed to provide.  Rather
6475159Sjohnlev 		 * than attempt to stumble along (and since we can't
6485159Sjohnlev 		 * panic properly in this context), we tell the
6495159Sjohnlev 		 * hypervisor we've crashed.
6505159Sjohnlev 		 */
6515159Sjohnlev 		case CPU_PHASE_NONE:
6525159Sjohnlev 		case CPU_PHASE_WAIT_SAFE:
6535159Sjohnlev 			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
6545159Sjohnlev 			break;
6555159Sjohnlev 
6565159Sjohnlev 		case CPU_PHASE_POWERED_OFF:
6575159Sjohnlev 			break;
6585159Sjohnlev 
6595159Sjohnlev 		case CPU_PHASE_SAFE:
6605159Sjohnlev 			cpu_phase[i] = CPU_PHASE_NONE;
6615159Sjohnlev 		}
6625084Sjohnlev 	}
6635084Sjohnlev 
6645159Sjohnlev 	start_cpus();
6655084Sjohnlev }
6665084Sjohnlev 
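/*
 * Power off an already-quiesced CPU: take the barrier, ask the hypervisor
 * to down the vcpu, and if that succeeds mark the cpu_t powered off and
 * move it to CPU_PHASE_POWERED_OFF.  Called (via the xenstore watch path)
 * with cpu_lock held.
 */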
6675084Sjohnlev static int
6685084Sjohnlev poweroff_vcpu(struct cpu *cp)
6695084Sjohnlev {
6705084Sjohnlev 	int error;
6715084Sjohnlev 
6725084Sjohnlev 	ASSERT(MUTEX_HELD(&cpu_lock));
6735084Sjohnlev 
6745084Sjohnlev 	ASSERT(CPU->cpu_id != cp->cpu_id);
6755084Sjohnlev 	ASSERT(cp->cpu_flags & CPU_QUIESCED);
6765084Sjohnlev 
6775159Sjohnlev 	mp_enter_barrier();
6785084Sjohnlev 
6795084Sjohnlev 	if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
6805159Sjohnlev 		ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);
6815159Sjohnlev 
6825084Sjohnlev 		CPUSET_DEL(cpu_ready_set, cp->cpu_id);
6835159Sjohnlev 
6845084Sjohnlev 		cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
6855084Sjohnlev 		cp->cpu_flags &=
6865084Sjohnlev 		    ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);
6875084Sjohnlev 
6885159Sjohnlev 		cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;
6895159Sjohnlev 
6905084Sjohnlev 		cpu_set_state(cp);
6915084Sjohnlev 	}
6925159Sjohnlev 
6935159Sjohnlev 	mp_leave_barrier();
6945159Sjohnlev 
6955084Sjohnlev 	return (error);
6965084Sjohnlev }
6975084Sjohnlev 
6985084Sjohnlev static int
6995084Sjohnlev vcpu_config_poweroff(processorid_t id)
7005084Sjohnlev {
7015084Sjohnlev 	int oldstate;
7025084Sjohnlev 	int error;
7035084Sjohnlev 	cpu_t *cp;
7045084Sjohnlev 
7055084Sjohnlev 	mutex_enter(&cpu_lock);
7065084Sjohnlev 
7075084Sjohnlev 	if ((cp = cpu_get(id)) == NULL) {
7085084Sjohnlev 		mutex_exit(&cpu_lock);
7095084Sjohnlev 		return (ESRCH);
7105084Sjohnlev 	}
7115084Sjohnlev 
7125084Sjohnlev 	if (cpu_get_state(cp) == P_POWEROFF) {
7135084Sjohnlev 		mutex_exit(&cpu_lock);
7145084Sjohnlev 		return (0);
7155084Sjohnlev 	}
7165084Sjohnlev 
7175084Sjohnlev 	mutex_exit(&cpu_lock);
7185084Sjohnlev 
7195084Sjohnlev 	do {
7205084Sjohnlev 		error = p_online_internal(id, P_OFFLINE,
7215084Sjohnlev 		    &oldstate);
7225084Sjohnlev 
7235084Sjohnlev 		if (error != 0)
7245084Sjohnlev 			break;
7255084Sjohnlev 
7265084Sjohnlev 		/*
7275084Sjohnlev 		 * So we just changed it to P_OFFLINE.  But then we dropped
7285084Sjohnlev 		 * cpu_lock, so now it is possible for another thread to change
7295084Sjohnlev 		 * the cpu back to a different, non-quiesced state e.g.
7305084Sjohnlev 		 * P_ONLINE.
7315084Sjohnlev 		 */
7325084Sjohnlev 		mutex_enter(&cpu_lock);
7335084Sjohnlev 		if ((cp = cpu_get(id)) == NULL)
7345084Sjohnlev 			error = ESRCH;
7355084Sjohnlev 		else {
7365084Sjohnlev 			if (cp->cpu_flags & CPU_QUIESCED)
7375084Sjohnlev 				error = poweroff_vcpu(cp);
7385084Sjohnlev 			else
7395084Sjohnlev 				error = EBUSY;
7405084Sjohnlev 		}
7415084Sjohnlev 		mutex_exit(&cpu_lock);
7425084Sjohnlev 	} while (error == EBUSY);
7435084Sjohnlev 
7445084Sjohnlev 	return (error);
7455084Sjohnlev }
7465084Sjohnlev 
7475084Sjohnlev /*
7485084Sjohnlev  * Add a new virtual cpu to the domain.
7495084Sjohnlev  */
7505084Sjohnlev static int
7515084Sjohnlev vcpu_config_new(processorid_t id)
7525084Sjohnlev {
7535084Sjohnlev 	extern int start_cpu(processorid_t);
7545084Sjohnlev 	int error;
7555084Sjohnlev 
7565084Sjohnlev 	if (ncpus == 1) {
7575084Sjohnlev 		printf("cannot (yet) add cpus to a single-cpu domain\n");
7585084Sjohnlev 		return (ENOTSUP);
7595084Sjohnlev 	}
7605084Sjohnlev 
7615084Sjohnlev 	affinity_set(CPU_CURRENT);
7625084Sjohnlev 	error = start_cpu(id);
7635084Sjohnlev 	affinity_clear();
7645084Sjohnlev 	return (error);
7655084Sjohnlev }
7665084Sjohnlev 
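/*
 * Bring a previously powered-off vcpu back up: verify with the hypervisor
 * that it can be brought up, ask for it to run again, and restore the
 * cpu_t flags, cpu_ready_set membership and CPU phase.  cpu_lock must be
 * held.
 */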
7675084Sjohnlev static int
7685159Sjohnlev poweron_vcpu(struct cpu *cp)
7695159Sjohnlev {
7705159Sjohnlev 	int error;
7715159Sjohnlev 
7725159Sjohnlev 	ASSERT(MUTEX_HELD(&cpu_lock));
7735159Sjohnlev 
7745159Sjohnlev 	if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
7755159Sjohnlev 		printf("poweron_vcpu: vcpu%d is not available!\n",
7765159Sjohnlev 		    cp->cpu_id);
7775159Sjohnlev 		return (ENXIO);
7785159Sjohnlev 	}
7795159Sjohnlev 
7805159Sjohnlev 	if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
7815159Sjohnlev 		CPUSET_ADD(cpu_ready_set, cp->cpu_id);
7825159Sjohnlev 		cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
7835159Sjohnlev 		cp->cpu_flags &= ~CPU_POWEROFF;
7845159Sjohnlev 		/*
7855159Sjohnlev 		 * There are some nasty races possible here.
7865159Sjohnlev 		 * Tell the vcpu it's up one more time.
7875159Sjohnlev 		 * XXPV	Is this enough?  Is this safe?
7885159Sjohnlev 		 */
7895159Sjohnlev 		(void) xen_vcpu_up(cp->cpu_id);
7905159Sjohnlev 
7915159Sjohnlev 		cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;
7925159Sjohnlev 
7935159Sjohnlev 		cpu_set_state(cp);
7945159Sjohnlev 	}
7955159Sjohnlev 	return (error);
7965159Sjohnlev }
7975159Sjohnlev 
7985159Sjohnlev static int
7995084Sjohnlev vcpu_config_poweron(processorid_t id)
8005084Sjohnlev {
8015084Sjohnlev 	cpu_t *cp;
8025084Sjohnlev 	int oldstate;
8035084Sjohnlev 	int error;
8045084Sjohnlev 
8055084Sjohnlev 	if (id >= ncpus)
8065084Sjohnlev 		return (vcpu_config_new(id));
8075084Sjohnlev 
8085084Sjohnlev 	mutex_enter(&cpu_lock);
8095084Sjohnlev 
8105084Sjohnlev 	if ((cp = cpu_get(id)) == NULL) {
8115084Sjohnlev 		mutex_exit(&cpu_lock);
8125084Sjohnlev 		return (ESRCH);
8135084Sjohnlev 	}
8145084Sjohnlev 
8155084Sjohnlev 	if (cpu_get_state(cp) != P_POWEROFF) {
8165084Sjohnlev 		mutex_exit(&cpu_lock);
8175084Sjohnlev 		return (0);
8185084Sjohnlev 	}
8195084Sjohnlev 
8205084Sjohnlev 	if ((error = poweron_vcpu(cp)) != 0) {
8215084Sjohnlev 		mutex_exit(&cpu_lock);
8225084Sjohnlev 		return (error);
8235084Sjohnlev 	}
8245084Sjohnlev 
8255084Sjohnlev 	mutex_exit(&cpu_lock);
8265084Sjohnlev 
8275084Sjohnlev 	return (p_online_internal(id, P_ONLINE, &oldstate));
8285084Sjohnlev }
8295084Sjohnlev 
8305084Sjohnlev #define	REPORT_LEN	128
8315084Sjohnlev 
8325084Sjohnlev static void
8335084Sjohnlev vcpu_config_report(processorid_t id, uint_t newstate, int error)
8345084Sjohnlev {
8355084Sjohnlev 	char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
8365084Sjohnlev 	size_t len;
8375084Sjohnlev 	char *ps;
8385084Sjohnlev 
8395084Sjohnlev 	switch (newstate) {
8405084Sjohnlev 	case P_ONLINE:
8415084Sjohnlev 		ps = PS_ONLINE;
8425084Sjohnlev 		break;
8435084Sjohnlev 	case P_POWEROFF:
8445084Sjohnlev 		ps = PS_POWEROFF;
8455084Sjohnlev 		break;
8465084Sjohnlev 	default:
8475084Sjohnlev 		cmn_err(CE_PANIC, "unknown state %u\n", newstate);
8485084Sjohnlev 		break;
8495084Sjohnlev 	}
8505084Sjohnlev 
8515084Sjohnlev 	len = snprintf(report, REPORT_LEN,
8525084Sjohnlev 	    "cpu%d: externally initiated %s", id, ps);
8535084Sjohnlev 
8545084Sjohnlev 	if (!error) {
8555084Sjohnlev 		cmn_err(CE_CONT, "!%s\n", report);
8565084Sjohnlev 		kmem_free(report, REPORT_LEN);
8575084Sjohnlev 		return;
8585084Sjohnlev 	}
8595084Sjohnlev 
8605084Sjohnlev 	len += snprintf(report + len, REPORT_LEN - len,
8615084Sjohnlev 	    " failed, error %d: ", error);
8625084Sjohnlev 	switch (error) {
8635084Sjohnlev 	case EEXIST:
8645084Sjohnlev 		len += snprintf(report + len, REPORT_LEN - len,
8655084Sjohnlev 		    "cpu already %s", ps ? ps : "?");
8665084Sjohnlev 		break;
8675084Sjohnlev 	case ESRCH:
8685084Sjohnlev 		len += snprintf(report + len, REPORT_LEN - len,
8695084Sjohnlev 		    "cpu not found");
8705084Sjohnlev 		break;
8715084Sjohnlev 	case EINVAL:
8725084Sjohnlev 	case EALREADY:
8735084Sjohnlev 		break;
8745084Sjohnlev 	case EPERM:
8755084Sjohnlev 		len += snprintf(report + len, REPORT_LEN - len,
8765084Sjohnlev 		    "insufficient privilege (0x%x)", id);
8775084Sjohnlev 		break;
8785084Sjohnlev 	case EBUSY:
8795084Sjohnlev 		switch (newstate) {
8805084Sjohnlev 		case P_ONLINE:
8815084Sjohnlev 			/*
8825084Sjohnlev 			 * This return comes from mp_cpu_start -
8835084Sjohnlev 			 * we cannot 'start' the boot CPU.
8845084Sjohnlev 			 */
8855084Sjohnlev 			len += snprintf(report + len, REPORT_LEN - len,
8865084Sjohnlev 			    "already running");
8875084Sjohnlev 			break;
8885084Sjohnlev 		case P_POWEROFF:
8895084Sjohnlev 			len += snprintf(report + len, REPORT_LEN - len,
8905084Sjohnlev 			    "bound lwps?");
8915084Sjohnlev 			break;
8925084Sjohnlev 		default:
8935084Sjohnlev 			break;
8945084Sjohnlev 		}
8955084Sjohnlev 	default:
8965084Sjohnlev 		break;
8975084Sjohnlev 	}
8985084Sjohnlev 
8995084Sjohnlev 	cmn_err(CE_CONT, "%s\n", report);
9005084Sjohnlev 	kmem_free(report, REPORT_LEN);
9015084Sjohnlev }
9025084Sjohnlev 
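/*
 * Taskq callback: apply a single vcpu's target state from the store.  As
 * a rough illustration (the exact path layout depends on the toolstack),
 * a request to offline vcpu 2 shows up as a write of
 *
 *	cpu/2/availability = "offline"
 *
 * which fires the "cpu" watch, and we end up here via vcpu_config_event().
 */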
9035084Sjohnlev static void
9045084Sjohnlev vcpu_config(void *arg)
9055084Sjohnlev {
9065084Sjohnlev 	int id = (int)(uintptr_t)arg;
9075084Sjohnlev 	int error;
9085084Sjohnlev 	char dir[16];
9095084Sjohnlev 	char *state;
9105084Sjohnlev 
9115084Sjohnlev 	if ((uint_t)id >= max_ncpus) {
9125084Sjohnlev 		cmn_err(CE_WARN,
9135084Sjohnlev 		    "vcpu_config: cpu%d does not fit in this domain", id);
9145084Sjohnlev 		return;
9155084Sjohnlev 	}
9165084Sjohnlev 
9175084Sjohnlev 	(void) snprintf(dir, sizeof (dir), "cpu/%d", id);
9185084Sjohnlev 	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
9195084Sjohnlev 	if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
9205084Sjohnlev 		if (strcmp(state, "online") == 0) {
9215084Sjohnlev 			error = vcpu_config_poweron(id);
9225084Sjohnlev 			vcpu_config_report(id, P_ONLINE, error);
9235084Sjohnlev 		} else if (strcmp(state, "offline") == 0) {
9245084Sjohnlev 			error = vcpu_config_poweroff(id);
9255084Sjohnlev 			vcpu_config_report(id, P_POWEROFF, error);
9265084Sjohnlev 		} else {
9275084Sjohnlev 			cmn_err(CE_WARN,
9285084Sjohnlev 			    "cpu%d: unknown target state '%s'", id, state);
9295084Sjohnlev 		}
9305084Sjohnlev 	} else
9315084Sjohnlev 		cmn_err(CE_WARN,
9325084Sjohnlev 		    "cpu%d: unable to read target state from xenstore", id);
9335084Sjohnlev 
9345084Sjohnlev 	kmem_free(state, MAXPATHLEN);
9355084Sjohnlev }
9365084Sjohnlev 
9375084Sjohnlev /*ARGSUSED*/
9385084Sjohnlev static void
9395084Sjohnlev vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
9405084Sjohnlev {
9415084Sjohnlev 	const char *path = vec[XS_WATCH_PATH];
9425084Sjohnlev 	processorid_t id;
9435084Sjohnlev 	char *s;
9445084Sjohnlev 
9455084Sjohnlev 	if ((s = strstr(path, "cpu/")) != NULL &&
9465084Sjohnlev 	    sscanf(s, "cpu/%d", &id) == 1) {
9475084Sjohnlev 		/*
9485084Sjohnlev 		 * Run the virtual CPU configuration on a separate thread to
9495084Sjohnlev 		 * avoid blocking on this event for too long (and for now,
9505084Sjohnlev 		 * to ensure configuration requests are serialized.)
9515084Sjohnlev 		 */
9525084Sjohnlev 		(void) taskq_dispatch(cpu_config_tq,
9535084Sjohnlev 		    vcpu_config, (void *)(uintptr_t)id, 0);
9545084Sjohnlev 	}
9555084Sjohnlev }
9565084Sjohnlev 
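/*
 * Hand the initial register context for vcpu 'id' to the hypervisor
 * (VCPUOP_initialise), decoding the more common failure codes into
 * something readable.
 */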
9575084Sjohnlev static int
9585084Sjohnlev xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
9595084Sjohnlev {
9605084Sjohnlev 	int err;
9615084Sjohnlev 
9625084Sjohnlev 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
9635084Sjohnlev 		char *str;
9645084Sjohnlev 		int level = CE_WARN;
9655084Sjohnlev 
9665084Sjohnlev 		switch (err) {
9675084Sjohnlev 		case -X_EINVAL:
9685084Sjohnlev 			/*
9695084Sjohnlev 			 * This interface squashes multiple error sources
9705084Sjohnlev 			 * to one error code.  In particular, an X_EINVAL
9715084Sjohnlev 			 * code can mean:
9725084Sjohnlev 			 *
9735084Sjohnlev 			 * -	the vcpu id is out of range
9745084Sjohnlev 			 * -	cs or ss are in ring 0
9755084Sjohnlev 			 * -	cr3 is wrong
9765084Sjohnlev 			 * -	an entry in the new gdt is above the
9775084Sjohnlev 			 *	reserved entry
9785084Sjohnlev 			 * -	a frame underneath the new gdt is bad
9795084Sjohnlev 			 */
9805084Sjohnlev 			str = "something is wrong :(";
9815084Sjohnlev 			break;
9825084Sjohnlev 		case -X_ENOENT:
9835084Sjohnlev 			str = "no such cpu";
9845084Sjohnlev 			break;
9855084Sjohnlev 		case -X_ENOMEM:
9865084Sjohnlev 			str = "no mem to copy ctxt";
9875084Sjohnlev 			break;
9885084Sjohnlev 		case -X_EFAULT:
9895084Sjohnlev 			str = "bad address";
9905084Sjohnlev 			break;
9915084Sjohnlev 		case -X_EEXIST:
9925084Sjohnlev 			/*
9935084Sjohnlev 			 * Hmm.  This error is returned if the vcpu has already
9945084Sjohnlev 			 * been initialized once before in the lifetime of this
9955084Sjohnlev 			 * domain.  This is a logic error in the kernel.
9965084Sjohnlev 			 */
9975084Sjohnlev 			level = CE_PANIC;
9985084Sjohnlev 			str = "already initialized";
9995084Sjohnlev 			break;
10005084Sjohnlev 		default:
10015084Sjohnlev 			level = CE_PANIC;
10025084Sjohnlev 			str = "<unexpected>";
10035084Sjohnlev 			break;
10045084Sjohnlev 		}
10055084Sjohnlev 
10065084Sjohnlev 		cmn_err(level, "vcpu%d: failed to init: error %d: %s",
10075084Sjohnlev 		    id, -err, str);
10085084Sjohnlev 	}
10095084Sjohnlev 	return (err);
10105084Sjohnlev }
10115084Sjohnlev 
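/*
 * Ask the hypervisor to start running the given vcpu (VCPUOP_up).
 * Returns 0 on success; on failure we log a diagnostic and return a
 * deliberately odd errno so callers can't mistake it for a real one.
 */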
10125084Sjohnlev long
10135084Sjohnlev xen_vcpu_up(processorid_t id)
10145084Sjohnlev {
10155084Sjohnlev 	long err;
10165084Sjohnlev 
10175084Sjohnlev 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
10185084Sjohnlev 		char *str;
10195084Sjohnlev 
10205084Sjohnlev 		switch (err) {
10215084Sjohnlev 		case -X_ENOENT:
10225084Sjohnlev 			str = "no such cpu";
10235084Sjohnlev 			break;
10245084Sjohnlev 		case -X_EINVAL:
10255084Sjohnlev 			/*
10265084Sjohnlev 			 * Perhaps this is diagnostic overkill.
10275084Sjohnlev 			 */
10285084Sjohnlev 			if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
10295084Sjohnlev 				str = "bad cpuid";
10305084Sjohnlev 			else
10315084Sjohnlev 				str = "not initialized";
10325084Sjohnlev 			break;
10335084Sjohnlev 		default:
10345084Sjohnlev 			str = "<unexpected>";
10355084Sjohnlev 			break;
10365084Sjohnlev 		}
10375084Sjohnlev 
10385084Sjohnlev 		printf("vcpu%d: failed to start: error %d: %s\n",
10395084Sjohnlev 		    id, -(int)err, str);
10405084Sjohnlev 		return (EBFONT);	/* deliberately silly */
10415084Sjohnlev 	}
10425084Sjohnlev 	return (err);
10435084Sjohnlev }
10445084Sjohnlev 
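/*
 * Ask the hypervisor to stop running the given vcpu (VCPUOP_down).
 * Failure here means our bookkeeping is broken, so we panic.
 */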
10455084Sjohnlev long
10465084Sjohnlev xen_vcpu_down(processorid_t id)
10475084Sjohnlev {
10485084Sjohnlev 	long err;
10495084Sjohnlev 
10505084Sjohnlev 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
10515084Sjohnlev 		/*
10525084Sjohnlev 		 * X_ENOENT:	no such cpu
10535084Sjohnlev 		 * X_EINVAL:	bad cpuid
10545084Sjohnlev 		 */
10555084Sjohnlev 		panic("vcpu%d: failed to stop: error %d", id, -(int)err);
10565084Sjohnlev 	}
10575084Sjohnlev 
10585084Sjohnlev 	return (err);
10595084Sjohnlev }
1060