xref: /illumos-gate/usr/src/uts/i86pc/ml/syscall_asm_amd64.S (revision 5cd084edc8f03fe81d55c25eb29741c4081ae953)
15d9d9091SRichard Lowe/*
25d9d9091SRichard Lowe * CDDL HEADER START
35d9d9091SRichard Lowe *
45d9d9091SRichard Lowe * The contents of this file are subject to the terms of the
55d9d9091SRichard Lowe * Common Development and Distribution License (the "License").
65d9d9091SRichard Lowe * You may not use this file except in compliance with the License.
75d9d9091SRichard Lowe *
85d9d9091SRichard Lowe * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
95d9d9091SRichard Lowe * or http://www.opensolaris.org/os/licensing.
105d9d9091SRichard Lowe * See the License for the specific language governing permissions
115d9d9091SRichard Lowe * and limitations under the License.
125d9d9091SRichard Lowe *
135d9d9091SRichard Lowe * When distributing Covered Code, include this CDDL HEADER in each
145d9d9091SRichard Lowe * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
155d9d9091SRichard Lowe * If applicable, add the following below this CDDL HEADER, with the
165d9d9091SRichard Lowe * fields enclosed by brackets "[]" replaced with your own identifying
175d9d9091SRichard Lowe * information: Portions Copyright [yyyy] [name of copyright owner]
185d9d9091SRichard Lowe *
195d9d9091SRichard Lowe * CDDL HEADER END
205d9d9091SRichard Lowe */
215d9d9091SRichard Lowe/*
225d9d9091SRichard Lowe * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
235d9d9091SRichard Lowe * Copyright 2019 Joyent, Inc.
245d9d9091SRichard Lowe * Copyright (c) 2016 by Delphix. All rights reserved.
25*5cd084edSDan McDonald * Copyright 2024 MNX Cloud, Inc.
265d9d9091SRichard Lowe */
275d9d9091SRichard Lowe
285d9d9091SRichard Lowe#include <sys/asm_linkage.h>
295d9d9091SRichard Lowe#include <sys/asm_misc.h>
305d9d9091SRichard Lowe#include <sys/regset.h>
315d9d9091SRichard Lowe#include <sys/privregs.h>
325d9d9091SRichard Lowe#include <sys/psw.h>
335d9d9091SRichard Lowe#include <sys/machbrand.h>
345d9d9091SRichard Lowe
355d9d9091SRichard Lowe#include <sys/segments.h>
365d9d9091SRichard Lowe#include <sys/pcb.h>
375d9d9091SRichard Lowe#include <sys/trap.h>
385d9d9091SRichard Lowe#include <sys/ftrace.h>
395d9d9091SRichard Lowe#include <sys/traptrace.h>
405d9d9091SRichard Lowe#include <sys/clock.h>
415d9d9091SRichard Lowe#include <sys/model.h>
425d9d9091SRichard Lowe#include <sys/panic.h>
435d9d9091SRichard Lowe
445d9d9091SRichard Lowe#if defined(__xpv)
455d9d9091SRichard Lowe#include <sys/hypervisor.h>
465d9d9091SRichard Lowe#endif
475d9d9091SRichard Lowe
485d9d9091SRichard Lowe#include "assym.h"
495d9d9091SRichard Lowe
505d9d9091SRichard Lowe/*
515d9d9091SRichard Lowe * We implement five flavours of system call entry points
525d9d9091SRichard Lowe *
535d9d9091SRichard Lowe * -	syscall/sysretq		(amd64 generic)
545d9d9091SRichard Lowe * -	syscall/sysretl		(i386 plus SYSC bit)
555d9d9091SRichard Lowe * -	sysenter/sysexit	(i386 plus SEP bit)
565d9d9091SRichard Lowe * -	int/iret		(i386 generic)
575d9d9091SRichard Lowe * -	lcall/iret		(i386 generic)
585d9d9091SRichard Lowe *
595d9d9091SRichard Lowe * The current libc included in Solaris uses int/iret as the base unoptimized
605d9d9091SRichard Lowe * kernel entry method. Older libc implementations and legacy binaries may use
615d9d9091SRichard Lowe * the lcall call gate, so it must continue to be supported.
625d9d9091SRichard Lowe *
635d9d9091SRichard Lowe * System calls that use an lcall call gate are processed in trap() via a
645d9d9091SRichard Lowe * segment-not-present trap, i.e. lcalls are extremely slow(!).
655d9d9091SRichard Lowe *
665d9d9091SRichard Lowe * The basic pattern used in the 32-bit SYSC handler at this point in time is
675d9d9091SRichard Lowe * to have the bare minimum of assembler, and get to the C handlers as
685d9d9091SRichard Lowe * quickly as possible.
695d9d9091SRichard Lowe *
705d9d9091SRichard Lowe * The 64-bit handler is much closer to the sparcv9 handler; that's
715d9d9091SRichard Lowe * because of passing arguments in registers.  The 32-bit world still
725d9d9091SRichard Lowe * passes arguments on the stack -- that makes that handler substantially
735d9d9091SRichard Lowe * more complex.
745d9d9091SRichard Lowe *
755d9d9091SRichard Lowe * The two handlers share a few code fragments which are broken
765d9d9091SRichard Lowe * out into preprocessor macros below.
775d9d9091SRichard Lowe *
 * XX64	come back and speed all this up later.  The argument copying in
 * the 32-bit path looks especially easy to speed up ..
805d9d9091SRichard Lowe *
815d9d9091SRichard Lowe *
825d9d9091SRichard Lowe * Notes about segment register usage (c.f. the 32-bit kernel)
835d9d9091SRichard Lowe *
845d9d9091SRichard Lowe * In the 32-bit kernel, segment registers are dutifully saved and
855d9d9091SRichard Lowe * restored on all mode transitions because the kernel uses them directly.
865d9d9091SRichard Lowe * When the processor is running in 64-bit mode, segment registers are
875d9d9091SRichard Lowe * largely ignored.
885d9d9091SRichard Lowe *
895d9d9091SRichard Lowe * %cs and %ss
905d9d9091SRichard Lowe *	controlled by the hardware mechanisms that make mode transitions
915d9d9091SRichard Lowe *
 * The remaining segment registers must either point at a valid descriptor,
 * i.e. one with the 'present' bit set, or be NULL descriptors.
945d9d9091SRichard Lowe *
955d9d9091SRichard Lowe * %ds and %es
965d9d9091SRichard Lowe *	always ignored
975d9d9091SRichard Lowe *
985d9d9091SRichard Lowe * %fs and %gs
995d9d9091SRichard Lowe *	fsbase and gsbase are used to control the place they really point at.
1005d9d9091SRichard Lowe *	The kernel only depends on %gs, and controls its own gsbase via swapgs
1015d9d9091SRichard Lowe *
1025d9d9091SRichard Lowe * Note that loading segment registers is still costly because the GDT
1035d9d9091SRichard Lowe * lookup still happens (this is because the hardware can't know that we're
1045d9d9091SRichard Lowe * not setting up these segment registers for a 32-bit program).  Thus we
1055d9d9091SRichard Lowe * avoid doing this in the syscall path, and defer them to lwp context switch
1065d9d9091SRichard Lowe * handlers, so the register values remain virtualized to the lwp.
1075d9d9091SRichard Lowe */
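
/*
 * For illustration only: a rough C sketch of the deferral described above,
 * as seen from the return-to-userland side.  The real logic lives in the
 * lwp context switch and trap return paths, not in this file, and the
 * exact shape of update_sregs() is assumed here:
 *
 *	if (lwp->lwp_pcb.pcb_rupdate != 0) {
 *		update_sregs(rp, lwp);
 *		lwp->lwp_pcb.pcb_rupdate = 0;
 *	}
 *
 * The fast paths below simply check that pcb_rupdate is still clear and
 * fall back to the full post-syscall handling if it is not.
 */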
1085d9d9091SRichard Lowe
1095d9d9091SRichard Lowe#if defined(SYSCALLTRACE)
1105d9d9091SRichard Lowe#define	ORL_SYSCALLTRACE(r32)		\
1115d9d9091SRichard Lowe	orl	syscalltrace(%rip), r32
1125d9d9091SRichard Lowe#else
1135d9d9091SRichard Lowe#define	ORL_SYSCALLTRACE(r32)
1145d9d9091SRichard Lowe#endif
1155d9d9091SRichard Lowe
1165d9d9091SRichard Lowe/*
1175d9d9091SRichard Lowe * In the 32-bit kernel, we do absolutely nothing before getting into the
1185d9d9091SRichard Lowe * brand callback checks.  In 64-bit land, we do swapgs and then come here.
1195d9d9091SRichard Lowe * We assume that the %rsp- and %r15-stashing fields in the CPU structure
1205d9d9091SRichard Lowe * are still unused.
1215d9d9091SRichard Lowe *
1225d9d9091SRichard Lowe * Check if a brand_mach_ops callback is defined for the specified callback_id
1235d9d9091SRichard Lowe * type.  If so invoke it with the kernel's %gs value loaded and the following
1245d9d9091SRichard Lowe * data on the stack:
1255d9d9091SRichard Lowe *
1265d9d9091SRichard Lowe * stack:  --------------------------------------
1275d9d9091SRichard Lowe *      32 | callback pointer			|
1285d9d9091SRichard Lowe *    | 24 | user (or interrupt) stack pointer	|
1295d9d9091SRichard Lowe *    | 16 | lwp pointer			|
1305d9d9091SRichard Lowe *    v  8 | userland return address		|
1315d9d9091SRichard Lowe *       0 | callback wrapper return addr	|
1325d9d9091SRichard Lowe *         --------------------------------------
1335d9d9091SRichard Lowe *
1345d9d9091SRichard Lowe * Since we're pushing the userland return address onto the kernel stack
1355d9d9091SRichard Lowe * we need to get that address without accessing the user's stack (since we
1365d9d9091SRichard Lowe * can't trust that data).  There are different ways to get the userland
1375d9d9091SRichard Lowe * return address depending on how the syscall trap was made:
1385d9d9091SRichard Lowe *
1395d9d9091SRichard Lowe * a) For sys_syscall and sys_syscall32 the return address is in %rcx.
1405d9d9091SRichard Lowe * b) For sys_sysenter the return address is in %rdx.
1415d9d9091SRichard Lowe * c) For sys_int80 and sys_syscall_int (int91), upon entry into the macro,
1425d9d9091SRichard Lowe *    the stack pointer points at the state saved when we took the interrupt:
1435d9d9091SRichard Lowe *	 ------------------------
1445d9d9091SRichard Lowe *    |  | user's %ss		|
1455d9d9091SRichard Lowe *    |  | user's %esp		|
1465d9d9091SRichard Lowe *    |  | EFLAGS register	|
1475d9d9091SRichard Lowe *    v  | user's %cs		|
1485d9d9091SRichard Lowe *       | user's %eip		|
1495d9d9091SRichard Lowe *	 ------------------------
1505d9d9091SRichard Lowe *
1515d9d9091SRichard Lowe * The 2nd parameter to the BRAND_CALLBACK macro is either the
1525d9d9091SRichard Lowe * BRAND_URET_FROM_REG or BRAND_URET_FROM_INTR_STACK macro.  These macros are
1535d9d9091SRichard Lowe * used to generate the proper code to get the userland return address for
1545d9d9091SRichard Lowe * each syscall entry point.
1555d9d9091SRichard Lowe *
1565d9d9091SRichard Lowe * The interface to the brand callbacks on the 64-bit kernel assumes %r15
1575d9d9091SRichard Lowe * is available as a scratch register within the callback.  If the callback
1585d9d9091SRichard Lowe * returns within the kernel then this macro will restore %r15.  If the
1595d9d9091SRichard Lowe * callback is going to return directly to userland then it should restore
1605d9d9091SRichard Lowe * %r15 before returning to userland.
1615d9d9091SRichard Lowe */
1625d9d9091SRichard Lowe#define BRAND_URET_FROM_REG(rip_reg)					\
1635d9d9091SRichard Lowe	pushq	rip_reg			/* push the return address	*/
1645d9d9091SRichard Lowe
1655d9d9091SRichard Lowe/*
1665d9d9091SRichard Lowe * The interrupt stack pointer we saved on entry to the BRAND_CALLBACK macro
1675d9d9091SRichard Lowe * is currently pointing at the user return address (%eip).
1685d9d9091SRichard Lowe */
1695d9d9091SRichard Lowe#define BRAND_URET_FROM_INTR_STACK()					\
1705d9d9091SRichard Lowe	movq	%gs:CPU_RTMP_RSP, %r15	/* grab the intr. stack pointer	*/ ;\
1715d9d9091SRichard Lowe	pushq	(%r15)			/* push the return address	*/
1725d9d9091SRichard Lowe
1735d9d9091SRichard Lowe#define	BRAND_CALLBACK(callback_id, push_userland_ret)			    \
1745d9d9091SRichard Lowe	movq	%rsp, %gs:CPU_RTMP_RSP	/* save the stack pointer	*/ ;\
1755d9d9091SRichard Lowe	movq	%r15, %gs:CPU_RTMP_R15	/* save %r15			*/ ;\
1765d9d9091SRichard Lowe	movq	%gs:CPU_THREAD, %r15	/* load the thread pointer	*/ ;\
1775d9d9091SRichard Lowe	movq	T_STACK(%r15), %rsp	/* switch to the kernel stack	*/ ;\
1785d9d9091SRichard Lowe	subq	$16, %rsp		/* save space for 2 pointers	*/ ;\
1795d9d9091SRichard Lowe	pushq	%r14			/* save %r14			*/ ;\
1805d9d9091SRichard Lowe	movq	%gs:CPU_RTMP_RSP, %r14					   ;\
1815d9d9091SRichard Lowe	movq	%r14, 8(%rsp)		/* stash the user stack pointer	*/ ;\
1825d9d9091SRichard Lowe	popq	%r14			/* restore %r14			*/ ;\
1835d9d9091SRichard Lowe	movq	T_LWP(%r15), %r15	/* load the lwp pointer		*/ ;\
1845d9d9091SRichard Lowe	pushq	%r15			/* push the lwp pointer		*/ ;\
1855d9d9091SRichard Lowe	movq	LWP_PROCP(%r15), %r15	/* load the proc pointer	*/ ;\
1865d9d9091SRichard Lowe	movq	P_BRAND(%r15), %r15	/* load the brand pointer	*/ ;\
1875d9d9091SRichard Lowe	movq	B_MACHOPS(%r15), %r15	/* load the machops pointer	*/ ;\
1885d9d9091SRichard Lowe	movq	_CONST(_MUL(callback_id, CPTRSIZE))(%r15), %r15		   ;\
1895d9d9091SRichard Lowe	cmpq	$0, %r15						   ;\
1905d9d9091SRichard Lowe	je	1f							   ;\
1915d9d9091SRichard Lowe	movq	%r15, 16(%rsp)		/* save the callback pointer	*/ ;\
1925d9d9091SRichard Lowe	push_userland_ret		/* push the return address	*/ ;\
1935d9d9091SRichard Lowe	movq	24(%rsp), %r15		/* load callback pointer	*/ ;\
1945d9d9091SRichard Lowe	INDIRECT_CALL_REG(r15)		/* call callback		*/ ;\
1955d9d9091SRichard Lowe1:	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15			*/ ;\
1965d9d9091SRichard Lowe	movq	%gs:CPU_RTMP_RSP, %rsp	/* restore the stack pointer	*/
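
/*
 * For reference, the pointer chase in BRAND_CALLBACK above is roughly the
 * following C (a sketch only: the member names mirror the assym offsets
 * used above, the cast is illustrative, and all of the stack juggling is
 * omitted):
 *
 *	brand_t *bp = curthread->t_lwp->lwp_procp->p_brand;
 *	void (*cb)(void) = ((void (**)(void))bp->b_machops)[callback_id];
 *	if (cb != NULL)
 *		cb();
 */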
1975d9d9091SRichard Lowe
1985d9d9091SRichard Lowe#define	MSTATE_TRANSITION(from, to)		\
1995d9d9091SRichard Lowe	movl	$from, %edi;			\
2005d9d9091SRichard Lowe	movl	$to, %esi;			\
2015d9d9091SRichard Lowe	call	syscall_mstate
2025d9d9091SRichard Lowe
2035d9d9091SRichard Lowe/*
2045d9d9091SRichard Lowe * Check to see if a simple (direct) return is possible i.e.
2055d9d9091SRichard Lowe *
2065d9d9091SRichard Lowe *	if (t->t_post_sys_ast | syscalltrace |
2075d9d9091SRichard Lowe *	    lwp->lwp_pcb.pcb_rupdate == 1)
2085d9d9091SRichard Lowe *		do full version	;
2095d9d9091SRichard Lowe *
2105d9d9091SRichard Lowe * Preconditions:
2115d9d9091SRichard Lowe * -	t is curthread
2125d9d9091SRichard Lowe * Postconditions:
2135d9d9091SRichard Lowe * -	condition code NE is set if post-sys is too complex
2145d9d9091SRichard Lowe * -	rtmp is zeroed if it isn't (we rely on this!)
2155d9d9091SRichard Lowe * -	ltmp is smashed
2165d9d9091SRichard Lowe */
2175d9d9091SRichard Lowe#define	CHECK_POSTSYS_NE(t, ltmp, rtmp)			\
2185d9d9091SRichard Lowe	movq	T_LWP(t), ltmp;				\
2195d9d9091SRichard Lowe	movzbl	PCB_RUPDATE(ltmp), rtmp;		\
2205d9d9091SRichard Lowe	ORL_SYSCALLTRACE(rtmp);				\
2215d9d9091SRichard Lowe	orl	T_POST_SYS_AST(t), rtmp;		\
2225d9d9091SRichard Lowe	cmpl	$0, rtmp
2235d9d9091SRichard Lowe
2245d9d9091SRichard Lowe/*
2255d9d9091SRichard Lowe * Fix up the lwp, thread, and eflags for a successful return
2265d9d9091SRichard Lowe *
2275d9d9091SRichard Lowe * Preconditions:
2285d9d9091SRichard Lowe * -	zwreg contains zero
2295d9d9091SRichard Lowe */
2305d9d9091SRichard Lowe#define	SIMPLE_SYSCALL_POSTSYS(t, lwp, zwreg)		\
2315d9d9091SRichard Lowe	movb	$LWP_USER, LWP_STATE(lwp);		\
2325d9d9091SRichard Lowe	movw	zwreg, T_SYSNUM(t);			\
2335d9d9091SRichard Lowe	andb	$_CONST(0xffff - PS_C), REGOFF_RFL(%rsp)
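
/*
 * In C terms the macro above is roughly (sketch only):
 *
 *	lwp->lwp_state = LWP_USER;
 *	t->t_sysnum = 0;
 *	rp->r_rfl &= ~PS_C;
 *
 * Clearing PS_C (carry) in the saved flags is what marks the syscall as
 * having succeeded; the libc wrappers test carry to detect error returns.
 */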
2345d9d9091SRichard Lowe
2355d9d9091SRichard Lowe/*
2365d9d9091SRichard Lowe * ASSERT(lwptoregs(lwp) == rp);
2375d9d9091SRichard Lowe *
2385d9d9091SRichard Lowe * This may seem obvious, but very odd things happen if this
2395d9d9091SRichard Lowe * assertion is false
2405d9d9091SRichard Lowe *
2415d9d9091SRichard Lowe * Preconditions:
2425d9d9091SRichard Lowe *	(%rsp is ready for normal call sequence)
2435d9d9091SRichard Lowe * Postconditions (if assertion is true):
2445d9d9091SRichard Lowe *	%r11 is smashed
2455d9d9091SRichard Lowe *
2465d9d9091SRichard Lowe * ASSERT(rp->r_cs == descnum)
2475d9d9091SRichard Lowe *
2485d9d9091SRichard Lowe * The code selector is written into the regs structure when the
2495d9d9091SRichard Lowe * lwp stack is created.  We use this ASSERT to validate that
2505d9d9091SRichard Lowe * the regs structure really matches how we came in.
2515d9d9091SRichard Lowe *
2525d9d9091SRichard Lowe * Preconditions:
2535d9d9091SRichard Lowe *	(%rsp is ready for normal call sequence)
2545d9d9091SRichard Lowe * Postconditions (if assertion is true):
2555d9d9091SRichard Lowe *	-none-
2565d9d9091SRichard Lowe *
2575d9d9091SRichard Lowe * ASSERT(lwp->lwp_pcb.pcb_rupdate == 0);
2585d9d9091SRichard Lowe *
 * If this is false, it means that we returned to userland without
 * updating the segment registers as we were supposed to.
2615d9d9091SRichard Lowe *
2625d9d9091SRichard Lowe * Note that we must ensure no interrupts or other traps intervene
2635d9d9091SRichard Lowe * between entering privileged mode and performing the assertion,
2645d9d9091SRichard Lowe * otherwise we may perform a context switch on the thread, which
2655d9d9091SRichard Lowe * will end up setting pcb_rupdate to 1 again.
2665d9d9091SRichard Lowe *
2675d9d9091SRichard Lowe * ASSERT(%cr0 & CR0_TS == 0);
2685d9d9091SRichard Lowe * Preconditions:
2695d9d9091SRichard Lowe *	(%rsp is ready for normal call sequence)
2705d9d9091SRichard Lowe * Postconditions (if assertion is true):
2715d9d9091SRichard Lowe *      (specified register is clobbered)
2725d9d9091SRichard Lowe *
 * Check to make sure that we are returning to userland and that CR0.TS
 * is not set. This is required as part of the eager FPU implementation
 * (see uts/intel/os/fpu.c for more information).
2765d9d9091SRichard Lowe */
2775d9d9091SRichard Lowe
2785d9d9091SRichard Lowe#if defined(DEBUG)
2795d9d9091SRichard Lowe
2805d9d9091SRichard Lowe__lwptoregs_msg:
2815d9d9091SRichard Lowe	.string	"syscall_asm_amd64.s:%d lwptoregs(%p) [%p] != rp [%p]"
2825d9d9091SRichard Lowe
2835d9d9091SRichard Lowe__codesel_msg:
2845d9d9091SRichard Lowe	.string	"syscall_asm_amd64.s:%d rp->r_cs [%ld] != %ld"
2855d9d9091SRichard Lowe
2865d9d9091SRichard Lowe__no_rupdate_msg:
2875d9d9091SRichard Lowe	.string	"syscall_asm_amd64.s:%d lwp %p, pcb_rupdate != 0"
2885d9d9091SRichard Lowe
2895d9d9091SRichard Lowe__bad_ts_msg:
2905d9d9091SRichard Lowe	.string "syscall_asm_amd64.s:%d CR0.TS set on user return"
2915d9d9091SRichard Lowe
2925d9d9091SRichard Lowe#define	ASSERT_LWPTOREGS(lwp, rp)			\
2935d9d9091SRichard Lowe	movq	LWP_REGS(lwp), %r11;			\
2945d9d9091SRichard Lowe	cmpq	rp, %r11;				\
2955d9d9091SRichard Lowe	je	7f;					\
2965d9d9091SRichard Lowe	leaq	__lwptoregs_msg(%rip), %rdi;		\
2975d9d9091SRichard Lowe	movl	$__LINE__, %esi;			\
2985d9d9091SRichard Lowe	movq	lwp, %rdx;				\
2995d9d9091SRichard Lowe	movq	%r11, %rcx;				\
3005d9d9091SRichard Lowe	movq	rp, %r8;				\
3015d9d9091SRichard Lowe	xorl	%eax, %eax;				\
3025d9d9091SRichard Lowe	call	panic;					\
3035d9d9091SRichard Lowe7:
3045d9d9091SRichard Lowe
3055d9d9091SRichard Lowe#define	ASSERT_NO_RUPDATE_PENDING(lwp)			\
3065d9d9091SRichard Lowe	testb	$0x1, PCB_RUPDATE(lwp);			\
3075d9d9091SRichard Lowe	je	8f;					\
3085d9d9091SRichard Lowe	movq	lwp, %rdx;				\
3095d9d9091SRichard Lowe	leaq	__no_rupdate_msg(%rip), %rdi;		\
3105d9d9091SRichard Lowe	movl	$__LINE__, %esi;			\
3115d9d9091SRichard Lowe	xorl	%eax, %eax;				\
3125d9d9091SRichard Lowe	call	panic;					\
3135d9d9091SRichard Lowe8:
3145d9d9091SRichard Lowe
3155d9d9091SRichard Lowe#define	ASSERT_CR0TS_ZERO(reg)				\
3165d9d9091SRichard Lowe	movq	%cr0, reg;				\
3175d9d9091SRichard Lowe	testq	$CR0_TS, reg;				\
3185d9d9091SRichard Lowe	jz	9f;					\
3195d9d9091SRichard Lowe	leaq	__bad_ts_msg(%rip), %rdi;		\
3205d9d9091SRichard Lowe	movl	$__LINE__, %esi;			\
3215d9d9091SRichard Lowe	xorl	%eax, %eax;				\
3225d9d9091SRichard Lowe	call	panic;					\
3235d9d9091SRichard Lowe9:
3245d9d9091SRichard Lowe
3255d9d9091SRichard Lowe#else
3265d9d9091SRichard Lowe#define	ASSERT_LWPTOREGS(lwp, rp)
3275d9d9091SRichard Lowe#define	ASSERT_NO_RUPDATE_PENDING(lwp)
3285d9d9091SRichard Lowe#define	ASSERT_CR0TS_ZERO(reg)
3295d9d9091SRichard Lowe#endif
3305d9d9091SRichard Lowe
3315d9d9091SRichard Lowe/*
3325d9d9091SRichard Lowe * Do the traptrace thing and restore any registers we used
3335d9d9091SRichard Lowe * in situ.  Assumes that %rsp is pointing at the base of
3345d9d9091SRichard Lowe * the struct regs, obviously ..
3355d9d9091SRichard Lowe */
3365d9d9091SRichard Lowe#ifdef TRAPTRACE
3375d9d9091SRichard Lowe#define	SYSCALL_TRAPTRACE(ttype)				\
3385d9d9091SRichard Lowe	TRACE_PTR(%rdi, %rbx, %ebx, %rcx, ttype);		\
3395d9d9091SRichard Lowe	TRACE_REGS(%rdi, %rsp, %rbx, %rcx);			\
3405d9d9091SRichard Lowe	TRACE_STAMP(%rdi);	/* rdtsc clobbers %eax, %edx */	\
3415d9d9091SRichard Lowe	movq	REGOFF_RAX(%rsp), %rax;				\
3425d9d9091SRichard Lowe	movq	REGOFF_RBX(%rsp), %rbx;				\
3435d9d9091SRichard Lowe	movq	REGOFF_RCX(%rsp), %rcx;				\
3445d9d9091SRichard Lowe	movq	REGOFF_RDX(%rsp), %rdx;				\
3455d9d9091SRichard Lowe	movl	%eax, TTR_SYSNUM(%rdi);				\
3465d9d9091SRichard Lowe	movq	REGOFF_RDI(%rsp), %rdi
3475d9d9091SRichard Lowe
3485d9d9091SRichard Lowe#define	SYSCALL_TRAPTRACE32(ttype)				\
3495d9d9091SRichard Lowe	SYSCALL_TRAPTRACE(ttype);				\
3505d9d9091SRichard Lowe	/* paranoia: clean the top 32-bits of the registers */	\
3515d9d9091SRichard Lowe	orl	%eax, %eax;					\
3525d9d9091SRichard Lowe	orl	%ebx, %ebx;					\
3535d9d9091SRichard Lowe	orl	%ecx, %ecx;					\
3545d9d9091SRichard Lowe	orl	%edx, %edx;					\
3555d9d9091SRichard Lowe	orl	%edi, %edi
3565d9d9091SRichard Lowe#else	/* TRAPTRACE */
3575d9d9091SRichard Lowe#define	SYSCALL_TRAPTRACE(ttype)
3585d9d9091SRichard Lowe#define	SYSCALL_TRAPTRACE32(ttype)
3595d9d9091SRichard Lowe#endif	/* TRAPTRACE */
3605d9d9091SRichard Lowe
3615d9d9091SRichard Lowe/*
3625d9d9091SRichard Lowe * The 64-bit libc syscall wrapper does this:
3635d9d9091SRichard Lowe *
3645d9d9091SRichard Lowe * fn(<args>)
3655d9d9091SRichard Lowe * {
3665d9d9091SRichard Lowe *	movq	%rcx, %r10	-- because syscall smashes %rcx
3675d9d9091SRichard Lowe *	movl	$CODE, %eax
3685d9d9091SRichard Lowe *	syscall
3695d9d9091SRichard Lowe *	<error processing>
3705d9d9091SRichard Lowe * }
3715d9d9091SRichard Lowe *
3725d9d9091SRichard Lowe * Thus when we come into the kernel:
3735d9d9091SRichard Lowe *
3745d9d9091SRichard Lowe *	%rdi, %rsi, %rdx, %r10, %r8, %r9 contain first six args
3755d9d9091SRichard Lowe *	%rax is the syscall number
3765d9d9091SRichard Lowe *	%r12-%r15 contain caller state
3775d9d9091SRichard Lowe *
3785d9d9091SRichard Lowe * The syscall instruction arranges that:
3795d9d9091SRichard Lowe *
3805d9d9091SRichard Lowe *	%rcx contains the return %rip
3815d9d9091SRichard Lowe *	%r11d contains bottom 32-bits of %rflags
3825d9d9091SRichard Lowe *	%rflags is masked (as determined by the SFMASK msr)
3835d9d9091SRichard Lowe *	%cs is set to UCS_SEL (as determined by the STAR msr)
3845d9d9091SRichard Lowe *	%ss is set to UDS_SEL (as determined by the STAR msr)
3855d9d9091SRichard Lowe *	%rip is set to sys_syscall (as determined by the LSTAR msr)
3865d9d9091SRichard Lowe *
3875d9d9091SRichard Lowe * Or in other words, we have no registers available at all.
3885d9d9091SRichard Lowe * Only swapgs can save us!
3895d9d9091SRichard Lowe *
3905d9d9091SRichard Lowe * Under the hypervisor, the swapgs has happened already.  However, the
3915d9d9091SRichard Lowe * state of the world is very different from that we're familiar with.
3925d9d9091SRichard Lowe *
3935d9d9091SRichard Lowe * In particular, we have a stack structure like that for interrupt
3945d9d9091SRichard Lowe * gates, except that the %cs and %ss registers are modified for reasons
3955d9d9091SRichard Lowe * that are not entirely clear.  Critically, the %rcx/%r11 values do
3965d9d9091SRichard Lowe * *not* reflect the usage of those registers under a 'real' syscall[1];
3975d9d9091SRichard Lowe * the stack, therefore, looks like this:
3985d9d9091SRichard Lowe *
3995d9d9091SRichard Lowe *	0x0(rsp)	potentially junk %rcx
4005d9d9091SRichard Lowe *	0x8(rsp)	potentially junk %r11
4015d9d9091SRichard Lowe *	0x10(rsp)	user %rip
4025d9d9091SRichard Lowe *	0x18(rsp)	modified %cs
4035d9d9091SRichard Lowe *	0x20(rsp)	user %rflags
4045d9d9091SRichard Lowe *	0x28(rsp)	user %rsp
4055d9d9091SRichard Lowe *	0x30(rsp)	modified %ss
4065d9d9091SRichard Lowe *
4075d9d9091SRichard Lowe *
4085d9d9091SRichard Lowe * and before continuing on, we must load the %rip into %rcx and the
4095d9d9091SRichard Lowe * %rflags into %r11.
4105d9d9091SRichard Lowe *
4115d9d9091SRichard Lowe * [1] They used to, and we relied on it, but this was broken in 3.1.1.
4125d9d9091SRichard Lowe * Sigh.
4135d9d9091SRichard Lowe */
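
/*
 * For context, the MSRs mentioned above are programmed by the CPU startup
 * code, not here.  A hedged sketch of that programming (constant names as
 * used elsewhere in the kernel, e.g. MSR_AMD_KGSBASE below; the precise
 * STAR packing and SFMASK value are assumptions, and with KPTI the
 * LSTAR/CSTAR targets are really the trampolines that jump to these
 * handlers):
 *
 *	wrmsr(MSR_AMD_LSTAR, (uint64_t)(uintptr_t)sys_syscall);
 *	wrmsr(MSR_AMD_CSTAR, (uint64_t)(uintptr_t)sys_syscall32);
 *	wrmsr(MSR_AMD_STAR,
 *	    ((uint64_t)U32CS_SEL << 48) | ((uint64_t)KCS_SEL << 32));
 *	wrmsr(MSR_AMD_SFMASK, PS_IE | PS_T);
 */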
4145d9d9091SRichard Lowe#if defined(__xpv)
4155d9d9091SRichard Lowe#define	XPV_SYSCALL_PROD						\
4165d9d9091SRichard Lowe	movq	0x10(%rsp), %rcx;					\
4175d9d9091SRichard Lowe	movq	0x20(%rsp), %r11;					\
4185d9d9091SRichard Lowe	movq	0x28(%rsp), %rsp
4195d9d9091SRichard Lowe#else
4205d9d9091SRichard Lowe#define	XPV_SYSCALL_PROD /* nothing */
4215d9d9091SRichard Lowe#endif
4225d9d9091SRichard Lowe
4235d9d9091SRichard Lowe	ENTRY_NP2(brand_sys_syscall,_allsyscalls)
4245d9d9091SRichard Lowe	SWAPGS				/* kernel gsbase */
4255d9d9091SRichard Lowe	XPV_SYSCALL_PROD
4265d9d9091SRichard Lowe	BRAND_CALLBACK(BRAND_CB_SYSCALL, BRAND_URET_FROM_REG(%rcx))
4275d9d9091SRichard Lowe	jmp	noprod_sys_syscall
4285d9d9091SRichard Lowe
4295d9d9091SRichard Lowe	ALTENTRY(sys_syscall)
4305d9d9091SRichard Lowe	SWAPGS				/* kernel gsbase */
4315d9d9091SRichard Lowe	XPV_SYSCALL_PROD
4325d9d9091SRichard Lowe
4335d9d9091SRichard Lowenoprod_sys_syscall:
4345d9d9091SRichard Lowe	movq	%r15, %gs:CPU_RTMP_R15
4355d9d9091SRichard Lowe	movq	%rsp, %gs:CPU_RTMP_RSP
4365d9d9091SRichard Lowe
4375d9d9091SRichard Lowe	movq	%gs:CPU_THREAD, %r15
4385d9d9091SRichard Lowe	movq	T_STACK(%r15), %rsp	/* switch from user to kernel stack */
4395d9d9091SRichard Lowe
4405d9d9091SRichard Lowe	ASSERT_UPCALL_MASK_IS_SET
4415d9d9091SRichard Lowe
4425d9d9091SRichard Lowe	movl	$UCS_SEL, REGOFF_CS(%rsp)
4435d9d9091SRichard Lowe	movq	%rcx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
4445d9d9091SRichard Lowe	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
4455d9d9091SRichard Lowe	movl	$UDS_SEL, REGOFF_SS(%rsp)
4465d9d9091SRichard Lowe
4475d9d9091SRichard Lowe	movl	%eax, %eax			/* wrapper: sysc# -> %eax */
4485d9d9091SRichard Lowe	movq	%rdi, REGOFF_RDI(%rsp)
4495d9d9091SRichard Lowe	movq	%rsi, REGOFF_RSI(%rsp)
4505d9d9091SRichard Lowe	movq	%rdx, REGOFF_RDX(%rsp)
4515d9d9091SRichard Lowe	movq	%r10, REGOFF_RCX(%rsp)		/* wrapper: %rcx -> %r10 */
4525d9d9091SRichard Lowe	movq	%r10, %rcx			/* arg[3] for direct calls */
4535d9d9091SRichard Lowe
4545d9d9091SRichard Lowe	movq	%r8, REGOFF_R8(%rsp)
4555d9d9091SRichard Lowe	movq	%r9, REGOFF_R9(%rsp)
4565d9d9091SRichard Lowe	movq	%rax, REGOFF_RAX(%rsp)
4575d9d9091SRichard Lowe	movq	%rbx, REGOFF_RBX(%rsp)
4585d9d9091SRichard Lowe
4595d9d9091SRichard Lowe	movq	%rbp, REGOFF_RBP(%rsp)
4605d9d9091SRichard Lowe	movq	%r10, REGOFF_R10(%rsp)
4615d9d9091SRichard Lowe	movq	%gs:CPU_RTMP_RSP, %r11
4625d9d9091SRichard Lowe	movq	%r11, REGOFF_RSP(%rsp)
4635d9d9091SRichard Lowe	movq	%r12, REGOFF_R12(%rsp)
4645d9d9091SRichard Lowe
4655d9d9091SRichard Lowe	movq	%r13, REGOFF_R13(%rsp)
4665d9d9091SRichard Lowe	movq	%r14, REGOFF_R14(%rsp)
4675d9d9091SRichard Lowe	movq	%gs:CPU_RTMP_R15, %r10
4685d9d9091SRichard Lowe	movq	%r10, REGOFF_R15(%rsp)
4695d9d9091SRichard Lowe	movq	$0, REGOFF_SAVFP(%rsp)
4705d9d9091SRichard Lowe	movq	$0, REGOFF_SAVPC(%rsp)
4715d9d9091SRichard Lowe
4725d9d9091SRichard Lowe	/*
4735d9d9091SRichard Lowe	 * Copy these registers here in case we end up stopped with
4745d9d9091SRichard Lowe	 * someone (like, say, /proc) messing with our register state.
4755d9d9091SRichard Lowe	 * We don't -restore- them unless we have to in update_sregs.
4765d9d9091SRichard Lowe	 *
4775d9d9091SRichard Lowe	 * Since userland -can't- change fsbase or gsbase directly,
4785d9d9091SRichard Lowe	 * and capturing them involves two serializing instructions,
4795d9d9091SRichard Lowe	 * we don't bother to capture them here.
4805d9d9091SRichard Lowe	 */
4815d9d9091SRichard Lowe	xorl	%ebx, %ebx
4825d9d9091SRichard Lowe	movw	%ds, %bx
4835d9d9091SRichard Lowe	movq	%rbx, REGOFF_DS(%rsp)
4845d9d9091SRichard Lowe	movw	%es, %bx
4855d9d9091SRichard Lowe	movq	%rbx, REGOFF_ES(%rsp)
4865d9d9091SRichard Lowe	movw	%fs, %bx
4875d9d9091SRichard Lowe	movq	%rbx, REGOFF_FS(%rsp)
4885d9d9091SRichard Lowe	movw	%gs, %bx
4895d9d9091SRichard Lowe	movq	%rbx, REGOFF_GS(%rsp)
4905d9d9091SRichard Lowe
4915d9d9091SRichard Lowe	/*
4925d9d9091SRichard Lowe	 * If we're trying to use TRAPTRACE though, I take that back: we're
4935d9d9091SRichard Lowe	 * probably debugging some problem in the SWAPGS logic and want to know
4945d9d9091SRichard Lowe	 * what the incoming gsbase was.
4955d9d9091SRichard Lowe	 *
4965d9d9091SRichard Lowe	 * Since we already did SWAPGS, record the KGSBASE.
4975d9d9091SRichard Lowe	 */
4985d9d9091SRichard Lowe#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
4995d9d9091SRichard Lowe	movl	$MSR_AMD_KGSBASE, %ecx
5005d9d9091SRichard Lowe	rdmsr
5015d9d9091SRichard Lowe	movl	%eax, REGOFF_GSBASE(%rsp)
5025d9d9091SRichard Lowe	movl	%edx, REGOFF_GSBASE+4(%rsp)
5035d9d9091SRichard Lowe#endif
5045d9d9091SRichard Lowe
5055d9d9091SRichard Lowe	/*
5065d9d9091SRichard Lowe	 * Machine state saved in the regs structure on the stack
5075d9d9091SRichard Lowe	 * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
5085d9d9091SRichard Lowe	 * %eax is the syscall number
5095d9d9091SRichard Lowe	 * %rsp is the thread's stack, %r15 is curthread
5105d9d9091SRichard Lowe	 * REG_RSP(%rsp) is the user's stack
5115d9d9091SRichard Lowe	 */
5125d9d9091SRichard Lowe
5135d9d9091SRichard Lowe	SYSCALL_TRAPTRACE($TT_SYSC64)
5145d9d9091SRichard Lowe
5155d9d9091SRichard Lowe	movq	%rsp, %rbp
5165d9d9091SRichard Lowe
5175d9d9091SRichard Lowe	movq	T_LWP(%r15), %r14
5185d9d9091SRichard Lowe	ASSERT_NO_RUPDATE_PENDING(%r14)
5195d9d9091SRichard Lowe	ENABLE_INTR_FLAGS
5205d9d9091SRichard Lowe
5215d9d9091SRichard Lowe	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
5225d9d9091SRichard Lowe	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */
5235d9d9091SRichard Lowe
5245d9d9091SRichard Lowe	ASSERT_LWPTOREGS(%r14, %rsp)
5255d9d9091SRichard Lowe
5265d9d9091SRichard Lowe	movb	$LWP_SYS, LWP_STATE(%r14)
5275d9d9091SRichard Lowe	incq	LWP_RU_SYSC(%r14)
5285d9d9091SRichard Lowe	movb	$NORMALRETURN, LWP_EOSYS(%r14)
5295d9d9091SRichard Lowe
5305d9d9091SRichard Lowe	incq	%gs:CPU_STATS_SYS_SYSCALL
5315d9d9091SRichard Lowe
5325d9d9091SRichard Lowe	movw	%ax, T_SYSNUM(%r15)
5335d9d9091SRichard Lowe	movzbl	T_PRE_SYS(%r15), %ebx
5345d9d9091SRichard Lowe	ORL_SYSCALLTRACE(%ebx)
5355d9d9091SRichard Lowe	testl	%ebx, %ebx
5365d9d9091SRichard Lowe	jne	_syscall_pre
5375d9d9091SRichard Lowe
5385d9d9091SRichard Lowe_syscall_invoke:
5395d9d9091SRichard Lowe	movq	REGOFF_RDI(%rbp), %rdi
5405d9d9091SRichard Lowe	movq	REGOFF_RSI(%rbp), %rsi
5415d9d9091SRichard Lowe	movq	REGOFF_RDX(%rbp), %rdx
5425d9d9091SRichard Lowe	movq	REGOFF_RCX(%rbp), %rcx
5435d9d9091SRichard Lowe	movq	REGOFF_R8(%rbp), %r8
5445d9d9091SRichard Lowe	movq	REGOFF_R9(%rbp), %r9
5455d9d9091SRichard Lowe
5465d9d9091SRichard Lowe	cmpl	$NSYSCALL, %eax
5475d9d9091SRichard Lowe	jae	_syscall_ill
5485d9d9091SRichard Lowe	shll	$SYSENT_SIZE_SHIFT, %eax
5495d9d9091SRichard Lowe	leaq	sysent(%rax), %rbx
5505d9d9091SRichard Lowe
5515d9d9091SRichard Lowe	movq	SY_CALLC(%rbx), %rax
5525d9d9091SRichard Lowe	INDIRECT_CALL_REG(rax)
5535d9d9091SRichard Lowe
5545d9d9091SRichard Lowe	movq	%rax, %r12
5555d9d9091SRichard Lowe	movq	%rdx, %r13
5565d9d9091SRichard Lowe
5575d9d9091SRichard Lowe	/*
5585d9d9091SRichard Lowe	 * If the handler returns two ints, then we need to split the
5595d9d9091SRichard Lowe	 * 64-bit return value into two 32-bit values.
5605d9d9091SRichard Lowe	 */
5615d9d9091SRichard Lowe	testw	$SE_32RVAL2, SY_FLAGS(%rbx)
5625d9d9091SRichard Lowe	je	5f
5635d9d9091SRichard Lowe	movq	%r12, %r13
5645d9d9091SRichard Lowe	shrq	$32, %r13	/* upper 32-bits into %edx */
5655d9d9091SRichard Lowe	movl	%r12d, %r12d	/* lower 32-bits into %eax */
5665d9d9091SRichard Lowe5:
5675d9d9091SRichard Lowe	/*
5685d9d9091SRichard Lowe	 * Optimistically assume that there's no post-syscall
5695d9d9091SRichard Lowe	 * work to do.  (This is to avoid having to call syscall_mstate()
5705d9d9091SRichard Lowe	 * with interrupts disabled)
5715d9d9091SRichard Lowe	 */
5725d9d9091SRichard Lowe	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
5735d9d9091SRichard Lowe
5745d9d9091SRichard Lowe	/*
5755d9d9091SRichard Lowe	 * We must protect ourselves from being descheduled here;
5765d9d9091SRichard Lowe	 * If we were, and we ended up on another cpu, or another
5775d9d9091SRichard Lowe	 * lwp got in ahead of us, it could change the segment
5785d9d9091SRichard Lowe	 * registers without us noticing before we return to userland.
5795d9d9091SRichard Lowe	 */
5805d9d9091SRichard Lowe	CLI(%r14)
5815d9d9091SRichard Lowe	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
5825d9d9091SRichard Lowe	jne	_syscall_post
5835d9d9091SRichard Lowe
5845d9d9091SRichard Lowe	/*
	 * We need to protect ourselves against handing sysret a non-canonical
	 * return %rip: Intel CPUs raise #GP on it while still in ring 0 (with
	 * the user %rsp already restored), whereas AMD CPUs only fault once
	 * back in userland.  Canonical addresses on current amd64 processors
	 * only use 48-bits for VAs; an address is canonical if all upper bits
	 * (47-63) are identical. If we find a non-canonical %rip, we opt to go
	 * through the full _syscall_post path which takes us into an iretq
	 * which is not susceptible to the same problems sysret is.
5925d9d9091SRichard Lowe	 *
5935d9d9091SRichard Lowe	 * We're checking for a canonical address by first doing an arithmetic
5945d9d9091SRichard Lowe	 * shift. This will fill in the remaining bits with the value of bit 63.
5955d9d9091SRichard Lowe	 * If the address were canonical, the register would now have either all
5965d9d9091SRichard Lowe	 * zeroes or all ones in it. Therefore we add one (inducing overflow)
5975d9d9091SRichard Lowe	 * and compare against 1. A canonical address will either be zero or one
5985d9d9091SRichard Lowe	 * at this point, hence the use of ja.
5995d9d9091SRichard Lowe	 *
6005d9d9091SRichard Lowe	 * At this point, r12 and r13 have the return value so we can't use
6015d9d9091SRichard Lowe	 * those registers.
6025d9d9091SRichard Lowe	 */
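	/*
	 * A worked example of the check below, with illustrative values:
	 *
	 *	%rip = 0x00007fffffff1234 (canonical, user):
	 *		sarq $47 -> 0,  incq -> 1,  not above 1: sysretq is fine
	 *	%rip = 0xfffffe0012345678 (canonical, high half):
	 *		sarq $47 -> -1, incq -> 0,  not above 1: sysretq is fine
	 *	%rip = 0x0000800000000000 (non-canonical):
	 *		sarq $47 -> 1,  incq -> 2,  above 1: take _syscall_post
	 */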
6035d9d9091SRichard Lowe	movq	REGOFF_RIP(%rsp), %rcx
6045d9d9091SRichard Lowe	sarq	$47, %rcx
6055d9d9091SRichard Lowe	incq	%rcx
6065d9d9091SRichard Lowe	cmpq	$1, %rcx
6075d9d9091SRichard Lowe	ja	_syscall_post
6085d9d9091SRichard Lowe
6095d9d9091SRichard Lowe
6105d9d9091SRichard Lowe	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
6115d9d9091SRichard Lowe
6125d9d9091SRichard Lowe	movq	%r12, REGOFF_RAX(%rsp)
6135d9d9091SRichard Lowe	movq	%r13, REGOFF_RDX(%rsp)
6145d9d9091SRichard Lowe
6155d9d9091SRichard Lowe	/*
6165d9d9091SRichard Lowe	 * Clobber %r11 as we check CR0.TS.
6175d9d9091SRichard Lowe	 */
6185d9d9091SRichard Lowe	ASSERT_CR0TS_ZERO(%r11)
6195d9d9091SRichard Lowe
6205d9d9091SRichard Lowe	/*
6215d9d9091SRichard Lowe	 * To get back to userland, we need the return %rip in %rcx and
6225d9d9091SRichard Lowe	 * the return %rfl in %r11d.  The sysretq instruction also arranges
6235d9d9091SRichard Lowe	 * to fix up %cs and %ss; everything else is our responsibility.
6245d9d9091SRichard Lowe	 */
6255d9d9091SRichard Lowe	movq	REGOFF_RDI(%rsp), %rdi
6265d9d9091SRichard Lowe	movq	REGOFF_RSI(%rsp), %rsi
6275d9d9091SRichard Lowe	movq	REGOFF_RDX(%rsp), %rdx
6285d9d9091SRichard Lowe	/* %rcx used to restore %rip value */
6295d9d9091SRichard Lowe
6305d9d9091SRichard Lowe	movq	REGOFF_R8(%rsp), %r8
6315d9d9091SRichard Lowe	movq	REGOFF_R9(%rsp), %r9
6325d9d9091SRichard Lowe	movq	REGOFF_RAX(%rsp), %rax
6335d9d9091SRichard Lowe	movq	REGOFF_RBX(%rsp), %rbx
6345d9d9091SRichard Lowe
6355d9d9091SRichard Lowe	movq	REGOFF_RBP(%rsp), %rbp
6365d9d9091SRichard Lowe	movq	REGOFF_R10(%rsp), %r10
6375d9d9091SRichard Lowe	/* %r11 used to restore %rfl value */
6385d9d9091SRichard Lowe	movq	REGOFF_R12(%rsp), %r12
6395d9d9091SRichard Lowe
6405d9d9091SRichard Lowe	movq	REGOFF_R13(%rsp), %r13
6415d9d9091SRichard Lowe	movq	REGOFF_R14(%rsp), %r14
6425d9d9091SRichard Lowe	movq	REGOFF_R15(%rsp), %r15
6435d9d9091SRichard Lowe
6445d9d9091SRichard Lowe	movq	REGOFF_RIP(%rsp), %rcx
6455d9d9091SRichard Lowe	movl	REGOFF_RFL(%rsp), %r11d
6465d9d9091SRichard Lowe
647*5cd084edSDan McDonald	/*
	 * Unlike other cases, because we need to restore the user stack pointer
	 * before exiting the kernel, we must clear the microarch state here,
	 * before that restore. This should be safe because it means that the only
651*5cd084edSDan McDonald	 * values on the bus after this are based on the user's registers and
652*5cd084edSDan McDonald	 * potentially the addresses where we stored them. Given the constraints
653*5cd084edSDan McDonald	 * of sysret, that's how it has to be.
654*5cd084edSDan McDonald	 */
655*5cd084edSDan McDonald	call	x86_md_clear
656*5cd084edSDan McDonald
6575d9d9091SRichard Lowe#if defined(__xpv)
6585d9d9091SRichard Lowe	addq	$REGOFF_RIP, %rsp
6595d9d9091SRichard Lowe#else
6605d9d9091SRichard Lowe	movq	REGOFF_RSP(%rsp), %rsp
6615d9d9091SRichard Lowe#endif
6625d9d9091SRichard Lowe
6635d9d9091SRichard Lowe        /*
6645d9d9091SRichard Lowe         * There can be no instructions between the ALTENTRY below and
6655d9d9091SRichard Lowe	 * SYSRET or we could end up breaking brand support. See label usage
6665d9d9091SRichard Lowe         * in sn1_brand_syscall_callback for an example.
6675d9d9091SRichard Lowe         */
6685d9d9091SRichard Lowe	ASSERT_UPCALL_MASK_IS_SET
6695d9d9091SRichard Lowe#if defined(__xpv)
6705d9d9091SRichard Lowe	SYSRETQ
6715d9d9091SRichard Lowe        ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
6725d9d9091SRichard Lowe
6735d9d9091SRichard Lowe	/*
6745d9d9091SRichard Lowe	 * We can only get here after executing a brand syscall
6755d9d9091SRichard Lowe	 * interposition callback handler and simply need to
6765d9d9091SRichard Lowe	 * "sysretq" back to userland. On the hypervisor this
6775d9d9091SRichard Lowe	 * involves the iret hypercall which requires us to construct
6785d9d9091SRichard Lowe	 * just enough of the stack needed for the hypercall.
6795d9d9091SRichard Lowe	 * (rip, cs, rflags, rsp, ss).
6805d9d9091SRichard Lowe	 */
6815d9d9091SRichard Lowe	movq    %rsp, %gs:CPU_RTMP_RSP		/* save user's rsp */
6825d9d9091SRichard Lowe	movq	%gs:CPU_THREAD, %r11
6835d9d9091SRichard Lowe	movq	T_STACK(%r11), %rsp
6845d9d9091SRichard Lowe
6855d9d9091SRichard Lowe	movq	%rcx, REGOFF_RIP(%rsp)
6865d9d9091SRichard Lowe	movl	$UCS_SEL, REGOFF_CS(%rsp)
6875d9d9091SRichard Lowe	movq	%gs:CPU_RTMP_RSP, %r11
6885d9d9091SRichard Lowe	movq	%r11, REGOFF_RSP(%rsp)
6895d9d9091SRichard Lowe	pushfq
6905d9d9091SRichard Lowe	popq	%r11				/* hypercall enables ints */
6915d9d9091SRichard Lowe	movq	%r11, REGOFF_RFL(%rsp)
6925d9d9091SRichard Lowe	movl	$UDS_SEL, REGOFF_SS(%rsp)
6935d9d9091SRichard Lowe	addq	$REGOFF_RIP, %rsp
6945d9d9091SRichard Lowe	/*
6955d9d9091SRichard Lowe	 * XXPV: see comment in SYSRETQ definition for future optimization
6965d9d9091SRichard Lowe	 *       we could take.
6975d9d9091SRichard Lowe	 */
6985d9d9091SRichard Lowe	ASSERT_UPCALL_MASK_IS_SET
6995d9d9091SRichard Lowe	SYSRETQ
7005d9d9091SRichard Lowe#else
7015d9d9091SRichard Lowe        ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
7025d9d9091SRichard Lowe	jmp	tr_sysretq
7035d9d9091SRichard Lowe#endif
7045d9d9091SRichard Lowe        /*NOTREACHED*/
7055d9d9091SRichard Lowe        SET_SIZE(nopop_sys_syscall_swapgs_sysretq)
7065d9d9091SRichard Lowe
7075d9d9091SRichard Lowe_syscall_pre:
7085d9d9091SRichard Lowe	call	pre_syscall
7095d9d9091SRichard Lowe	movl	%eax, %r12d
7105d9d9091SRichard Lowe	testl	%eax, %eax
7115d9d9091SRichard Lowe	jne	_syscall_post_call
7125d9d9091SRichard Lowe	/*
7135d9d9091SRichard Lowe	 * Didn't abort, so reload the syscall args and invoke the handler.
7145d9d9091SRichard Lowe	 */
7155d9d9091SRichard Lowe	movzwl	T_SYSNUM(%r15), %eax
7165d9d9091SRichard Lowe	jmp	_syscall_invoke
7175d9d9091SRichard Lowe
7185d9d9091SRichard Lowe_syscall_ill:
7195d9d9091SRichard Lowe	call	nosys
7205d9d9091SRichard Lowe	movq	%rax, %r12
7215d9d9091SRichard Lowe	movq	%rdx, %r13
7225d9d9091SRichard Lowe	jmp	_syscall_post_call
7235d9d9091SRichard Lowe
7245d9d9091SRichard Lowe_syscall_post:
7255d9d9091SRichard Lowe	STI
7265d9d9091SRichard Lowe	/*
7275d9d9091SRichard Lowe	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
7285d9d9091SRichard Lowe	 * so that we can account for the extra work it takes us to finish.
7295d9d9091SRichard Lowe	 */
7305d9d9091SRichard Lowe	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
7315d9d9091SRichard Lowe_syscall_post_call:
7325d9d9091SRichard Lowe	movq	%r12, %rdi
7335d9d9091SRichard Lowe	movq	%r13, %rsi
7345d9d9091SRichard Lowe	call	post_syscall
7355d9d9091SRichard Lowe	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
7365d9d9091SRichard Lowe	jmp	_sys_rtt
7375d9d9091SRichard Lowe	SET_SIZE(sys_syscall)
7385d9d9091SRichard Lowe	SET_SIZE(brand_sys_syscall)
7395d9d9091SRichard Lowe
7405d9d9091SRichard Lowe	ENTRY_NP(brand_sys_syscall32)
7415d9d9091SRichard Lowe	SWAPGS				/* kernel gsbase */
7425d9d9091SRichard Lowe	XPV_TRAP_POP
7435d9d9091SRichard Lowe	BRAND_CALLBACK(BRAND_CB_SYSCALL32, BRAND_URET_FROM_REG(%rcx))
7445d9d9091SRichard Lowe	jmp	nopop_sys_syscall32
7455d9d9091SRichard Lowe
7465d9d9091SRichard Lowe	ALTENTRY(sys_syscall32)
7475d9d9091SRichard Lowe	SWAPGS				/* kernel gsbase */
7485d9d9091SRichard Lowe	XPV_TRAP_POP
7495d9d9091SRichard Lowe
7505d9d9091SRichard Lowenopop_sys_syscall32:
7515d9d9091SRichard Lowe	movl	%esp, %r10d
7525d9d9091SRichard Lowe	movq	%gs:CPU_THREAD, %r15
7535d9d9091SRichard Lowe	movq	T_STACK(%r15), %rsp
7545d9d9091SRichard Lowe	movl	%eax, %eax
7555d9d9091SRichard Lowe
7565d9d9091SRichard Lowe	movl	$U32CS_SEL, REGOFF_CS(%rsp)
7575d9d9091SRichard Lowe	movl	%ecx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
7585d9d9091SRichard Lowe	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
7595d9d9091SRichard Lowe	movq	%r10, REGOFF_RSP(%rsp)
7605d9d9091SRichard Lowe	movl	$UDS_SEL, REGOFF_SS(%rsp)
7615d9d9091SRichard Lowe
7625d9d9091SRichard Lowe_syscall32_save:
7635d9d9091SRichard Lowe	movl	%edi, REGOFF_RDI(%rsp)
7645d9d9091SRichard Lowe	movl	%esi, REGOFF_RSI(%rsp)
7655d9d9091SRichard Lowe	movl	%ebp, REGOFF_RBP(%rsp)
7665d9d9091SRichard Lowe	movl	%ebx, REGOFF_RBX(%rsp)
7675d9d9091SRichard Lowe	movl	%edx, REGOFF_RDX(%rsp)
7685d9d9091SRichard Lowe	movl	%ecx, REGOFF_RCX(%rsp)
7695d9d9091SRichard Lowe	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
7705d9d9091SRichard Lowe	movq	$0, REGOFF_SAVFP(%rsp)
7715d9d9091SRichard Lowe	movq	$0, REGOFF_SAVPC(%rsp)
7725d9d9091SRichard Lowe
7735d9d9091SRichard Lowe	/*
7745d9d9091SRichard Lowe	 * Copy these registers here in case we end up stopped with
7755d9d9091SRichard Lowe	 * someone (like, say, /proc) messing with our register state.
7765d9d9091SRichard Lowe	 * We don't -restore- them unless we have to in update_sregs.
7775d9d9091SRichard Lowe	 *
7785d9d9091SRichard Lowe	 * Since userland -can't- change fsbase or gsbase directly,
7795d9d9091SRichard Lowe	 * we don't bother to capture them here.
7805d9d9091SRichard Lowe	 */
7815d9d9091SRichard Lowe	xorl	%ebx, %ebx
7825d9d9091SRichard Lowe	movw	%ds, %bx
7835d9d9091SRichard Lowe	movq	%rbx, REGOFF_DS(%rsp)
7845d9d9091SRichard Lowe	movw	%es, %bx
7855d9d9091SRichard Lowe	movq	%rbx, REGOFF_ES(%rsp)
7865d9d9091SRichard Lowe	movw	%fs, %bx
7875d9d9091SRichard Lowe	movq	%rbx, REGOFF_FS(%rsp)
7885d9d9091SRichard Lowe	movw	%gs, %bx
7895d9d9091SRichard Lowe	movq	%rbx, REGOFF_GS(%rsp)
7905d9d9091SRichard Lowe
7915d9d9091SRichard Lowe	/*
7925d9d9091SRichard Lowe	 * If we're trying to use TRAPTRACE though, I take that back: we're
7935d9d9091SRichard Lowe	 * probably debugging some problem in the SWAPGS logic and want to know
7945d9d9091SRichard Lowe	 * what the incoming gsbase was.
7955d9d9091SRichard Lowe	 *
7965d9d9091SRichard Lowe	 * Since we already did SWAPGS, record the KGSBASE.
7975d9d9091SRichard Lowe	 */
7985d9d9091SRichard Lowe#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
7995d9d9091SRichard Lowe	movl	$MSR_AMD_KGSBASE, %ecx
8005d9d9091SRichard Lowe	rdmsr
8015d9d9091SRichard Lowe	movl	%eax, REGOFF_GSBASE(%rsp)
8025d9d9091SRichard Lowe	movl	%edx, REGOFF_GSBASE+4(%rsp)
8035d9d9091SRichard Lowe#endif
8045d9d9091SRichard Lowe
8055d9d9091SRichard Lowe	/*
8065d9d9091SRichard Lowe	 * Application state saved in the regs structure on the stack
8075d9d9091SRichard Lowe	 * %eax is the syscall number
8085d9d9091SRichard Lowe	 * %rsp is the thread's stack, %r15 is curthread
8095d9d9091SRichard Lowe	 * REG_RSP(%rsp) is the user's stack
8105d9d9091SRichard Lowe	 */
8115d9d9091SRichard Lowe
8125d9d9091SRichard Lowe	SYSCALL_TRAPTRACE32($TT_SYSC)
8135d9d9091SRichard Lowe
8145d9d9091SRichard Lowe	movq	%rsp, %rbp
8155d9d9091SRichard Lowe
8165d9d9091SRichard Lowe	movq	T_LWP(%r15), %r14
8175d9d9091SRichard Lowe	ASSERT_NO_RUPDATE_PENDING(%r14)
8185d9d9091SRichard Lowe
8195d9d9091SRichard Lowe	ENABLE_INTR_FLAGS
8205d9d9091SRichard Lowe
8215d9d9091SRichard Lowe	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
8225d9d9091SRichard Lowe	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */
8235d9d9091SRichard Lowe
8245d9d9091SRichard Lowe	ASSERT_LWPTOREGS(%r14, %rsp)
8255d9d9091SRichard Lowe
8265d9d9091SRichard Lowe	incq	 %gs:CPU_STATS_SYS_SYSCALL
8275d9d9091SRichard Lowe
8285d9d9091SRichard Lowe	/*
8295d9d9091SRichard Lowe	 * Make some space for MAXSYSARGS (currently 8) 32-bit args placed
8305d9d9091SRichard Lowe	 * into 64-bit (long) arg slots, maintaining 16 byte alignment.  Or
8315d9d9091SRichard Lowe	 * more succinctly:
8325d9d9091SRichard Lowe	 *
8335d9d9091SRichard Lowe	 *	SA(MAXSYSARGS * sizeof (long)) == 64
8345d9d9091SRichard Lowe	 */
8355d9d9091SRichard Lowe#define	SYS_DROP	64			/* drop for args */
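	/*
	 * i.e. 8 args of 8 bytes each is 64 bytes, which SA() leaves as-is
	 * since it is already 16-byte aligned.
	 */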
8365d9d9091SRichard Lowe	subq	$SYS_DROP, %rsp
8375d9d9091SRichard Lowe	movb	$LWP_SYS, LWP_STATE(%r14)
8385d9d9091SRichard Lowe	movq	%r15, %rdi
8395d9d9091SRichard Lowe	movq	%rsp, %rsi
8405d9d9091SRichard Lowe	call	syscall_entry
8415d9d9091SRichard Lowe
8425d9d9091SRichard Lowe	/*
8435d9d9091SRichard Lowe	 * Fetch the arguments copied onto the kernel stack and put
8445d9d9091SRichard Lowe	 * them in the right registers to invoke a C-style syscall handler.
8455d9d9091SRichard Lowe	 * %rax contains the handler address.
8465d9d9091SRichard Lowe	 *
8475d9d9091SRichard Lowe	 * Ideas for making all this go faster of course include simply
8485d9d9091SRichard Lowe	 * forcibly fetching 6 arguments from the user stack under lofault
8495d9d9091SRichard Lowe	 * protection, reverting to copyin_args only when watchpoints
8505d9d9091SRichard Lowe	 * are in effect.
8515d9d9091SRichard Lowe	 *
8525d9d9091SRichard Lowe	 * (If we do this, make sure that exec and libthread leave
8535d9d9091SRichard Lowe	 * enough space at the top of the stack to ensure that we'll
8545d9d9091SRichard Lowe	 * never do a fetch from an invalid page.)
8555d9d9091SRichard Lowe	 *
8565d9d9091SRichard Lowe	 * Lots of ideas here, but they won't really help with bringup B-)
8575d9d9091SRichard Lowe	 * Correctness can't wait, performance can wait a little longer ..
8585d9d9091SRichard Lowe	 */
8595d9d9091SRichard Lowe
8605d9d9091SRichard Lowe	movq	%rax, %rbx
8615d9d9091SRichard Lowe	movl	0(%rsp), %edi
8625d9d9091SRichard Lowe	movl	8(%rsp), %esi
8635d9d9091SRichard Lowe	movl	0x10(%rsp), %edx
8645d9d9091SRichard Lowe	movl	0x18(%rsp), %ecx
8655d9d9091SRichard Lowe	movl	0x20(%rsp), %r8d
8665d9d9091SRichard Lowe	movl	0x28(%rsp), %r9d
8675d9d9091SRichard Lowe
8685d9d9091SRichard Lowe	movq	SY_CALLC(%rbx), %rax
8695d9d9091SRichard Lowe	INDIRECT_CALL_REG(rax)
8705d9d9091SRichard Lowe
8715d9d9091SRichard Lowe	movq	%rbp, %rsp	/* pop the args */
8725d9d9091SRichard Lowe
8735d9d9091SRichard Lowe	/*
8745d9d9091SRichard Lowe	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
8755d9d9091SRichard Lowe	 * On the 32-bit kernel, they always return that value in %eax:%edx
8765d9d9091SRichard Lowe	 * as required by the 32-bit ABI.
8775d9d9091SRichard Lowe	 *
8785d9d9091SRichard Lowe	 * Simulate the same behaviour by unconditionally splitting the
8795d9d9091SRichard Lowe	 * return value in the same way.
8805d9d9091SRichard Lowe	 */
8815d9d9091SRichard Lowe	movq	%rax, %r13
8825d9d9091SRichard Lowe	shrq	$32, %r13	/* upper 32-bits into %edx */
8835d9d9091SRichard Lowe	movl	%eax, %r12d	/* lower 32-bits into %eax */
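
	/*
	 * e.g. a handler returning 0x0000000200000001 here yields
	 * rval2 = 2 (headed for %edx) and rval1 = 1 (headed for %eax).
	 */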
8845d9d9091SRichard Lowe
8855d9d9091SRichard Lowe	/*
8865d9d9091SRichard Lowe	 * Optimistically assume that there's no post-syscall
8875d9d9091SRichard Lowe	 * work to do.  (This is to avoid having to call syscall_mstate()
8885d9d9091SRichard Lowe	 * with interrupts disabled)
8895d9d9091SRichard Lowe	 */
8905d9d9091SRichard Lowe	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
8915d9d9091SRichard Lowe
8925d9d9091SRichard Lowe	/*
8935d9d9091SRichard Lowe	 * We must protect ourselves from being descheduled here;
8945d9d9091SRichard Lowe	 * If we were, and we ended up on another cpu, or another
8955d9d9091SRichard Lowe	 * lwp got in ahead of us, it could change the segment
8965d9d9091SRichard Lowe	 * registers without us noticing before we return to userland.
8975d9d9091SRichard Lowe	 */
8985d9d9091SRichard Lowe	CLI(%r14)
8995d9d9091SRichard Lowe	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
9005d9d9091SRichard Lowe	jne	_full_syscall_postsys32
9015d9d9091SRichard Lowe	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
9025d9d9091SRichard Lowe
9035d9d9091SRichard Lowe	/*
9045d9d9091SRichard Lowe	 * Clobber %r11 as we check CR0.TS.
9055d9d9091SRichard Lowe	 */
9065d9d9091SRichard Lowe	ASSERT_CR0TS_ZERO(%r11)
9075d9d9091SRichard Lowe
9085d9d9091SRichard Lowe	/*
9095d9d9091SRichard Lowe	 * To get back to userland, we need to put the return %rip in %rcx and
9105d9d9091SRichard Lowe	 * the return %rfl in %r11d.  The sysret instruction also arranges
9115d9d9091SRichard Lowe	 * to fix up %cs and %ss; everything else is our responsibility.
9125d9d9091SRichard Lowe	 */
9135d9d9091SRichard Lowe
9145d9d9091SRichard Lowe	movl	%r12d, %eax			/* %eax: rval1 */
9155d9d9091SRichard Lowe	movl	REGOFF_RBX(%rsp), %ebx
9165d9d9091SRichard Lowe	/* %ecx used for return pointer */
9175d9d9091SRichard Lowe	movl	%r13d, %edx			/* %edx: rval2 */
9185d9d9091SRichard Lowe	movl	REGOFF_RBP(%rsp), %ebp
9195d9d9091SRichard Lowe	movl	REGOFF_RSI(%rsp), %esi
9205d9d9091SRichard Lowe	movl	REGOFF_RDI(%rsp), %edi
9215d9d9091SRichard Lowe
9225d9d9091SRichard Lowe	movl	REGOFF_RFL(%rsp), %r11d		/* %r11 -> eflags */
9235d9d9091SRichard Lowe	movl	REGOFF_RIP(%rsp), %ecx		/* %ecx -> %eip */
924*5cd084edSDan McDonald	/*
	 * Unlike other cases, because we need to restore the user stack pointer
	 * before exiting the kernel, we must clear the microarch state here,
	 * before that restore. This should be safe because it means that the only
928*5cd084edSDan McDonald	 * values on the bus after this are based on the user's registers and
929*5cd084edSDan McDonald	 * potentially the addresses where we stored them. Given the constraints
930*5cd084edSDan McDonald	 * of sysret, that's how it has to be.
931*5cd084edSDan McDonald	 */
932*5cd084edSDan McDonald	call	x86_md_clear
933*5cd084edSDan McDonald
9345d9d9091SRichard Lowe	movl	REGOFF_RSP(%rsp), %esp
9355d9d9091SRichard Lowe
9365d9d9091SRichard Lowe	ASSERT_UPCALL_MASK_IS_SET
9375d9d9091SRichard Lowe        ALTENTRY(nopop_sys_syscall32_swapgs_sysretl)
9385d9d9091SRichard Lowe	jmp	tr_sysretl
9395d9d9091SRichard Lowe        SET_SIZE(nopop_sys_syscall32_swapgs_sysretl)
9405d9d9091SRichard Lowe	/*NOTREACHED*/
9415d9d9091SRichard Lowe
9425d9d9091SRichard Lowe_full_syscall_postsys32:
9435d9d9091SRichard Lowe	STI
9445d9d9091SRichard Lowe	/*
9455d9d9091SRichard Lowe	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
9465d9d9091SRichard Lowe	 * so that we can account for the extra work it takes us to finish.
9475d9d9091SRichard Lowe	 */
9485d9d9091SRichard Lowe	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
9495d9d9091SRichard Lowe	movq	%r15, %rdi
9505d9d9091SRichard Lowe	movq	%r12, %rsi			/* rval1 - %eax */
9515d9d9091SRichard Lowe	movq	%r13, %rdx			/* rval2 - %edx */
9525d9d9091SRichard Lowe	call	syscall_exit
9535d9d9091SRichard Lowe	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
9545d9d9091SRichard Lowe	jmp	_sys_rtt
9555d9d9091SRichard Lowe	SET_SIZE(sys_syscall32)
9565d9d9091SRichard Lowe	SET_SIZE(brand_sys_syscall32)
9575d9d9091SRichard Lowe
9585d9d9091SRichard Lowe/*
9595d9d9091SRichard Lowe * System call handler via the sysenter instruction
9605d9d9091SRichard Lowe * Used only for 32-bit system calls on the 64-bit kernel.
9615d9d9091SRichard Lowe *
9625d9d9091SRichard Lowe * The caller in userland has arranged that:
9635d9d9091SRichard Lowe *
9645d9d9091SRichard Lowe * -	%eax contains the syscall number
9655d9d9091SRichard Lowe * -	%ecx contains the user %esp
9665d9d9091SRichard Lowe * -	%edx contains the return %eip
9675d9d9091SRichard Lowe * -	the user stack contains the args to the syscall
9685d9d9091SRichard Lowe *
9695d9d9091SRichard Lowe * Hardware and (privileged) initialization code have arranged that by
 * the time the sysenter instruction completes:
9715d9d9091SRichard Lowe *
9725d9d9091SRichard Lowe * - %rip is pointing to sys_sysenter (below).
9735d9d9091SRichard Lowe * - %cs and %ss are set to kernel text and stack (data) selectors.
9745d9d9091SRichard Lowe * - %rsp is pointing at the lwp's stack
9755d9d9091SRichard Lowe * - interrupts have been disabled.
9765d9d9091SRichard Lowe *
9775d9d9091SRichard Lowe * Note that we are unable to return both "rvals" to userland with
9785d9d9091SRichard Lowe * this call, as %edx is used by the sysexit instruction.
9795d9d9091SRichard Lowe *
9805d9d9091SRichard Lowe * One final complication in this routine is its interaction with
9815d9d9091SRichard Lowe * single-stepping in a debugger.  For most of the system call mechanisms, the
9825d9d9091SRichard Lowe * CPU automatically clears the single-step flag before we enter the kernel.
9835d9d9091SRichard Lowe * The sysenter mechanism does not clear the flag, so a user single-stepping
 * through a libc routine may suddenly find themselves single-stepping through the
9855d9d9091SRichard Lowe * kernel.  To detect this, kmdb and trap() both compare the trap %pc to the
9865d9d9091SRichard Lowe * [brand_]sys_enter addresses on each single-step trap.  If it finds that we
9875d9d9091SRichard Lowe * have single-stepped to a sysenter entry point, it explicitly clears the flag
9885d9d9091SRichard Lowe * and executes the sys_sysenter routine.
9895d9d9091SRichard Lowe *
9905d9d9091SRichard Lowe * One final complication in this final complication is the fact that we have
9915d9d9091SRichard Lowe * two different entry points for sysenter: brand_sys_sysenter and sys_sysenter.
9925d9d9091SRichard Lowe * If we enter at brand_sys_sysenter and start single-stepping through the
9935d9d9091SRichard Lowe * kernel with kmdb, we will eventually hit the instruction at sys_sysenter.
9945d9d9091SRichard Lowe * kmdb cannot distinguish between that valid single-step and the undesirable
9955d9d9091SRichard Lowe * one mentioned above.  To avoid this situation, we simply add a jump over the
9965d9d9091SRichard Lowe * instruction at sys_sysenter to make it impossible to single-step to it.
9975d9d9091SRichard Lowe */
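
/*
 * For orientation, a hedged sketch of the kind of userland stub that ends
 * up here (purely illustrative -- the real stub lives in libc and differs;
 * the syscall number and label below are made up):
 *
 *	movl	$SYS_getpid, %eax	/ syscall number
 *	movl	%esp, %ecx		/ user stack pointer, saved for us
 *	leal	1f, %edx		/ where sysexit should return to
 *	sysenter
 *   1:	...
 */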
9985d9d9091SRichard Lowe
9995d9d9091SRichard Lowe	ENTRY_NP(brand_sys_sysenter)
10005d9d9091SRichard Lowe	SWAPGS				/* kernel gsbase */
10015d9d9091SRichard Lowe	ALTENTRY(_brand_sys_sysenter_post_swapgs)
10025d9d9091SRichard Lowe
10035d9d9091SRichard Lowe	BRAND_CALLBACK(BRAND_CB_SYSENTER, BRAND_URET_FROM_REG(%rdx))
10045d9d9091SRichard Lowe	/*
10055d9d9091SRichard Lowe	 * Jump over sys_sysenter to allow single-stepping as described
10065d9d9091SRichard Lowe	 * above.
10075d9d9091SRichard Lowe	 */
10085d9d9091SRichard Lowe	jmp	_sys_sysenter_post_swapgs
10095d9d9091SRichard Lowe
10105d9d9091SRichard Lowe	ALTENTRY(sys_sysenter)
10115d9d9091SRichard Lowe	SWAPGS				/* kernel gsbase */
10125d9d9091SRichard Lowe	ALTENTRY(_sys_sysenter_post_swapgs)
10135d9d9091SRichard Lowe
10145d9d9091SRichard Lowe	movq	%gs:CPU_THREAD, %r15
10155d9d9091SRichard Lowe
10165d9d9091SRichard Lowe	movl	$U32CS_SEL, REGOFF_CS(%rsp)
10175d9d9091SRichard Lowe	movl	%ecx, REGOFF_RSP(%rsp)		/* wrapper: %esp -> %ecx */
10185d9d9091SRichard Lowe	movl	%edx, REGOFF_RIP(%rsp)		/* wrapper: %eip -> %edx */
10195d9d9091SRichard Lowe	/*
10205d9d9091SRichard Lowe	 * NOTE: none of the instructions that run before we get here should
10215d9d9091SRichard Lowe	 * clobber bits in (R)FLAGS! This includes the kpti trampoline.
10225d9d9091SRichard Lowe	 */
10235d9d9091SRichard Lowe	pushfq
10245d9d9091SRichard Lowe	popq	%r10
10255d9d9091SRichard Lowe	movl	$UDS_SEL, REGOFF_SS(%rsp)
10265d9d9091SRichard Lowe
10275d9d9091SRichard Lowe	/*
10285d9d9091SRichard Lowe	 * Set the interrupt flag before storing the flags to the
10295d9d9091SRichard Lowe	 * flags image on the stack so we can return to user with
10305d9d9091SRichard Lowe	 * interrupts enabled if we return via sys_rtt_syscall32
10315d9d9091SRichard Lowe	 */
10325d9d9091SRichard Lowe	orq	$PS_IE, %r10
10335d9d9091SRichard Lowe	movq	%r10, REGOFF_RFL(%rsp)
10345d9d9091SRichard Lowe
10355d9d9091SRichard Lowe	movl	%edi, REGOFF_RDI(%rsp)
10365d9d9091SRichard Lowe	movl	%esi, REGOFF_RSI(%rsp)
10375d9d9091SRichard Lowe	movl	%ebp, REGOFF_RBP(%rsp)
10385d9d9091SRichard Lowe	movl	%ebx, REGOFF_RBX(%rsp)
10395d9d9091SRichard Lowe	movl	%edx, REGOFF_RDX(%rsp)
10405d9d9091SRichard Lowe	movl	%ecx, REGOFF_RCX(%rsp)
10415d9d9091SRichard Lowe	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
10425d9d9091SRichard Lowe	movq	$0, REGOFF_SAVFP(%rsp)
10435d9d9091SRichard Lowe	movq	$0, REGOFF_SAVPC(%rsp)
10445d9d9091SRichard Lowe
10455d9d9091SRichard Lowe	/*
10465d9d9091SRichard Lowe	 * Copy these registers here in case we end up stopped with
10475d9d9091SRichard Lowe	 * someone (like, say, /proc) messing with our register state.
10485d9d9091SRichard Lowe	 * We don't -restore- them unless we have to in update_sregs.
10495d9d9091SRichard Lowe	 *
10505d9d9091SRichard Lowe	 * Since userland -can't- change fsbase or gsbase directly,
10515d9d9091SRichard Lowe	 * we don't bother to capture them here.
10525d9d9091SRichard Lowe	 */
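	/*
	 * %ebx is zeroed first so each 16-bit selector read below is stored
	 * zero-extended into its 64-bit slot in the regs structure.
	 */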
10535d9d9091SRichard Lowe	xorl	%ebx, %ebx
10545d9d9091SRichard Lowe	movw	%ds, %bx
10555d9d9091SRichard Lowe	movq	%rbx, REGOFF_DS(%rsp)
10565d9d9091SRichard Lowe	movw	%es, %bx
10575d9d9091SRichard Lowe	movq	%rbx, REGOFF_ES(%rsp)
10585d9d9091SRichard Lowe	movw	%fs, %bx
10595d9d9091SRichard Lowe	movq	%rbx, REGOFF_FS(%rsp)
10605d9d9091SRichard Lowe	movw	%gs, %bx
10615d9d9091SRichard Lowe	movq	%rbx, REGOFF_GS(%rsp)
10625d9d9091SRichard Lowe
10635d9d9091SRichard Lowe	/*
10645d9d9091SRichard Lowe	 * If we're trying to use TRAPTRACE though, I take that back: we're
10655d9d9091SRichard Lowe	 * probably debugging some problem in the SWAPGS logic and want to know
10665d9d9091SRichard Lowe	 * what the incoming gsbase was.
10675d9d9091SRichard Lowe	 *
10685d9d9091SRichard Lowe	 * Since we already did SWAPGS, record the KGSBASE.
10695d9d9091SRichard Lowe	 */
10705d9d9091SRichard Lowe#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
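	/*
	 * The MSR index goes in %ecx; rdmsr returns the 64-bit value split
	 * across %edx:%eax, so both halves are stored into the saved
	 * GSBASE slot separately.
	 */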
10715d9d9091SRichard Lowe	movl	$MSR_AMD_KGSBASE, %ecx
10725d9d9091SRichard Lowe	rdmsr
10735d9d9091SRichard Lowe	movl	%eax, REGOFF_GSBASE(%rsp)
10745d9d9091SRichard Lowe	movl	%edx, REGOFF_GSBASE+4(%rsp)
10755d9d9091SRichard Lowe#endif
10765d9d9091SRichard Lowe
10775d9d9091SRichard Lowe	/*
10785d9d9091SRichard Lowe	 * Application state saved in the regs structure on the stack
10795d9d9091SRichard Lowe	 * %eax is the syscall number
10805d9d9091SRichard Lowe	 * %rsp is the thread's stack, %r15 is curthread
10815d9d9091SRichard Lowe	 * REG_RSP(%rsp) is the user's stack
10825d9d9091SRichard Lowe	 */
10835d9d9091SRichard Lowe
10845d9d9091SRichard Lowe	SYSCALL_TRAPTRACE($TT_SYSENTER)
10855d9d9091SRichard Lowe
10865d9d9091SRichard Lowe	movq	%rsp, %rbp
10875d9d9091SRichard Lowe
10885d9d9091SRichard Lowe	movq	T_LWP(%r15), %r14
10895d9d9091SRichard Lowe	ASSERT_NO_RUPDATE_PENDING(%r14)
10905d9d9091SRichard Lowe
10915d9d9091SRichard Lowe	ENABLE_INTR_FLAGS
10925d9d9091SRichard Lowe
10935d9d9091SRichard Lowe	/*
10945d9d9091SRichard Lowe	 * Catch a 64-bit process trying to issue the sysenter instruction
10955d9d9091SRichard Lowe	 * on Nocona-based systems.
10965d9d9091SRichard Lowe	 */
10975d9d9091SRichard Lowe	movq	LWP_PROCP(%r14), %rax
10985d9d9091SRichard Lowe	cmpq	$DATAMODEL_ILP32, P_MODEL(%rax)
10995d9d9091SRichard Lowe	je	7f
11005d9d9091SRichard Lowe
11015d9d9091SRichard Lowe	/*
11025d9d9091SRichard Lowe	 * For a non-32-bit process, simulate a #ud, since that's what
11035d9d9091SRichard Lowe	 * native hardware does.  The traptrace entry (above) will
11045d9d9091SRichard Lowe	 * let you know what really happened.
11055d9d9091SRichard Lowe	 */
11065d9d9091SRichard Lowe	movq	$T_ILLINST, REGOFF_TRAPNO(%rsp)
11075d9d9091SRichard Lowe	movq	REGOFF_CS(%rsp), %rdi
11085d9d9091SRichard Lowe	movq	%rdi, REGOFF_ERR(%rsp)
11095d9d9091SRichard Lowe	movq	%rsp, %rdi
11105d9d9091SRichard Lowe	movq	REGOFF_RIP(%rsp), %rsi
11115d9d9091SRichard Lowe	movl	%gs:CPU_ID, %edx
11125d9d9091SRichard Lowe	call	trap
11135d9d9091SRichard Lowe	jmp	_sys_rtt
11145d9d9091SRichard Lowe7:
11155d9d9091SRichard Lowe
11165d9d9091SRichard Lowe	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
11175d9d9091SRichard Lowe	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate calls) */
11185d9d9091SRichard Lowe
11195d9d9091SRichard Lowe	ASSERT_LWPTOREGS(%r14, %rsp)
11205d9d9091SRichard Lowe
11215d9d9091SRichard Lowe	incq	%gs:CPU_STATS_SYS_SYSCALL
11225d9d9091SRichard Lowe
11235d9d9091SRichard Lowe	/*
11245d9d9091SRichard Lowe	 * Make some space for MAXSYSARGS (currently 8) 32-bit args
11255d9d9091SRichard Lowe	 * placed into 64-bit (long) arg slots, plus one 64-bit
11265d9d9091SRichard Lowe	 * (long) arg count, maintaining 16-byte alignment.
11275d9d9091SRichard Lowe	 */
11285d9d9091SRichard Lowe	subq	$SYS_DROP, %rsp
11295d9d9091SRichard Lowe	movb	$LWP_SYS, LWP_STATE(%r14)
11305d9d9091SRichard Lowe	movq	%r15, %rdi
11315d9d9091SRichard Lowe	movq	%rsp, %rsi
11325d9d9091SRichard Lowe	call	syscall_entry
11335d9d9091SRichard Lowe
11345d9d9091SRichard Lowe	/*
11355d9d9091SRichard Lowe	 * Fetch the arguments copied onto the kernel stack and put
11365d9d9091SRichard Lowe	 * them in the right registers to invoke a C-style syscall handler.
11375d9d9091SRichard Lowe	 * %rax holds a pointer to the sysent entry for this syscall.
11385d9d9091SRichard Lowe	 */
11395d9d9091SRichard Lowe	movq	%rax, %rbx
11405d9d9091SRichard Lowe	movl	0(%rsp), %edi
11415d9d9091SRichard Lowe	movl	8(%rsp), %esi
11425d9d9091SRichard Lowe	movl	0x10(%rsp), %edx
11435d9d9091SRichard Lowe	movl	0x18(%rsp), %ecx
11445d9d9091SRichard Lowe	movl	0x20(%rsp), %r8d
11455d9d9091SRichard Lowe	movl	0x28(%rsp), %r9d
11465d9d9091SRichard Lowe
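	/*
	 * SY_CALLC is the sy_callc handler slot of the sysent entry, and
	 * INDIRECT_CALL_REG makes the indirect call through %rax in a
	 * retpoline-safe manner where that mitigation is in use.
	 */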
11475d9d9091SRichard Lowe	movq	SY_CALLC(%rbx), %rax
11485d9d9091SRichard Lowe	INDIRECT_CALL_REG(rax)
11495d9d9091SRichard Lowe
11505d9d9091SRichard Lowe	movq	%rbp, %rsp	/* pop the args */
11515d9d9091SRichard Lowe
11525d9d9091SRichard Lowe	/*
11535d9d9091SRichard Lowe	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
11545d9d9091SRichard Lowe	 * On the 32-bit kernel, they always return that value in %edx:%eax
11555d9d9091SRichard Lowe	 * as required by the 32-bit ABI.
11565d9d9091SRichard Lowe	 *
11575d9d9091SRichard Lowe	 * Simulate the same behaviour by unconditionally splitting the
11585d9d9091SRichard Lowe	 * return value in the same way.
11595d9d9091SRichard Lowe	 */
11605d9d9091SRichard Lowe	movq	%rax, %r13
11615d9d9091SRichard Lowe	shrq	$32, %r13	/* upper 32 bits of the return value */
11625d9d9091SRichard Lowe	movl	%eax, %r12d	/* lower 32 bits of the return value */
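	/*
	 * In C terms, this is (uint32_t)(rval >> 32) for the high half and
	 * (uint32_t)rval for the low half of the 64-bit return value.
	 */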
11635d9d9091SRichard Lowe
11645d9d9091SRichard Lowe	/*
11655d9d9091SRichard Lowe	 * Optimistically assume that there's no post-syscall
11665d9d9091SRichard Lowe	 * work to do.  (This is to avoid having to call syscall_mstate()
11675d9d9091SRichard Lowe	 * with interrupts disabled)
11685d9d9091SRichard Lowe	 */
11695d9d9091SRichard Lowe	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
11705d9d9091SRichard Lowe
11715d9d9091SRichard Lowe	/*
11725d9d9091SRichard Lowe	 * We must protect ourselves from being descheduled here;
11735d9d9091SRichard Lowe	 * if we were, and we ended up on another cpu, or another
11745d9d9091SRichard Lowe	 * lwp got in ahead of us, it could change the segment
11755d9d9091SRichard Lowe	 * registers without us noticing before we return to userland.
11765d9d9091SRichard Lowe	 *
11775d9d9091SRichard Lowe	 * This cli is undone in the tr_sysexit trampoline code.
11785d9d9091SRichard Lowe	 */
11795d9d9091SRichard Lowe	cli
11805d9d9091SRichard Lowe	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
11815d9d9091SRichard Lowe	jne	_full_syscall_postsys32
11825d9d9091SRichard Lowe	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
11835d9d9091SRichard Lowe
11845d9d9091SRichard Lowe	/*
11855d9d9091SRichard Lowe	 * To get back to userland, load up the 32-bit registers and
11865d9d9091SRichard Lowe	 * sysexit back where we came from.
11875d9d9091SRichard Lowe	 */
11885d9d9091SRichard Lowe
11895d9d9091SRichard Lowe	/*
11905d9d9091SRichard Lowe	 * Interrupts will be turned on by the 'sti' executed just before
11915d9d9091SRichard Lowe	 * sysexit.  The following ensures that restoring the user's rflags
11925d9d9091SRichard Lowe	 * doesn't enable interrupts too soon.
11935d9d9091SRichard Lowe	 */
11945d9d9091SRichard Lowe	andq	$_BITNOT(PS_IE), REGOFF_RFL(%rsp)
11955d9d9091SRichard Lowe
11965d9d9091SRichard Lowe	/*
11975d9d9091SRichard Lowe	 * Clobber %r11 as we check CR0.TS.
11985d9d9091SRichard Lowe	 */
11995d9d9091SRichard Lowe	ASSERT_CR0TS_ZERO(%r11)
12005d9d9091SRichard Lowe
12015d9d9091SRichard Lowe	/*
12025d9d9091SRichard Lowe	 * (There's no point in loading up %edx because the sysexit
12035d9d9091SRichard Lowe	 * mechanism smashes it.)
12045d9d9091SRichard Lowe	 */
12055d9d9091SRichard Lowe	movl	%r12d, %eax
12065d9d9091SRichard Lowe	movl	REGOFF_RBX(%rsp), %ebx
12075d9d9091SRichard Lowe	movl	REGOFF_RBP(%rsp), %ebp
12085d9d9091SRichard Lowe	movl	REGOFF_RSI(%rsp), %esi
12095d9d9091SRichard Lowe	movl	REGOFF_RDI(%rsp), %edi
12105d9d9091SRichard Lowe
12115d9d9091SRichard Lowe	movl	REGOFF_RIP(%rsp), %edx	/* sysexit: %edx -> %eip */
12125d9d9091SRichard Lowe	pushq	REGOFF_RFL(%rsp)
12135d9d9091SRichard Lowe	popfq
12145d9d9091SRichard Lowe	movl	REGOFF_RSP(%rsp), %ecx	/* sysexit: %ecx -> %esp */
12155d9d9091SRichard Lowe	ALTENTRY(sys_sysenter_swapgs_sysexit)
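	/*
	 * x86_md_clear flushes microarchitectural buffer state (the MDS
	 * mitigation) on CPUs that need it before we cross back to userland.
	 */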
12165d9d9091SRichard Lowe	call	x86_md_clear
12175d9d9091SRichard Lowe	jmp	tr_sysexit
12185d9d9091SRichard Lowe	SET_SIZE(sys_sysenter_swapgs_sysexit)
12195d9d9091SRichard Lowe	SET_SIZE(sys_sysenter)
12205d9d9091SRichard Lowe	SET_SIZE(_sys_sysenter_post_swapgs)
12215d9d9091SRichard Lowe	SET_SIZE(brand_sys_sysenter)
12225d9d9091SRichard Lowe
12235d9d9091SRichard Lowe/*
12245d9d9091SRichard Lowe * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
12255d9d9091SRichard Lowe * the generic i386 libc to do system calls. We do a small amount of setup
12265d9d9091SRichard Lowe * before jumping into the existing sys_syscall32 path.
12275d9d9091SRichard Lowe */
12285d9d9091SRichard Lowe
12295d9d9091SRichard Lowe	ENTRY_NP(brand_sys_syscall_int)
12305d9d9091SRichard Lowe	SWAPGS				/* kernel gsbase */
12315d9d9091SRichard Lowe	XPV_TRAP_POP
12325d9d9091SRichard Lowe	call	smap_enable
12335d9d9091SRichard Lowe	BRAND_CALLBACK(BRAND_CB_INT91, BRAND_URET_FROM_INTR_STACK())
12345d9d9091SRichard Lowe	jmp	nopop_syscall_int
12355d9d9091SRichard Lowe
12365d9d9091SRichard Lowe	ALTENTRY(sys_syscall_int)
12375d9d9091SRichard Lowe	SWAPGS				/* kernel gsbase */
12385d9d9091SRichard Lowe	XPV_TRAP_POP
12395d9d9091SRichard Lowe	call	smap_enable
12405d9d9091SRichard Lowe
12415d9d9091SRichard Lowenopop_syscall_int:
12425d9d9091SRichard Lowe	movq	%gs:CPU_THREAD, %r15
12435d9d9091SRichard Lowe	movq	T_STACK(%r15), %rsp
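	/*
	 * The "movl %eax, %eax" below zero-extends the 32-bit syscall
	 * number, clearing any stale bits in the upper half of %rax.
	 */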
12445d9d9091SRichard Lowe	movl	%eax, %eax
12455d9d9091SRichard Lowe	/*
12465d9d9091SRichard Lowe	 * Set t_post_sys on this thread to force ourselves out via the slow
12475d9d9091SRichard Lowe	 * path. It might be possible at some later date to optimize this out
12485d9d9091SRichard Lowe	 * and use a faster return mechanism.
12495d9d9091SRichard Lowe	 */
12505d9d9091SRichard Lowe	movb	$1, T_POST_SYS(%r15)
12515d9d9091SRichard Lowe	CLEAN_CS
12525d9d9091SRichard Lowe	jmp	_syscall32_save
12535d9d9091SRichard Lowe	/*
12545d9d9091SRichard Lowe	 * There should be no instructions between this label and SWAPGS/IRET
12555d9d9091SRichard Lowe	 * or we could end up breaking branded zone support. See the usage of
12565d9d9091SRichard Lowe	 * this label in lx_brand_int80_callback and sn1_brand_int91_callback
12575d9d9091SRichard Lowe	 * for examples.
12585d9d9091SRichard Lowe	 *
12595d9d9091SRichard Lowe	 * We want to swapgs to maintain the invariant that all entries into
12605d9d9091SRichard Lowe	 * tr_iret_user are done on the user gsbase.
12615d9d9091SRichard Lowe	 */
12625d9d9091SRichard Lowe	ALTENTRY(sys_sysint_swapgs_iret)
12635d9d9091SRichard Lowe	call	x86_md_clear
12645d9d9091SRichard Lowe	SWAPGS
12655d9d9091SRichard Lowe	jmp	tr_iret_user
12665d9d9091SRichard Lowe	/*NOTREACHED*/
12675d9d9091SRichard Lowe	SET_SIZE(sys_sysint_swapgs_iret)
12685d9d9091SRichard Lowe	SET_SIZE(sys_syscall_int)
12695d9d9091SRichard Lowe	SET_SIZE(brand_sys_syscall_int)
12705d9d9091SRichard Lowe
12715d9d9091SRichard Lowe/*
12725d9d9091SRichard Lowe * Legacy 32-bit applications and old libc implementations do lcalls;
12735d9d9091SRichard Lowe * we should never get here because the LDT entry containing the syscall
12745d9d9091SRichard Lowe * segment descriptor has the "segment present" bit cleared, which means
12755d9d9091SRichard Lowe * we end up processing those system calls in trap() via a not-present trap.
12765d9d9091SRichard Lowe *
12775d9d9091SRichard Lowe * We do it this way because a call gate unhelpfully does -nothing- to the
12785d9d9091SRichard Lowe * interrupt flag bit, so an interrupt can be taken just after the lcall
12795d9d9091SRichard Lowe * completes, but just before the swapgs takes effect.   Thus the INTR_PUSH and
12805d9d9091SRichard Lowe * INTR_POP paths would have to be slightly more complex to dance around
12815d9d9091SRichard Lowe * this problem, and end up depending explicitly on the first
12825d9d9091SRichard Lowe * instruction of this handler being either swapgs or cli.
12835d9d9091SRichard Lowe */
12845d9d9091SRichard Lowe
12855d9d9091SRichard Lowe	ENTRY_NP(sys_lcall32)
12865d9d9091SRichard Lowe	SWAPGS				/* kernel gsbase */
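	/*
	 * Build a minimal frame before panicking; the zero pushed below
	 * presumably serves as a terminating return address so stack
	 * walkers stop cleanly at this never-returning frame.
	 */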
12875d9d9091SRichard Lowe	pushq	$0
12885d9d9091SRichard Lowe	pushq	%rbp
12895d9d9091SRichard Lowe	movq	%rsp, %rbp
12905d9d9091SRichard Lowe	leaq	__lcall_panic_str(%rip), %rdi
12915d9d9091SRichard Lowe	xorl	%eax, %eax
12925d9d9091SRichard Lowe	call	panic
12935d9d9091SRichard Lowe	SET_SIZE(sys_lcall32)
12945d9d9091SRichard Lowe
12955d9d9091SRichard Lowe__lcall_panic_str:
12965d9d9091SRichard Lowe	.string	"sys_lcall32: shouldn't be here!"
12975d9d9091SRichard Lowe
12985d9d9091SRichard Lowe/*
12995d9d9091SRichard Lowe * Declare a uintptr_t which covers the entire pc range of syscall
13005d9d9091SRichard Lowe * handlers for the stack walkers that need this.
13015d9d9091SRichard Lowe */
13025d9d9091SRichard Lowe	.align	CPTRSIZE
13035d9d9091SRichard Lowe	.globl	_allsyscalls_size
13045d9d9091SRichard Lowe	.type	_allsyscalls_size, @object
13055d9d9091SRichard Lowe_allsyscalls_size:
13065d9d9091SRichard Lowe	.NWORD	. - _allsyscalls
13075d9d9091SRichard Lowe	SET_SIZE(_allsyscalls_size)
13085d9d9091SRichard Lowe
13095d9d9091SRichard Lowe/*
13105d9d9091SRichard Lowe * These are the thread context handlers for lwps using sysenter/sysexit.
13115d9d9091SRichard Lowe */
13125d9d9091SRichard Lowe
13135d9d9091SRichard Lowe	/*
13145d9d9091SRichard Lowe	 * Setting this value to zero as we switch away causes the
13155d9d9091SRichard Lowe	 * stack-pointer-on-sysenter to be NULL, ensuring that we
13165d9d9091SRichard Lowe	 * don't silently corrupt another (preempted) thread's stack
13175d9d9091SRichard Lowe	 * when running an lwp that (somehow) didn't get sep_restore'd.
13185d9d9091SRichard Lowe	 */
13195d9d9091SRichard Lowe	ENTRY_NP(sep_save)
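	/*
	 * wrmsr writes %edx:%eax to the MSR selected by %ecx, so zeroing
	 * both halves clears MSR_INTC_SEP_ESP (the sysenter stack pointer).
	 */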
13205d9d9091SRichard Lowe	xorl	%edx, %edx
13215d9d9091SRichard Lowe	xorl	%eax, %eax
13225d9d9091SRichard Lowe	movl	$MSR_INTC_SEP_ESP, %ecx
13235d9d9091SRichard Lowe	wrmsr
13245d9d9091SRichard Lowe	ret
13255d9d9091SRichard Lowe	SET_SIZE(sep_save)
13265d9d9091SRichard Lowe
13275d9d9091SRichard Lowe	/*
13285d9d9091SRichard Lowe	 * Update the kernel stack pointer as we resume onto this cpu.
13295d9d9091SRichard Lowe	 */
13305d9d9091SRichard Lowe	ENTRY_NP(sep_restore)
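	/*
	 * Split the 64-bit kernel stack pointer passed in %rdi into the
	 * %edx:%eax pair that wrmsr expects.
	 */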
13315d9d9091SRichard Lowe	movq	%rdi, %rdx
13325d9d9091SRichard Lowe	shrq	$32, %rdx
13335d9d9091SRichard Lowe	movl	%edi, %eax
13345d9d9091SRichard Lowe	movl	$MSR_INTC_SEP_ESP, %ecx
13355d9d9091SRichard Lowe	wrmsr
13365d9d9091SRichard Lowe	ret
13375d9d9091SRichard Lowe	SET_SIZE(sep_restore)
1338