/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 * Copyright 2024 MNX Cloud, Inc.
 */

#include <sys/asm_linkage.h>
#include <sys/asm_misc.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/psw.h>
#include <sys/machbrand.h>

#include <sys/segments.h>
#include <sys/pcb.h>
#include <sys/trap.h>
#include <sys/ftrace.h>
#include <sys/traptrace.h>
#include <sys/clock.h>
#include <sys/model.h>
#include <sys/panic.h>

#if defined(__xpv)
#include <sys/hypervisor.h>
#endif

#include "assym.h"

/*
 * We implement five flavours of system call entry points
 *
 * -	syscall/sysretq		(amd64 generic)
 * -	syscall/sysretl		(i386 plus SYSC bit)
 * -	sysenter/sysexit	(i386 plus SEP bit)
 * -	int/iret		(i386 generic)
 * -	lcall/iret		(i386 generic)
 *
 * The current libc included in Solaris uses int/iret as the base unoptimized
 * kernel entry method. Older libc implementations and legacy binaries may use
 * the lcall call gate, so it must continue to be supported.
 *
 * System calls that use an lcall call gate are processed in trap() via a
 * segment-not-present trap, i.e. lcalls are extremely slow(!).
 *
 * The basic pattern used in the 32-bit SYSC handler at this point in time is
 * to have the bare minimum of assembler, and get to the C handlers as
 * quickly as possible.
 *
 * The 64-bit handler is much closer to the sparcv9 handler; that's
 * because of passing arguments in registers.  The 32-bit world still
 * passes arguments on the stack -- that makes that handler substantially
 * more complex.
 *
 * The two handlers share a few code fragments which are broken
 * out into preprocessor macros below.
 *
 * XX64	come back and speed all this up later.  The 32-bit stuff looks
 * especially easy to speed up the argument copying part ..
 *
 *
 * Notes about segment register usage (c.f. the 32-bit kernel)
 *
 * In the 32-bit kernel, segment registers are dutifully saved and
 * restored on all mode transitions because the kernel uses them directly.
 * When the processor is running in 64-bit mode, segment registers are
 * largely ignored.
 *
 * %cs and %ss
 *	controlled by the hardware mechanisms that make mode transitions
 *
 * The remaining segment registers have to either be pointing at a valid
 * descriptor i.e. with the 'present' bit set, or they can be NULL descriptors
 *
 * %ds and %es
 *	always ignored
 *
 * %fs and %gs
 *	fsbase and gsbase are used to control the place they really point at.
 *	The kernel only depends on %gs, and controls its own gsbase via swapgs
 *
 * Note that loading segment registers is still costly because the GDT
 * lookup still happens (this is because the hardware can't know that we're
 * not setting up these segment registers for a 32-bit program).  Thus we
 * avoid doing this in the syscall path, and defer them to lwp context switch
 * handlers, so the register values remain virtualized to the lwp.
 */

#if defined(SYSCALLTRACE)
#define	ORL_SYSCALLTRACE(r32)		\
	orl	syscalltrace(%rip), r32
#else
#define	ORL_SYSCALLTRACE(r32)
#endif

/*
 * In the 32-bit kernel, we do absolutely nothing before getting into the
 * brand callback checks.  In 64-bit land, we do swapgs and then come here.
 * We assume that the %rsp- and %r15-stashing fields in the CPU structure
 * are still unused.
 *
 * Check if a brand_mach_ops callback is defined for the specified callback_id
 * type.  If so invoke it with the kernel's %gs value loaded and the following
 * data on the stack:
 *
 *	   stack:  --------------------------------------
 *	      32 | callback pointer			|
 *	    | 24 | user (or interrupt) stack pointer	|
 *	    | 16 | lwp pointer				|
 *	    v  8 | userland return address		|
 *	       0 | callback wrapper return addr		|
 *		   --------------------------------------
 *
 * Since we're pushing the userland return address onto the kernel stack
 * we need to get that address without accessing the user's stack (since we
 * can't trust that data).  There are different ways to get the userland
 * return address depending on how the syscall trap was made:
 *
 * a) For sys_syscall and sys_syscall32 the return address is in %rcx.
 * b) For sys_sysenter the return address is in %rdx.
 * c) For sys_int80 and sys_syscall_int (int91), upon entry into the macro,
 *    the stack pointer points at the state saved when we took the interrupt:
 *	 ------------------------
 *    |  | user's %ss		|
 *    |  | user's %esp		|
 *    |  | EFLAGS register	|
 *    v  | user's %cs		|
 *	 | user's %eip		|
 *	 ------------------------
 *
 * The 2nd parameter to the BRAND_CALLBACK macro is either the
 * BRAND_URET_FROM_REG or BRAND_URET_FROM_INTR_STACK macro.  These macros are
 * used to generate the proper code to get the userland return address for
 * each syscall entry point.
 *
 * The interface to the brand callbacks on the 64-bit kernel assumes %r15
 * is available as a scratch register within the callback.  If the callback
 * returns within the kernel then this macro will restore %r15.  If the
 * callback is going to return directly to userland then it should restore
 * %r15 before returning to userland.
 */
#define	BRAND_URET_FROM_REG(rip_reg)					\
	pushq	rip_reg			/* push the return address */

/*
 * The interrupt stack pointer we saved on entry to the BRAND_CALLBACK macro
 * is currently pointing at the user return address (%eip).
 */
#define	BRAND_URET_FROM_INTR_STACK()					\
	movq	%gs:CPU_RTMP_RSP, %r15	/* grab the intr. stack pointer */ ;\
	pushq	(%r15)			/* push the return address */

#define	BRAND_CALLBACK(callback_id, push_userland_ret)			\
	movq	%rsp, %gs:CPU_RTMP_RSP	/* save the stack pointer */	;\
	movq	%r15, %gs:CPU_RTMP_R15	/* save %r15 */			;\
	movq	%gs:CPU_THREAD, %r15	/* load the thread pointer */	;\
	movq	T_STACK(%r15), %rsp	/* switch to the kernel stack */ ;\
	subq	$16, %rsp		/* save space for 2 pointers */	;\
	pushq	%r14			/* save %r14 */			;\
	movq	%gs:CPU_RTMP_RSP, %r14					;\
	movq	%r14, 8(%rsp)		/* stash the user stack pointer */ ;\
	popq	%r14			/* restore %r14 */		;\
	movq	T_LWP(%r15), %r15	/* load the lwp pointer */	;\
	pushq	%r15			/* push the lwp pointer */	;\
	movq	LWP_PROCP(%r15), %r15	/* load the proc pointer */	;\
	movq	P_BRAND(%r15), %r15	/* load the brand pointer */	;\
	movq	B_MACHOPS(%r15), %r15	/* load the machops pointer */	;\
	movq	_CONST(_MUL(callback_id, CPTRSIZE))(%r15), %r15		;\
	cmpq	$0, %r15						;\
	je	1f							;\
	movq	%r15, 16(%rsp)		/* save the callback pointer */	;\
	push_userland_ret		/* push the return address */	;\
	movq	24(%rsp), %r15		/* load callback pointer */	;\
	INDIRECT_CALL_REG(r15)		/* call callback */		;\
1:	movq	%gs:CPU_RTMP_R15, %r15	/* restore %r15 */		;\
	movq	%gs:CPU_RTMP_RSP, %rsp	/* restore the stack pointer */

#define	MSTATE_TRANSITION(from, to)		\
	movl	$from, %edi;			\
	movl	$to, %esi;			\
	call	syscall_mstate

/*
 * Check to see if a simple (direct) return is possible i.e.
 *
 *	if (t->t_post_sys_ast | syscalltrace |
 *	    lwp->lwp_pcb.pcb_rupdate == 1)
 *		do full version	;
 *
 * Preconditions:
 * -	t is curthread
 * Postconditions:
 * -	condition code NE is set if post-sys is too complex
 * -	rtmp is zeroed if it isn't (we rely on this!)
 * -	ltmp is smashed
 */
#define	CHECK_POSTSYS_NE(t, ltmp, rtmp)			\
	movq	T_LWP(t), ltmp;				\
	movzbl	PCB_RUPDATE(ltmp), rtmp;		\
	ORL_SYSCALLTRACE(rtmp);				\
	orl	T_POST_SYS_AST(t), rtmp;		\
	cmpl	$0, rtmp

/*
 * Fix up the lwp, thread, and eflags for a successful return
 *
 * Preconditions:
 * -	zwreg contains zero
 */
#define	SIMPLE_SYSCALL_POSTSYS(t, lwp, zwreg)		\
	movb	$LWP_USER, LWP_STATE(lwp);		\
	movw	zwreg, T_SYSNUM(t);			\
	andb	$_CONST(0xffff - PS_C), REGOFF_RFL(%rsp)

/*
 * ASSERT(lwptoregs(lwp) == rp);
 *
 * This may seem obvious, but very odd things happen if this
 * assertion is false
 *
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	%r11 is smashed
 *
 * ASSERT(rp->r_cs == descnum)
 *
 * The code selector is written into the regs structure when the
 * lwp stack is created.  We use this ASSERT to validate that
 * the regs structure really matches how we came in.
 *
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	-none-
 *
 * ASSERT(lwp->lwp_pcb.pcb_rupdate == 0);
 *
 * If this is false, it meant that we returned to userland without
 * updating the segment registers as we were supposed to.
 *
 * Note that we must ensure no interrupts or other traps intervene
 * between entering privileged mode and performing the assertion,
 * otherwise we may perform a context switch on the thread, which
 * will end up setting pcb_rupdate to 1 again.
 *
 * ASSERT(%cr0 & CR0_TS == 0);
 * Preconditions:
 *	(%rsp is ready for normal call sequence)
 * Postconditions (if assertion is true):
 *	(specified register is clobbered)
 *
 * Check to make sure that we are returning to user land and that CR0.TS
 * is not set. This is required as part of the eager FPU (see
 * uts/intel/os/fpu.c for more information).
 */

#if defined(DEBUG)

__lwptoregs_msg:
	.string	"syscall_asm_amd64.s:%d lwptoregs(%p) [%p] != rp [%p]"

__codesel_msg:
	.string	"syscall_asm_amd64.s:%d rp->r_cs [%ld] != %ld"

__no_rupdate_msg:
	.string	"syscall_asm_amd64.s:%d lwp %p, pcb_rupdate != 0"

__bad_ts_msg:
	.string	"syscall_asm_amd64.s:%d CR0.TS set on user return"

#define	ASSERT_LWPTOREGS(lwp, rp)		\
	movq	LWP_REGS(lwp), %r11;		\
	cmpq	rp, %r11;			\
	je	7f;				\
	leaq	__lwptoregs_msg(%rip), %rdi;	\
	movl	$__LINE__, %esi;		\
	movq	lwp, %rdx;			\
	movq	%r11, %rcx;			\
	movq	rp, %r8;			\
	xorl	%eax, %eax;			\
	call	panic;				\
7:

#define	ASSERT_NO_RUPDATE_PENDING(lwp)		\
	testb	$0x1, PCB_RUPDATE(lwp);		\
	je	8f;				\
	movq	lwp, %rdx;			\
	leaq	__no_rupdate_msg(%rip), %rdi;	\
	movl	$__LINE__, %esi;		\
	xorl	%eax, %eax;			\
	call	panic;				\
8:

#define	ASSERT_CR0TS_ZERO(reg)			\
	movq	%cr0, reg;			\
	testq	$CR0_TS, reg;			\
	jz	9f;				\
	leaq	__bad_ts_msg(%rip), %rdi;	\
	movl	$__LINE__, %esi;		\
	xorl	%eax, %eax;			\
	call	panic;				\
9:

#else
#define	ASSERT_LWPTOREGS(lwp, rp)
#define	ASSERT_NO_RUPDATE_PENDING(lwp)
#define	ASSERT_CR0TS_ZERO(reg)
#endif

/*
 * Do the traptrace thing and restore any registers we used
 * in situ.  Assumes that %rsp is pointing at the base of
 * the struct regs, obviously ..
 */
#ifdef TRAPTRACE
#define	SYSCALL_TRAPTRACE(ttype)				\
	TRACE_PTR(%rdi, %rbx, %ebx, %rcx, ttype);		\
	TRACE_REGS(%rdi, %rsp, %rbx, %rcx);			\
	TRACE_STAMP(%rdi);	/* rdtsc clobbers %eax, %edx */	\
	movq	REGOFF_RAX(%rsp), %rax;				\
	movq	REGOFF_RBX(%rsp), %rbx;				\
	movq	REGOFF_RCX(%rsp), %rcx;				\
	movq	REGOFF_RDX(%rsp), %rdx;				\
	movl	%eax, TTR_SYSNUM(%rdi);				\
	movq	REGOFF_RDI(%rsp), %rdi

#define	SYSCALL_TRAPTRACE32(ttype)				\
	SYSCALL_TRAPTRACE(ttype);				\
	/* paranoia: clean the top 32-bits of the registers */	\
	orl	%eax, %eax;					\
	orl	%ebx, %ebx;					\
	orl	%ecx, %ecx;					\
	orl	%edx, %edx;					\
	orl	%edi, %edi
#else	/* TRAPTRACE */
#define	SYSCALL_TRAPTRACE(ttype)
#define	SYSCALL_TRAPTRACE32(ttype)
#endif	/* TRAPTRACE */

/*
 * The 64-bit libc syscall wrapper does this:
 *
 * fn(<args>)
 * {
 *	movq	%rcx, %r10	-- because syscall smashes %rcx
 *	movl	$CODE, %eax
 *	syscall
 *	<error processing>
 * }
 *
 * Thus when we come into the kernel:
 *
 *	%rdi, %rsi, %rdx, %r10, %r8, %r9 contain first six args
 *	%rax is the syscall number
 *	%r12-%r15 contain caller state
 *
 * The syscall instruction arranges that:
 *
 *	%rcx contains the return %rip
 *	%r11d contains bottom 32-bits of %rflags
 *	%rflags is masked (as determined by the SFMASK msr)
 *	%cs is set to UCS_SEL (as determined by the STAR msr)
 *	%ss is set to UDS_SEL (as determined by the STAR msr)
 *	%rip is set to sys_syscall (as determined by the LSTAR msr)
 *
 * Or in other words, we have no registers available at all.
 * Only swapgs can save us!
 *
 * Under the hypervisor, the swapgs has happened already.  However, the
 * state of the world is very different from that we're familiar with.
 *
 * In particular, we have a stack structure like that for interrupt
 * gates, except that the %cs and %ss registers are modified for reasons
 * that are not entirely clear.  Critically, the %rcx/%r11 values do
 * *not* reflect the usage of those registers under a 'real' syscall[1];
 * the stack, therefore, looks like this:
 *
 *	0x0(rsp)	potentially junk %rcx
 *	0x8(rsp)	potentially junk %r11
 *	0x10(rsp)	user %rip
 *	0x18(rsp)	modified %cs
 *	0x20(rsp)	user %rflags
 *	0x28(rsp)	user %rsp
 *	0x30(rsp)	modified %ss
 *
 *
 * and before continuing on, we must load the %rip into %rcx and the
 * %rflags into %r11.
 *
 * [1] They used to, and we relied on it, but this was broken in 3.1.1.
 * Sigh.
 */
#if defined(__xpv)
#define	XPV_SYSCALL_PROD						\
	movq	0x10(%rsp), %rcx;					\
	movq	0x20(%rsp), %r11;					\
	movq	0x28(%rsp), %rsp
#else
#define	XPV_SYSCALL_PROD /* nothing */
#endif

	ENTRY_NP2(brand_sys_syscall,_allsyscalls)
	SWAPGS				/* kernel gsbase */
	XPV_SYSCALL_PROD
	BRAND_CALLBACK(BRAND_CB_SYSCALL, BRAND_URET_FROM_REG(%rcx))
	jmp	noprod_sys_syscall

	ALTENTRY(sys_syscall)
	SWAPGS				/* kernel gsbase */
	XPV_SYSCALL_PROD

noprod_sys_syscall:
	movq	%r15, %gs:CPU_RTMP_R15
	movq	%rsp, %gs:CPU_RTMP_RSP

	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp	/* switch from user to kernel stack */

	ASSERT_UPCALL_MASK_IS_SET

	movl	$UCS_SEL, REGOFF_CS(%rsp)
	movq	%rcx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	movl	%eax, %eax			/* wrapper: sysc# -> %eax */
	movq	%rdi, REGOFF_RDI(%rsp)
	movq	%rsi, REGOFF_RSI(%rsp)
	movq	%rdx, REGOFF_RDX(%rsp)
	movq	%r10, REGOFF_RCX(%rsp)		/* wrapper: %rcx -> %r10 */
	movq	%r10, %rcx			/* arg[3] for direct calls */

	movq	%r8, REGOFF_R8(%rsp)
	movq	%r9, REGOFF_R9(%rsp)
	movq	%rax, REGOFF_RAX(%rsp)
	movq	%rbx, REGOFF_RBX(%rsp)

	movq	%rbp, REGOFF_RBP(%rsp)
	movq	%r10, REGOFF_R10(%rsp)
	movq	%gs:CPU_RTMP_RSP, %r11
	movq	%r11, REGOFF_RSP(%rsp)
	movq	%r12, REGOFF_R12(%rsp)

	movq	%r13, REGOFF_R13(%rsp)
	movq	%r14, REGOFF_R14(%rsp)
	movq	%gs:CPU_RTMP_R15, %r10
	movq	%r10, REGOFF_R15(%rsp)
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * and capturing them involves two serializing instructions,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * If we're trying to use TRAPTRACE though, I take that back: we're
	 * probably debugging some problem in the SWAPGS logic and want to know
	 * what the incoming gsbase was.
	 *
	 * Since we already did SWAPGS, record the KGSBASE.
	 */
#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
	movl	$MSR_AMD_KGSBASE, %ecx
	rdmsr
	movl	%eax, REGOFF_GSBASE(%rsp)
	movl	%edx, REGOFF_GSBASE+4(%rsp)
#endif

	/*
	 * Machine state saved in the regs structure on the stack
	 * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE($TT_SYSC64)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)
	ENABLE_INTR_FLAGS

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	movb	$LWP_SYS, LWP_STATE(%r14)
	incq	LWP_RU_SYSC(%r14)
	movb	$NORMALRETURN, LWP_EOSYS(%r14)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	movw	%ax, T_SYSNUM(%r15)
	movzbl	T_PRE_SYS(%r15), %ebx
	ORL_SYSCALLTRACE(%ebx)
	testl	%ebx, %ebx
	jne	_syscall_pre
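	/*
	 * If pre-syscall work is pending (t_pre_sys or syscalltrace), the
	 * branch above goes to _syscall_pre, which calls pre_syscall() and,
	 * unless the call was aborted, reloads the syscall number and
	 * re-enters at _syscall_invoke below.
	 */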

_syscall_invoke:
	movq	REGOFF_RDI(%rbp), %rdi
	movq	REGOFF_RSI(%rbp), %rsi
	movq	REGOFF_RDX(%rbp), %rdx
	movq	REGOFF_RCX(%rbp), %rcx
	movq	REGOFF_R8(%rbp), %r8
	movq	REGOFF_R9(%rbp), %r9

	cmpl	$NSYSCALL, %eax
	jae	_syscall_ill
	shll	$SYSENT_SIZE_SHIFT, %eax
	leaq	sysent(%rax), %rbx

	movq	SY_CALLC(%rbx), %rax
	INDIRECT_CALL_REG(rax)

	movq	%rax, %r12
	movq	%rdx, %r13

	/*
	 * If the handler returns two ints, then we need to split the
	 * 64-bit return value into two 32-bit values.
	 */
	testw	$SE_32RVAL2, SY_FLAGS(%rbx)
	je	5f
	movq	%r12, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%r12d, %r12d	/* lower 32-bits into %eax */
5:
	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	CLI(%r14)
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_syscall_post

	/*
	 * We need to protect ourselves against non-canonical return values
	 * because Intel doesn't check for them on sysret (AMD does). Canonical
	 * addresses on current amd64 processors only use 48-bits for VAs; an
	 * address is canonical if all upper bits (47-63) are identical. If we
	 * find a non-canonical %rip, we opt to go through the full
	 * _syscall_post path which takes us into an iretq which is not
	 * susceptible to the same problems sysret is.
	 *
	 * We're checking for a canonical address by first doing an arithmetic
	 * shift. This will fill in the remaining bits with the value of bit 63.
	 * If the address were canonical, the register would now have either all
	 * zeroes or all ones in it. Therefore we add one (inducing overflow)
	 * and compare against 1. A canonical address will either be zero or one
	 * at this point, hence the use of ja.
	 *
	 * At this point, r12 and r13 have the return value so we can't use
	 * those registers.
	 */
	movq	REGOFF_RIP(%rsp), %rcx
	sarq	$47, %rcx
	incq	%rcx
	cmpq	$1, %rcx
	ja	_syscall_post


	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	movq	%r12, REGOFF_RAX(%rsp)
	movq	%r13, REGOFF_RDX(%rsp)

	/*
	 * Clobber %r11 as we check CR0.TS.
	 */
	ASSERT_CR0TS_ZERO(%r11)

	/*
	 * To get back to userland, we need the return %rip in %rcx and
	 * the return %rfl in %r11d.  The sysretq instruction also arranges
	 * to fix up %cs and %ss; everything else is our responsibility.
	 */
	movq	REGOFF_RDI(%rsp), %rdi
	movq	REGOFF_RSI(%rsp), %rsi
	movq	REGOFF_RDX(%rsp), %rdx
	/* %rcx used to restore %rip value */

	movq	REGOFF_R8(%rsp), %r8
	movq	REGOFF_R9(%rsp), %r9
	movq	REGOFF_RAX(%rsp), %rax
	movq	REGOFF_RBX(%rsp), %rbx

	movq	REGOFF_RBP(%rsp), %rbp
	movq	REGOFF_R10(%rsp), %r10
	/* %r11 used to restore %rfl value */
	movq	REGOFF_R12(%rsp), %r12

	movq	REGOFF_R13(%rsp), %r13
	movq	REGOFF_R14(%rsp), %r14
	movq	REGOFF_R15(%rsp), %r15

	movq	REGOFF_RIP(%rsp), %rcx
	movl	REGOFF_RFL(%rsp), %r11d

	/*
	 * Unlike other cases, because we need to restore the user stack pointer
	 * before exiting the kernel we must clear the microarch state before
	 * getting here. This should be safe because it means that the only
	 * values on the bus after this are based on the user's registers and
	 * potentially the addresses where we stored them. Given the constraints
	 * of sysret, that's how it has to be.
	 */
	call	x86_md_clear

#if defined(__xpv)
	addq	$REGOFF_RIP, %rsp
#else
	movq	REGOFF_RSP(%rsp), %rsp
#endif

	/*
	 * There can be no instructions between the ALTENTRY below and
	 * SYSRET or we could end up breaking brand support. See label usage
	 * in sn1_brand_syscall_callback for an example.
	 */
	ASSERT_UPCALL_MASK_IS_SET
#if defined(__xpv)
	SYSRETQ
	ALTENTRY(nopop_sys_syscall_swapgs_sysretq)

	/*
	 * We can only get here after executing a brand syscall
	 * interposition callback handler and simply need to
	 * "sysretq" back to userland.  On the hypervisor this
	 * involves the iret hypercall which requires us to construct
	 * just enough of the stack needed for the hypercall.
	 * (rip, cs, rflags, rsp, ss).
	 */
	movq	%rsp, %gs:CPU_RTMP_RSP		/* save user's rsp */
	movq	%gs:CPU_THREAD, %r11
	movq	T_STACK(%r11), %rsp

	movq	%rcx, REGOFF_RIP(%rsp)
	movl	$UCS_SEL, REGOFF_CS(%rsp)
	movq	%gs:CPU_RTMP_RSP, %r11
	movq	%r11, REGOFF_RSP(%rsp)
	pushfq
	popq	%r11				/* hypercall enables ints */
	movq	%r11, REGOFF_RFL(%rsp)
	movl	$UDS_SEL, REGOFF_SS(%rsp)
	addq	$REGOFF_RIP, %rsp
	/*
	 * XXPV: see comment in SYSRETQ definition for future optimization
	 * we could take.
	 */
	ASSERT_UPCALL_MASK_IS_SET
	SYSRETQ
#else
	ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
	jmp	tr_sysretq
#endif
	/*NOTREACHED*/
	SET_SIZE(nopop_sys_syscall_swapgs_sysretq)

_syscall_pre:
	call	pre_syscall
	movl	%eax, %r12d
	testl	%eax, %eax
	jne	_syscall_post_call
	/*
	 * Didn't abort, so reload the syscall args and invoke the handler.
	 */
	movzwl	T_SYSNUM(%r15), %eax
	jmp	_syscall_invoke

_syscall_ill:
	call	nosys
	movq	%rax, %r12
	movq	%rdx, %r13
	jmp	_syscall_post_call

_syscall_post:
	STI
	/*
	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
	 * so that we can account for the extra work it takes us to finish.
	 */
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
_syscall_post_call:
	movq	%r12, %rdi
	movq	%r13, %rsi
	call	post_syscall
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	jmp	_sys_rtt
	SET_SIZE(sys_syscall)
	SET_SIZE(brand_sys_syscall)

	ENTRY_NP(brand_sys_syscall32)
	SWAPGS				/* kernel gsbase */
	XPV_TRAP_POP
	BRAND_CALLBACK(BRAND_CB_SYSCALL32, BRAND_URET_FROM_REG(%rcx))
	jmp	nopop_sys_syscall32

	ALTENTRY(sys_syscall32)
	SWAPGS				/* kernel gsbase */
	XPV_TRAP_POP

nopop_sys_syscall32:
	movl	%esp, %r10d
	movq	%gs:CPU_THREAD, %r15
	movq	T_STACK(%r15), %rsp
	movl	%eax, %eax

	movl	$U32CS_SEL, REGOFF_CS(%rsp)
	movl	%ecx, REGOFF_RIP(%rsp)		/* syscall: %rip -> %rcx */
	movq	%r11, REGOFF_RFL(%rsp)		/* syscall: %rfl -> %r11d */
	movq	%r10, REGOFF_RSP(%rsp)
	movl	$UDS_SEL, REGOFF_SS(%rsp)

_syscall32_save:
	movl	%edi, REGOFF_RDI(%rsp)
	movl	%esi, REGOFF_RSI(%rsp)
	movl	%ebp, REGOFF_RBP(%rsp)
	movl	%ebx, REGOFF_RBX(%rsp)
	movl	%edx, REGOFF_RDX(%rsp)
	movl	%ecx, REGOFF_RCX(%rsp)
	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * If we're trying to use TRAPTRACE though, I take that back: we're
	 * probably debugging some problem in the SWAPGS logic and want to know
	 * what the incoming gsbase was.
	 *
	 * Since we already did SWAPGS, record the KGSBASE.
	 */
#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
	movl	$MSR_AMD_KGSBASE, %ecx
	rdmsr
	movl	%eax, REGOFF_GSBASE(%rsp)
	movl	%edx, REGOFF_GSBASE+4(%rsp)
#endif

	/*
	 * Application state saved in the regs structure on the stack
	 * %eax is the syscall number
	 * %rsp is the thread's stack, %r15 is curthread
	 * REG_RSP(%rsp) is the user's stack
	 */

	SYSCALL_TRAPTRACE32($TT_SYSC)

	movq	%rsp, %rbp

	movq	T_LWP(%r15), %r14
	ASSERT_NO_RUPDATE_PENDING(%r14)

	ENABLE_INTR_FLAGS

	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate call) */

	ASSERT_LWPTOREGS(%r14, %rsp)

	incq	%gs:CPU_STATS_SYS_SYSCALL

	/*
	 * Make some space for MAXSYSARGS (currently 8) 32-bit args placed
	 * into 64-bit (long) arg slots, maintaining 16 byte alignment.  Or
	 * more succinctly:
	 *
	 *	SA(MAXSYSARGS * sizeof (long)) == 64
	 */
#define	SYS_DROP	64		/* drop for args */
	subq	$SYS_DROP, %rsp
	movb	$LWP_SYS, LWP_STATE(%r14)
	movq	%r15, %rdi
	movq	%rsp, %rsi
	call	syscall_entry

	/*
	 * Fetch the arguments copied onto the kernel stack and put
	 * them in the right registers to invoke a C-style syscall handler.
	 * %rax contains the handler address.
	 *
	 * Ideas for making all this go faster of course include simply
	 * forcibly fetching 6 arguments from the user stack under lofault
	 * protection, reverting to copyin_args only when watchpoints
	 * are in effect.
	 *
	 * (If we do this, make sure that exec and libthread leave
	 * enough space at the top of the stack to ensure that we'll
	 * never do a fetch from an invalid page.)
	 *
	 * Lots of ideas here, but they won't really help with bringup B-)
	 * Correctness can't wait, performance can wait a little longer ..
	 */

	movq	%rax, %rbx
	movl	0(%rsp), %edi
	movl	8(%rsp), %esi
	movl	0x10(%rsp), %edx
	movl	0x18(%rsp), %ecx
	movl	0x20(%rsp), %r8d
	movl	0x28(%rsp), %r9d

	movq	SY_CALLC(%rbx), %rax
	INDIRECT_CALL_REG(rax)

	movq	%rbp, %rsp	/* pop the args */

	/*
	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
	 * On the 32-bit kernel, they always return that value in %eax:%edx
	 * as required by the 32-bit ABI.
	 *
	 * Simulate the same behaviour by unconditionally splitting the
	 * return value in the same way.
	 */
	movq	%rax, %r13
	shrq	$32, %r13	/* upper 32-bits into %edx */
	movl	%eax, %r12d	/* lower 32-bits into %eax */

	/*
	 * Optimistically assume that there's no post-syscall
	 * work to do.  (This is to avoid having to call syscall_mstate()
	 * with interrupts disabled)
	 */
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/*
	 * We must protect ourselves from being descheduled here;
	 * If we were, and we ended up on another cpu, or another
	 * lwp got in ahead of us, it could change the segment
	 * registers without us noticing before we return to userland.
	 */
	CLI(%r14)
	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
	jne	_full_syscall_postsys32
	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

	/*
	 * Clobber %r11 as we check CR0.TS.
	 */
	ASSERT_CR0TS_ZERO(%r11)

	/*
	 * To get back to userland, we need to put the return %rip in %rcx and
	 * the return %rfl in %r11d.  The sysret instruction also arranges
	 * to fix up %cs and %ss; everything else is our responsibility.
	 */

	movl	%r12d, %eax			/* %eax: rval1 */
	movl	REGOFF_RBX(%rsp), %ebx
	/* %ecx used for return pointer */
	movl	%r13d, %edx			/* %edx: rval2 */
	movl	REGOFF_RBP(%rsp), %ebp
	movl	REGOFF_RSI(%rsp), %esi
	movl	REGOFF_RDI(%rsp), %edi

	movl	REGOFF_RFL(%rsp), %r11d		/* %r11 -> eflags */
	movl	REGOFF_RIP(%rsp), %ecx		/* %ecx -> %eip */
	/*
	 * Unlike other cases, because we need to restore the user stack pointer
	 * before exiting the kernel we must clear the microarch state before
	 * getting here. This should be safe because it means that the only
	 * values on the bus after this are based on the user's registers and
	 * potentially the addresses where we stored them. Given the constraints
	 * of sysret, that's how it has to be.
	 */
	call	x86_md_clear

	movl	REGOFF_RSP(%rsp), %esp

	ASSERT_UPCALL_MASK_IS_SET
	ALTENTRY(nopop_sys_syscall32_swapgs_sysretl)
	jmp	tr_sysretl
	SET_SIZE(nopop_sys_syscall32_swapgs_sysretl)
	/*NOTREACHED*/

_full_syscall_postsys32:
	STI
	/*
	 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
	 * so that we can account for the extra work it takes us to finish.
	 */
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	movq	%r15, %rdi
	movq	%r12, %rsi			/* rval1 - %eax */
	movq	%r13, %rdx			/* rval2 - %edx */
	call	syscall_exit
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	jmp	_sys_rtt
	SET_SIZE(sys_syscall32)
	SET_SIZE(brand_sys_syscall32)

/*
 * System call handler via the sysenter instruction
 * Used only for 32-bit system calls on the 64-bit kernel.
 *
 * The caller in userland has arranged that:
 *
 * -	%eax contains the syscall number
 * -	%ecx contains the user %esp
 * -	%edx contains the return %eip
 * -	the user stack contains the args to the syscall
 *
 * Hardware and (privileged) initialization code have arranged that by
 * the time the sysenter instruction completes:
 *
 * -	%rip is pointing to sys_sysenter (below).
 * -	%cs and %ss are set to kernel text and stack (data) selectors.
 * -	%rsp is pointing at the lwp's stack
 * -	interrupts have been disabled.
 *
 * Note that we are unable to return both "rvals" to userland with
 * this call, as %edx is used by the sysexit instruction.
 *
 * One final complication in this routine is its interaction with
 * single-stepping in a debugger.  For most of the system call mechanisms, the
 * CPU automatically clears the single-step flag before we enter the kernel.
 * The sysenter mechanism does not clear the flag, so a user single-stepping
 * through a libc routine may suddenly find themselves single-stepping through
 * the kernel.  To detect this, kmdb and trap() both compare the trap %pc to
 * the [brand_]sys_enter addresses on each single-step trap.  If it finds that
 * we have single-stepped to a sysenter entry point, it explicitly clears the
 * flag and executes the sys_sysenter routine.
 *
 * One final complication in this final complication is the fact that we have
 * two different entry points for sysenter: brand_sys_sysenter and sys_sysenter.
 * If we enter at brand_sys_sysenter and start single-stepping through the
 * kernel with kmdb, we will eventually hit the instruction at sys_sysenter.
 * kmdb cannot distinguish between that valid single-step and the undesirable
 * one mentioned above.  To avoid this situation, we simply add a jump over the
 * instruction at sys_sysenter to make it impossible to single-step to it.
 */

	ENTRY_NP(brand_sys_sysenter)
	SWAPGS				/* kernel gsbase */
	ALTENTRY(_brand_sys_sysenter_post_swapgs)

	BRAND_CALLBACK(BRAND_CB_SYSENTER, BRAND_URET_FROM_REG(%rdx))
	/*
	 * Jump over sys_sysenter to allow single-stepping as described
	 * above.
	 */
	jmp	_sys_sysenter_post_swapgs

	ALTENTRY(sys_sysenter)
	SWAPGS				/* kernel gsbase */
	ALTENTRY(_sys_sysenter_post_swapgs)

	movq	%gs:CPU_THREAD, %r15

	movl	$U32CS_SEL, REGOFF_CS(%rsp)
	movl	%ecx, REGOFF_RSP(%rsp)		/* wrapper: %esp -> %ecx */
	movl	%edx, REGOFF_RIP(%rsp)		/* wrapper: %eip -> %edx */
	/*
	 * NOTE: none of the instructions that run before we get here should
	 * clobber bits in (R)FLAGS! This includes the kpti trampoline.
	 */
	pushfq
	popq	%r10
	movl	$UDS_SEL, REGOFF_SS(%rsp)

	/*
	 * Set the interrupt flag before storing the flags to the
	 * flags image on the stack so we can return to user with
	 * interrupts enabled if we return via sys_rtt_syscall32
	 */
	orq	$PS_IE, %r10
	movq	%r10, REGOFF_RFL(%rsp)

	movl	%edi, REGOFF_RDI(%rsp)
	movl	%esi, REGOFF_RSI(%rsp)
	movl	%ebp, REGOFF_RBP(%rsp)
	movl	%ebx, REGOFF_RBX(%rsp)
	movl	%edx, REGOFF_RDX(%rsp)
	movl	%ecx, REGOFF_RCX(%rsp)
	movl	%eax, REGOFF_RAX(%rsp)		/* wrapper: sysc# -> %eax */
	movq	$0, REGOFF_SAVFP(%rsp)
	movq	$0, REGOFF_SAVPC(%rsp)

	/*
	 * Copy these registers here in case we end up stopped with
	 * someone (like, say, /proc) messing with our register state.
	 * We don't -restore- them unless we have to in update_sregs.
	 *
	 * Since userland -can't- change fsbase or gsbase directly,
	 * we don't bother to capture them here.
	 */
	xorl	%ebx, %ebx
	movw	%ds, %bx
	movq	%rbx, REGOFF_DS(%rsp)
	movw	%es, %bx
	movq	%rbx, REGOFF_ES(%rsp)
	movw	%fs, %bx
	movq	%rbx, REGOFF_FS(%rsp)
	movw	%gs, %bx
	movq	%rbx, REGOFF_GS(%rsp)

	/*
	 * If we're trying to use TRAPTRACE though, I take that back: we're
	 * probably debugging some problem in the SWAPGS logic and want to know
	 * what the incoming gsbase was.
	 *
	 * Since we already did SWAPGS, record the KGSBASE.
10695d9d9091SRichard Lowe	 */
10705d9d9091SRichard Lowe#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
10715d9d9091SRichard Lowe	movl	$MSR_AMD_KGSBASE, %ecx
10725d9d9091SRichard Lowe	rdmsr
10735d9d9091SRichard Lowe	movl	%eax, REGOFF_GSBASE(%rsp)
10745d9d9091SRichard Lowe	movl	%edx, REGOFF_GSBASE+4(%rsp)
10755d9d9091SRichard Lowe#endif
10765d9d9091SRichard Lowe
10775d9d9091SRichard Lowe	/*
10785d9d9091SRichard Lowe	 * Application state saved in the regs structure on the stack
10795d9d9091SRichard Lowe	 * %eax is the syscall number
10805d9d9091SRichard Lowe	 * %rsp is the thread's stack, %r15 is curthread
10815d9d9091SRichard Lowe	 * REG_RSP(%rsp) is the user's stack
10825d9d9091SRichard Lowe	 */
10835d9d9091SRichard Lowe
10845d9d9091SRichard Lowe	SYSCALL_TRAPTRACE($TT_SYSENTER)
10855d9d9091SRichard Lowe
10865d9d9091SRichard Lowe	movq	%rsp, %rbp
10875d9d9091SRichard Lowe
10885d9d9091SRichard Lowe	movq	T_LWP(%r15), %r14
10895d9d9091SRichard Lowe	ASSERT_NO_RUPDATE_PENDING(%r14)
10905d9d9091SRichard Lowe
10915d9d9091SRichard Lowe	ENABLE_INTR_FLAGS
10925d9d9091SRichard Lowe
10935d9d9091SRichard Lowe	/*
10945d9d9091SRichard Lowe	 * Catch 64-bit process trying to issue sysenter instruction
10955d9d9091SRichard Lowe	 * on Nocona-based systems.
10965d9d9091SRichard Lowe	 */
10975d9d9091SRichard Lowe	movq	LWP_PROCP(%r14), %rax
10985d9d9091SRichard Lowe	cmpq	$DATAMODEL_ILP32, P_MODEL(%rax)
10995d9d9091SRichard Lowe	je	7f
11005d9d9091SRichard Lowe
11015d9d9091SRichard Lowe	/*
11025d9d9091SRichard Lowe	 * For a non-32-bit process, simulate a #ud, since that's what
11035d9d9091SRichard Lowe	 * native hardware does.  The traptrace entry (above) will
11045d9d9091SRichard Lowe	 * let you know what really happened.
11055d9d9091SRichard Lowe	 */
11065d9d9091SRichard Lowe	movq	$T_ILLINST, REGOFF_TRAPNO(%rsp)
11075d9d9091SRichard Lowe	movq	REGOFF_CS(%rsp), %rdi
11085d9d9091SRichard Lowe	movq	%rdi, REGOFF_ERR(%rsp)
11095d9d9091SRichard Lowe	movq	%rsp, %rdi
11105d9d9091SRichard Lowe	movq	REGOFF_RIP(%rsp), %rsi
11115d9d9091SRichard Lowe	movl	%gs:CPU_ID, %edx
11125d9d9091SRichard Lowe	call	trap
11135d9d9091SRichard Lowe	jmp	_sys_rtt
11145d9d9091SRichard Lowe7:
11155d9d9091SRichard Lowe
11165d9d9091SRichard Lowe	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
11175d9d9091SRichard Lowe	movl	REGOFF_RAX(%rsp), %eax	/* (%rax damaged by mstate calls) */
11185d9d9091SRichard Lowe
11195d9d9091SRichard Lowe	ASSERT_LWPTOREGS(%r14, %rsp)
11205d9d9091SRichard Lowe
11215d9d9091SRichard Lowe	incq	%gs:CPU_STATS_SYS_SYSCALL
11225d9d9091SRichard Lowe
11235d9d9091SRichard Lowe	/*
11245d9d9091SRichard Lowe	 * Make some space for MAXSYSARGS (currently 8) 32-bit args
11255d9d9091SRichard Lowe	 * placed into 64-bit (long) arg slots, plus one 64-bit
11265d9d9091SRichard Lowe	 * (long) arg count, maintaining 16-byte alignment.
11275d9d9091SRichard Lowe	 */
11285d9d9091SRichard Lowe	subq	$SYS_DROP, %rsp
11295d9d9091SRichard Lowe	movb	$LWP_SYS, LWP_STATE(%r14)
11305d9d9091SRichard Lowe	movq	%r15, %rdi
11315d9d9091SRichard Lowe	movq	%rsp, %rsi
11325d9d9091SRichard Lowe	call	syscall_entry
11335d9d9091SRichard Lowe
11345d9d9091SRichard Lowe	/*
11355d9d9091SRichard Lowe	 * Fetch the arguments copied onto the kernel stack and put
11365d9d9091SRichard Lowe	 * them in the right registers to invoke a C-style syscall handler.
11375d9d9091SRichard Lowe	 * %rax contains the handler address.
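	 *
	 * Roughly speaking (this is purely illustrative, not an actual
	 * declaration), the sequence below behaves like:
	 *
	 *	long (*handler)(long, long, long, long, long, long);
	 *	rval = handler(arg0, arg1, arg2, arg3, arg4, arg5);
	 *
	 * with the six zero-extended 32-bit arguments picked up from the
	 * slots syscall_entry laid out at 0(%rsp) .. 0x28(%rsp) and passed
	 * in %rdi, %rsi, %rdx, %rcx, %r8 and %r9, as the AMD64 C calling
	 * convention requires.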
11385d9d9091SRichard Lowe	 */
11395d9d9091SRichard Lowe	movq	%rax, %rbx
11405d9d9091SRichard Lowe	movl	0(%rsp), %edi
11415d9d9091SRichard Lowe	movl	8(%rsp), %esi
11425d9d9091SRichard Lowe	movl	0x10(%rsp), %edx
11435d9d9091SRichard Lowe	movl	0x18(%rsp), %ecx
11445d9d9091SRichard Lowe	movl	0x20(%rsp), %r8d
11455d9d9091SRichard Lowe	movl	0x28(%rsp), %r9d
11465d9d9091SRichard Lowe
11475d9d9091SRichard Lowe	movq	SY_CALLC(%rbx), %rax
11485d9d9091SRichard Lowe	INDIRECT_CALL_REG(rax)
11495d9d9091SRichard Lowe
11505d9d9091SRichard Lowe	movq	%rbp, %rsp	/* pop the args */
11515d9d9091SRichard Lowe
11525d9d9091SRichard Lowe	/*
11535d9d9091SRichard Lowe	 * amd64 syscall handlers -always- return a 64-bit value in %rax.
11545d9d9091SRichard Lowe	 * On the 32-bit kernel, they always return that value in %eax:%edx
11555d9d9091SRichard Lowe	 * as required by the 32-bit ABI.
11565d9d9091SRichard Lowe	 *
11575d9d9091SRichard Lowe	 * Simulate the same behaviour by unconditionally splitting the
11585d9d9091SRichard Lowe	 * return value in the same way.
11595d9d9091SRichard Lowe	 */
11605d9d9091SRichard Lowe	movq	%rax, %r13
11615d9d9091SRichard Lowe	shrq	$32, %r13	/* upper 32-bits into %edx */
11625d9d9091SRichard Lowe	movl	%eax, %r12d	/* lower 32-bits into %eax */
11635d9d9091SRichard Lowe
11645d9d9091SRichard Lowe	/*
11655d9d9091SRichard Lowe	 * Optimistically assume that there's no post-syscall
11665d9d9091SRichard Lowe	 * work to do.  (This is to avoid having to call syscall_mstate()
11675d9d9091SRichard Lowe	 * with interrupts disabled)
11685d9d9091SRichard Lowe	 */
11695d9d9091SRichard Lowe	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
11705d9d9091SRichard Lowe
11715d9d9091SRichard Lowe	/*
11725d9d9091SRichard Lowe	 * We must protect ourselves from being descheduled here;
11735d9d9091SRichard Lowe	 * if we were, and we ended up on another cpu, or another
11745d9d9091SRichard Lowe	 * lwp got in ahead of us, it could change the segment
11755d9d9091SRichard Lowe	 * registers without us noticing before we return to userland.
11765d9d9091SRichard Lowe	 *
11775d9d9091SRichard Lowe	 * This cli is undone in the tr_sysexit trampoline code.
11785d9d9091SRichard Lowe	 */
11795d9d9091SRichard Lowe	cli
11805d9d9091SRichard Lowe	CHECK_POSTSYS_NE(%r15, %r14, %ebx)
11815d9d9091SRichard Lowe	jne	_full_syscall_postsys32
11825d9d9091SRichard Lowe	SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
11835d9d9091SRichard Lowe
11845d9d9091SRichard Lowe	/*
11855d9d9091SRichard Lowe	 * To get back to userland, load up the 32-bit registers and
11865d9d9091SRichard Lowe	 * sysexit back where we came from.
11875d9d9091SRichard Lowe	 */
11885d9d9091SRichard Lowe
11895d9d9091SRichard Lowe	/*
11905d9d9091SRichard Lowe	 * Interrupts will be turned on by the 'sti' executed just before
11915d9d9091SRichard Lowe	 * sysexit.  The following ensures that restoring the user's rflags
11925d9d9091SRichard Lowe	 * doesn't enable interrupts too soon.
11935d9d9091SRichard Lowe	 */
11945d9d9091SRichard Lowe	andq	$_BITNOT(PS_IE), REGOFF_RFL(%rsp)
11955d9d9091SRichard Lowe
11965d9d9091SRichard Lowe	/*
11975d9d9091SRichard Lowe	 * Clobber %r11 as we check CR0.TS.
11985d9d9091SRichard Lowe	 */
11995d9d9091SRichard Lowe	ASSERT_CR0TS_ZERO(%r11)
12005d9d9091SRichard Lowe
12015d9d9091SRichard Lowe	/*
12025d9d9091SRichard Lowe	 * (There's no point in loading up %edx because the sysexit
12035d9d9091SRichard Lowe	 * mechanism smashes it.)
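	 *
	 * (sysexit resumes userland at the %eip held in %edx, and the code
	 * below loads REGOFF_RIP into %edx for exactly that purpose, so
	 * %edx can never carry the second return value on this path.)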
12045d9d9091SRichard Lowe	 */
12055d9d9091SRichard Lowe	movl	%r12d, %eax
12065d9d9091SRichard Lowe	movl	REGOFF_RBX(%rsp), %ebx
12075d9d9091SRichard Lowe	movl	REGOFF_RBP(%rsp), %ebp
12085d9d9091SRichard Lowe	movl	REGOFF_RSI(%rsp), %esi
12095d9d9091SRichard Lowe	movl	REGOFF_RDI(%rsp), %edi
12105d9d9091SRichard Lowe
12115d9d9091SRichard Lowe	movl	REGOFF_RIP(%rsp), %edx	/* sysexit: %edx -> %eip */
12125d9d9091SRichard Lowe	pushq	REGOFF_RFL(%rsp)
12135d9d9091SRichard Lowe	popfq
12145d9d9091SRichard Lowe	movl	REGOFF_RSP(%rsp), %ecx	/* sysexit: %ecx -> %esp */
12155d9d9091SRichard Lowe	ALTENTRY(sys_sysenter_swapgs_sysexit)
12165d9d9091SRichard Lowe	call	x86_md_clear
12175d9d9091SRichard Lowe	jmp	tr_sysexit
12185d9d9091SRichard Lowe	SET_SIZE(sys_sysenter_swapgs_sysexit)
12195d9d9091SRichard Lowe	SET_SIZE(sys_sysenter)
12205d9d9091SRichard Lowe	SET_SIZE(_sys_sysenter_post_swapgs)
12215d9d9091SRichard Lowe	SET_SIZE(brand_sys_sysenter)
12225d9d9091SRichard Lowe
12235d9d9091SRichard Lowe/*
12245d9d9091SRichard Lowe * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
12255d9d9091SRichard Lowe * the generic i386 libc to do system calls. We do a small amount of setup
12265d9d9091SRichard Lowe * before jumping into the existing sys_syscall32 path.
12275d9d9091SRichard Lowe */
12285d9d9091SRichard Lowe
12295d9d9091SRichard Lowe	ENTRY_NP(brand_sys_syscall_int)
12305d9d9091SRichard Lowe	SWAPGS				/* kernel gsbase */
12315d9d9091SRichard Lowe	XPV_TRAP_POP
12325d9d9091SRichard Lowe	call	smap_enable
12335d9d9091SRichard Lowe	BRAND_CALLBACK(BRAND_CB_INT91, BRAND_URET_FROM_INTR_STACK())
12345d9d9091SRichard Lowe	jmp	nopop_syscall_int
12355d9d9091SRichard Lowe
12365d9d9091SRichard Lowe	ALTENTRY(sys_syscall_int)
12375d9d9091SRichard Lowe	SWAPGS				/* kernel gsbase */
12385d9d9091SRichard Lowe	XPV_TRAP_POP
12395d9d9091SRichard Lowe	call	smap_enable
12405d9d9091SRichard Lowe
12415d9d9091SRichard Lowenopop_syscall_int:
12425d9d9091SRichard Lowe	movq	%gs:CPU_THREAD, %r15
12435d9d9091SRichard Lowe	movq	T_STACK(%r15), %rsp
12445d9d9091SRichard Lowe	movl	%eax, %eax
12455d9d9091SRichard Lowe	/*
12465d9d9091SRichard Lowe	 * Set t_post_sys on this thread to force ourselves out via the slow
12475d9d9091SRichard Lowe	 * path. It might be possible at some later date to optimize this out
12485d9d9091SRichard Lowe	 * and use a faster return mechanism.
12495d9d9091SRichard Lowe	 */
12505d9d9091SRichard Lowe	movb	$1, T_POST_SYS(%r15)
12515d9d9091SRichard Lowe	CLEAN_CS
12525d9d9091SRichard Lowe	jmp	_syscall32_save
12535d9d9091SRichard Lowe	/*
12545d9d9091SRichard Lowe	 * There should be no instructions between this label and SWAPGS/IRET
12555d9d9091SRichard Lowe	 * or we could end up breaking branded zone support. See the usage of
12565d9d9091SRichard Lowe	 * this label in lx_brand_int80_callback and sn1_brand_int91_callback
12575d9d9091SRichard Lowe	 * for examples.
12585d9d9091SRichard Lowe	 *
12595d9d9091SRichard Lowe	 * We want to swapgs to maintain the invariant that all entries into
12605d9d9091SRichard Lowe	 * tr_iret_user are done on the user gsbase.
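	 *
	 * The SWAPGS in sys_sysint_swapgs_iret below provides exactly that:
	 * by the time we jump to tr_iret_user we have switched back to the
	 * user gsbase, including when one of the brand callbacks mentioned
	 * above jumps back to this label.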
12615d9d9091SRichard Lowe	 */
12625d9d9091SRichard Lowe	ALTENTRY(sys_sysint_swapgs_iret)
12635d9d9091SRichard Lowe	call	x86_md_clear
12645d9d9091SRichard Lowe	SWAPGS
12655d9d9091SRichard Lowe	jmp	tr_iret_user
12665d9d9091SRichard Lowe	/*NOTREACHED*/
12675d9d9091SRichard Lowe	SET_SIZE(sys_sysint_swapgs_iret)
12685d9d9091SRichard Lowe	SET_SIZE(sys_syscall_int)
12695d9d9091SRichard Lowe	SET_SIZE(brand_sys_syscall_int)
12705d9d9091SRichard Lowe
12715d9d9091SRichard Lowe/*
12725d9d9091SRichard Lowe * Legacy 32-bit applications and old libc implementations do lcalls;
12735d9d9091SRichard Lowe * we should never get here because the LDT entry containing the syscall
12745d9d9091SRichard Lowe * segment descriptor has the "segment present" bit cleared, which means
12755d9d9091SRichard Lowe * we end up processing those system calls in trap() via a not-present trap.
12765d9d9091SRichard Lowe *
12775d9d9091SRichard Lowe * We do it this way because a call gate unhelpfully does -nothing- to the
12785d9d9091SRichard Lowe * interrupt flag bit, so an interrupt can run us just after the lcall
12795d9d9091SRichard Lowe * completes, but just before the swapgs takes effect. Thus the INTR_PUSH and
12805d9d9091SRichard Lowe * INTR_POP paths would have to be slightly more complex to dance around
12815d9d9091SRichard Lowe * this problem, and end up depending explicitly on the first
12825d9d9091SRichard Lowe * instruction of this handler being either swapgs or cli.
12835d9d9091SRichard Lowe */
12845d9d9091SRichard Lowe
12855d9d9091SRichard Lowe	ENTRY_NP(sys_lcall32)
12865d9d9091SRichard Lowe	SWAPGS				/* kernel gsbase */
12875d9d9091SRichard Lowe	pushq	$0
12885d9d9091SRichard Lowe	pushq	%rbp
12895d9d9091SRichard Lowe	movq	%rsp, %rbp
12905d9d9091SRichard Lowe	leaq	__lcall_panic_str(%rip), %rdi
12915d9d9091SRichard Lowe	xorl	%eax, %eax
12925d9d9091SRichard Lowe	call	panic
12935d9d9091SRichard Lowe	SET_SIZE(sys_lcall32)
12945d9d9091SRichard Lowe
12955d9d9091SRichard Lowe__lcall_panic_str:
12965d9d9091SRichard Lowe	.string	"sys_lcall32: shouldn't be here!"
12975d9d9091SRichard Lowe
12985d9d9091SRichard Lowe/*
12995d9d9091SRichard Lowe * Declare a uintptr_t which covers the entire pc range of syscall
13005d9d9091SRichard Lowe * handlers for the stack walkers that need this.
13015d9d9091SRichard Lowe */
13025d9d9091SRichard Lowe	.align	CPTRSIZE
13035d9d9091SRichard Lowe	.globl	_allsyscalls_size
13045d9d9091SRichard Lowe	.type	_allsyscalls_size, @object
13055d9d9091SRichard Lowe_allsyscalls_size:
13065d9d9091SRichard Lowe	.NWORD	. - _allsyscalls
13075d9d9091SRichard Lowe	SET_SIZE(_allsyscalls_size)
13085d9d9091SRichard Lowe
13095d9d9091SRichard Lowe/*
13105d9d9091SRichard Lowe * These are the thread context handlers for lwps using sysenter/sysexit.
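 *
 * sep_save runs when such an lwp is switched off a cpu and sep_restore runs
 * when one is switched back on; between them they keep the SYSENTER stack
 * pointer MSR (MSR_INTC_SEP_ESP) pointing at the kernel stack of whichever
 * sysenter-using lwp is currently on the cpu, or NULL when none is.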
13115d9d9091SRichard Lowe */
13125d9d9091SRichard Lowe
13135d9d9091SRichard Lowe	/*
13145d9d9091SRichard Lowe	 * setting this value to zero as we switch away causes the
13155d9d9091SRichard Lowe	 * stack-pointer-on-sysenter to be NULL, ensuring that we
13165d9d9091SRichard Lowe	 * don't silently corrupt another (preempted) thread stack
13175d9d9091SRichard Lowe	 * when running an lwp that (somehow) didn't get sep_restore'd
13185d9d9091SRichard Lowe	 */
13195d9d9091SRichard Lowe	ENTRY_NP(sep_save)
13205d9d9091SRichard Lowe	xorl	%edx, %edx
13215d9d9091SRichard Lowe	xorl	%eax, %eax
13225d9d9091SRichard Lowe	movl	$MSR_INTC_SEP_ESP, %ecx
13235d9d9091SRichard Lowe	wrmsr
13245d9d9091SRichard Lowe	ret
13255d9d9091SRichard Lowe	SET_SIZE(sep_save)
13265d9d9091SRichard Lowe
13275d9d9091SRichard Lowe	/*
13285d9d9091SRichard Lowe	 * Update the kernel stack pointer as we resume onto this cpu.
13295d9d9091SRichard Lowe	 */
13305d9d9091SRichard Lowe	ENTRY_NP(sep_restore)
13315d9d9091SRichard Lowe	movq	%rdi, %rdx
13325d9d9091SRichard Lowe	shrq	$32, %rdx
13335d9d9091SRichard Lowe	movl	%edi, %eax
13345d9d9091SRichard Lowe	movl	$MSR_INTC_SEP_ESP, %ecx
13355d9d9091SRichard Lowe	wrmsr
13365d9d9091SRichard Lowe	ret
13375d9d9091SRichard Lowe	SET_SIZE(sep_restore)
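
/*
 * For reference, a libc sysenter stub would be expected to look something
 * like the fragment below.  This is purely illustrative -- it is not the
 * actual libc source, and "SYS_num" simply stands for whatever system call
 * number is wanted -- but it matches the "wrapper" register convention noted
 * in sys_sysenter above: the system call number in %eax, the user stack
 * pointer in %ecx and the return %eip in %edx, with the arguments themselves
 * left on the user stack for syscall_entry to copy in.
 *
 *	movl	$SYS_num, %eax		/ system call number
 *	movl	%esp, %ecx		/ saved user stack pointer
 *	movl	$1f, %edx		/ where sysexit should resume
 *	sysenter
 * 1:
 *	...				/ only %eax carries a return value
 */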