; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope -check-prefix=MUBUF %s
; RUN: llc -global-isel -mattr=+enable-flat-scratch -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope -check-prefix=FLATSCR %s

; Test end-to-end codegen for outgoing arguments passed on the
; stack. This test is likely redundant when all DAG and GlobalISel
; tests are unified.
;
; Both llc invocations above use -global-isel on gfx900. The second one adds
; -mattr=+enable-flat-scratch and is verified under its own check prefix, so
; every function below carries two independent blocks of assertions.

; External callees used by the tests. Both are declared with attribute
; group #0 (defined at the bottom of this file).
declare hidden void @external_void_func_v16i32_v16i32_v4i32(<16 x i32>, <16 x i32>, <4 x i32>) #0
declare hidden void @external_void_func_byval(ptr addrspace(5) byval([16 x i32])) #0

; Kernel caller passing two undef <16 x i32> vectors and the constant vector
; <9, 10, 11, 12>. The assertions show the four constants being stored
; relative to s32 at offsets 4, 8, 12 and 16 before the call; no instructions
; are expected for the undef operands.
define amdgpu_kernel void @kernel_caller_stack() {
; MUBUF-LABEL: kernel_caller_stack:
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; MUBUF-NEXT: s_add_u32 s0, s0, s17
; MUBUF-NEXT: s_mov_b32 s32, 0
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: v_mov_b32_e32 v0, 9
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; MUBUF-NEXT: v_mov_b32_e32 v0, 10
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
; MUBUF-NEXT: v_mov_b32_e32 v0, 11
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
; MUBUF-NEXT: v_mov_b32_e32 v0, 12
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
; MUBUF-NEXT: s_endpgm
;
; FLATSCR-LABEL: kernel_caller_stack:
; FLATSCR: ; %bb.0:
; FLATSCR-NEXT: s_mov_b32 s32, 0
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; FLATSCR-NEXT: s_add_u32 s0, s32, 4
; FLATSCR-NEXT: v_mov_b32_e32 v0, 9
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_add_u32 s0, s32, 8
; FLATSCR-NEXT: v_mov_b32_e32 v0, 10
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_add_u32 s0, s32, 12
; FLATSCR-NEXT: v_mov_b32_e32 v0, 11
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_add_u32 s2, s32, 16
; FLATSCR-NEXT: v_mov_b32_e32 v0, 12
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
; FLATSCR-NEXT: scratch_store_dword off, v0, s2
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
; FLATSCR-NEXT: s_endpgm
  call void @external_void_func_v16i32_v16i32_v4i32(<16 x i32> undef, <16 x i32> undef, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
  ret void
}

; Kernel caller that zero-fills a private (addrspace 5) [16 x i32] alloca via
; llvm.memset and then passes it as a byval argument. The assertions show the
; zero stores, then 16 dwords being reloaded and stored again relative to s32
; (presumably the outgoing byval copy) before the call.
; NOTE(review): [16 x i32] is 64 bytes, but the memset length below is 128 and
; the assertions accordingly contain stores up to offset 124. Confirm whether
; writing past the alloca is intended; changing it requires regenerating the
; assertions with update_llc_test_checks.py.
define amdgpu_kernel void @kernel_caller_byval() {
; MUBUF-LABEL: kernel_caller_byval:
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; MUBUF-NEXT: s_add_u32 s0, s0, s17
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:20
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:24
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:28
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:32
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:36
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:40
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:44
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:48
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:52
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:56
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:60
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:68
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:72
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:76
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:80
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:84
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:88
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:92
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:96
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:100
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:104
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:108
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:112
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:116
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:120
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:124
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4
; MUBUF-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:8
; MUBUF-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:12
; MUBUF-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:16
; MUBUF-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:20
; MUBUF-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:24
; MUBUF-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:28
; MUBUF-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:32
; MUBUF-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:36
; MUBUF-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:40
; MUBUF-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:44
; MUBUF-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:48
; MUBUF-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:52
; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:56
; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:60
; MUBUF-NEXT: s_movk_i32 s32, 0x1400
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
; MUBUF-NEXT: s_waitcnt vmcnt(15)
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32
; MUBUF-NEXT: s_waitcnt vmcnt(15)
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
; MUBUF-NEXT: s_waitcnt vmcnt(15)
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8
; MUBUF-NEXT: s_waitcnt vmcnt(15)
; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12
; MUBUF-NEXT: s_waitcnt vmcnt(15)
; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16
; MUBUF-NEXT: s_waitcnt vmcnt(15)
; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20
; MUBUF-NEXT: s_waitcnt vmcnt(15)
; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24
; MUBUF-NEXT: s_waitcnt vmcnt(15)
; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28
; MUBUF-NEXT: s_waitcnt vmcnt(15)
; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32
; MUBUF-NEXT: s_waitcnt vmcnt(15)
; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36
; MUBUF-NEXT: s_waitcnt vmcnt(15)
; MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40
; MUBUF-NEXT: s_waitcnt vmcnt(15)
; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44
; MUBUF-NEXT: s_waitcnt vmcnt(15)
; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48
; MUBUF-NEXT: s_waitcnt vmcnt(15)
; MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52
; MUBUF-NEXT: s_waitcnt vmcnt(15)
; MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56
; MUBUF-NEXT: s_waitcnt vmcnt(15)
; MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
; MUBUF-NEXT: s_endpgm
;
; FLATSCR-LABEL: kernel_caller_byval:
; FLATSCR: ; %bb.0:
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:8
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:24
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:32
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:40
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:48
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:56
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:64
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:72
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:80
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:88
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:96
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:104
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:112
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:120
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0
; FLATSCR-NEXT: s_nop 0
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:8
; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:16
; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s0 offset:24
; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:32
; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:40
; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s0 offset:48
; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s0 offset:56
; FLATSCR-NEXT: s_movk_i32 s32, 0x50
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12
; FLATSCR-NEXT: s_add_u32 s2, s32, 8
; FLATSCR-NEXT: s_add_u32 s3, s32, 16
; FLATSCR-NEXT: s_add_u32 s4, s32, 24
; FLATSCR-NEXT: s_add_u32 s5, s32, 32
; FLATSCR-NEXT: s_add_u32 s6, s32, 40
; FLATSCR-NEXT: s_add_u32 s7, s32, 48
; FLATSCR-NEXT: s_add_u32 s8, s32, 56
; FLATSCR-NEXT: s_waitcnt vmcnt(7)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s32
; FLATSCR-NEXT: s_waitcnt vmcnt(7)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2
; FLATSCR-NEXT: s_waitcnt vmcnt(7)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s3
; FLATSCR-NEXT: s_waitcnt vmcnt(7)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[6:7], s4
; FLATSCR-NEXT: s_waitcnt vmcnt(7)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[8:9], s5
; FLATSCR-NEXT: s_waitcnt vmcnt(7)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[10:11], s6
; FLATSCR-NEXT: s_waitcnt vmcnt(7)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[12:13], s7
; FLATSCR-NEXT: s_waitcnt vmcnt(7)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[14:15], s8
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
; FLATSCR-NEXT: s_endpgm
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  call void @llvm.memset.p5.i32(ptr addrspace(5) align 4 %alloca, i8 0, i32 128, i1 false)
  call void @external_void_func_byval(ptr addrspace(5) byval([16 x i32]) %alloca)
  ret void
}

; Same call as @kernel_caller_stack, but made from a non-kernel function. In
; addition to the four stores relative to s32, the assertions show s33 being
; saved and restored around the call, with s30, s31 and the previous s33 value
; kept in lanes 0, 1 and 2 of v40, which is itself spilled and reloaded.
define void @func_caller_stack() {
; MUBUF-LABEL: func_caller_stack:
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT: s_mov_b32 s4, s33
; MUBUF-NEXT: s_mov_b32 s33, s32
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
; MUBUF-NEXT: s_addk_i32 s32, 0x400
; MUBUF-NEXT: v_mov_b32_e32 v0, 9
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; MUBUF-NEXT: v_mov_b32_e32 v0, 10
; MUBUF-NEXT: v_writelane_b32 v40, s4, 2
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
; MUBUF-NEXT: v_mov_b32_e32 v0, 11
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
; MUBUF-NEXT: v_mov_b32_e32 v0, 12
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
; MUBUF-NEXT: s_mov_b32 s33, s4
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: func_caller_stack:
; FLATSCR: ; %bb.0:
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: s_mov_b32 s0, s33
; FLATSCR-NEXT: s_mov_b32 s33, s32
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2
; FLATSCR-NEXT: s_add_u32 s0, s32, 4
; FLATSCR-NEXT: v_mov_b32_e32 v0, 9
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_add_u32 s0, s32, 8
; FLATSCR-NEXT: v_mov_b32_e32 v0, 10
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_add_u32 s0, s32, 12
; FLATSCR-NEXT: v_mov_b32_e32 v0, 11
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_add_u32 s0, s32, 16
; FLATSCR-NEXT: v_mov_b32_e32 v0, 12
; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
; FLATSCR-NEXT: s_mov_b32 s33, s0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
  call void @external_void_func_v16i32_v16i32_v4i32(<16 x i32> undef, <16 x i32> undef, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
  ret void
}

; Non-kernel caller that forwards its incoming private pointer %argptr as a
; byval([16 x i32]) argument. The assertions show 16 dwords being loaded
; starting at the address held in v0 and stored relative to s32 (offsets 0
; through 60) before the call, plus the same s33/v40 save and restore
; sequence as in @func_caller_stack.
define void @func_caller_byval(ptr addrspace(5) %argptr) {
; MUBUF-LABEL: func_caller_byval:
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT: s_mov_b32 s4, s33
; MUBUF-NEXT: s_mov_b32 s33, s32
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
; MUBUF-NEXT: s_addk_i32 s32, 0x400
; MUBUF-NEXT: v_writelane_b32 v40, s4, 2
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:8
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:12
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:16
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:20
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:24
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:28
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:24
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:32
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:36
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:32
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:40
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:44
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:40
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:44
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:48
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:52
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:48
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:56
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:60
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
; MUBUF-NEXT: s_mov_b32 s33, s4
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: func_caller_byval:
; FLATSCR: ; %bb.0:
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: s_mov_b32 s0, s33
; FLATSCR-NEXT: s_mov_b32 s33, s32
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
; FLATSCR-NEXT: v_add_u32_e32 v3, 8, v0
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2
; FLATSCR-NEXT: s_add_u32 s0, s32, 8
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
; FLATSCR-NEXT: s_add_u32 s2, s32, 56
; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v3, off
; FLATSCR-NEXT: v_add_u32_e32 v3, 16, v0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v3, off
; FLATSCR-NEXT: s_add_u32 s0, s32, 16
; FLATSCR-NEXT: v_add_u32_e32 v3, 24, v0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v3, off
; FLATSCR-NEXT: s_add_u32 s0, s32, 24
; FLATSCR-NEXT: v_add_u32_e32 v3, 32, v0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v3, off
; FLATSCR-NEXT: s_add_u32 s0, s32, 32
; FLATSCR-NEXT: v_add_u32_e32 v3, 40, v0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v3, off
; FLATSCR-NEXT: s_add_u32 s0, s32, 40
; FLATSCR-NEXT: v_add_u32_e32 v3, 48, v0
; FLATSCR-NEXT: v_add_u32_e32 v0, 56, v0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v3, off
; FLATSCR-NEXT: s_add_u32 s0, s32, 48
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], v0, off
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s2
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
; FLATSCR-NEXT: s_mov_b32 s33, s0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
  call void @external_void_func_byval(ptr addrspace(5) byval([16 x i32]) %argptr)
  ret void
}

; memset intrinsic used by @kernel_caller_byval to zero-fill its alloca.
declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg) #1

; #0 is applied to the two external callees; #1 to the memset declaration.
attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
attributes #1 = { argmemonly nofree nounwind willreturn writeonly }