; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=MUBUF,DEFAULTSIZE %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=MUBUF,DEFAULTSIZE-V5 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=MUBUF,DEFAULTSIZE-V5 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 | FileCheck -check-prefixes=MUBUF,ASSUME1024 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch | FileCheck -check-prefixes=FLATSCR,DEFAULTSIZE %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch -amdgpu-assume-dynamic-stack-object-size=1024 | FileCheck -check-prefixes=FLATSCR,ASSUME1024 %s

; FIXME: Generated test checks do not check metadata at the end of the
; function, so this also includes manually added checks.

; Test that we can select a statically sized alloca outside of the
; entry block.
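;
; Each test below follows roughly this C-like pattern (an illustrative sketch
; only; the names, types, and the workitem_id_x() helper are hypothetical and
; not part of the test; the align64 variants use a 64-byte-aligned buffer and
; a single condition):
;
;   void test(int *out, int cond0, int cond1, int in) {
;     if (cond0 == 0) {           // bb.0, reached uniformly in the kernel cases
;       int buf[16];              // statically sized alloca in a non-entry block
;       if (cond1 == 0) {         // bb.1: the alloca is used outside bb.0
;         buf[0] = 0;
;         buf[1] = 1;
;         *out = buf[in] + workitem_id_x();
;       }
;     }
;     /* bb.2: unconditional volatile store */
;   }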

; FIXME: FunctionLoweringInfo unhelpfully doesn't preserve an
; alignment less than the stack alignment.
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(ptr addrspace(1) %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) #1 {
; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_add_u32 s0, s0, s9
; MUBUF-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: s_mov_b32 s33, 0
; MUBUF-NEXT: s_movk_i32 s32, 0x400
; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
; MUBUF-NEXT: s_cmp_lg_u32 s8, 0
; MUBUF-NEXT: s_cbranch_scc1 .LBB0_3
; MUBUF-NEXT: ; %bb.1: ; %bb.0
; MUBUF-NEXT: s_cmp_lg_u32 s9, 0
; MUBUF-NEXT: s_cbranch_scc1 .LBB0_3
; MUBUF-NEXT: ; %bb.2: ; %bb.1
; MUBUF-NEXT: s_mov_b32 s6, s32
; MUBUF-NEXT: v_mov_b32_e32 v1, 0
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
; MUBUF-NEXT: s_lshl_b32 s7, s10, 2
; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s6
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
; MUBUF-NEXT: s_add_i32 s6, s6, s7
; MUBUF-NEXT: v_mov_b32_e32 v2, s6
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_add_u32_e32 v0, v2, v0
; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
; MUBUF-NEXT: global_store_dword v1, v0, s[4:5]
; MUBUF-NEXT: .LBB0_3: ; %bb.2
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: global_store_dword v[0:1], v0, off
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_endpgm
;
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT: s_mov_b32 s33, 0
; FLATSCR-NEXT: s_mov_b32 s32, 16
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: s_cmp_lg_u32 s4, 0
; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_3
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
; FLATSCR-NEXT: s_cmp_lg_u32 s5, 0
; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_3
; FLATSCR-NEXT: ; %bb.2: ; %bb.1
; FLATSCR-NEXT: s_mov_b32 s2, s32
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
; FLATSCR-NEXT: s_lshl_b32 s3, s6, 2
; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2
; FLATSCR-NEXT: s_add_i32 s2, s2, s3
; FLATSCR-NEXT: scratch_load_dword v2, off, s2
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v0, v2, v0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: global_store_dword v1, v0, s[0:1]
; FLATSCR-NEXT: .LBB0_3: ; %bb.2
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_endpgm

entry:
  %cond0 = icmp eq i32 %arg.cond0, 0
  br i1 %cond0, label %bb.0, label %bb.2

bb.0:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep1 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
  %cond1 = icmp eq i32 %arg.cond1, 0
  br i1 %cond1, label %bb.1, label %bb.2

bb.1:
  ; Use the alloca outside of the defining block.
  store i32 0, ptr addrspace(5) %alloca
  store i32 1, ptr addrspace(5) %gep1
  %gep2 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %in
  %load = load i32, ptr addrspace(5) %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, ptr addrspace(1) %out
  br label %bb.2

bb.2:
  store volatile i32 0, ptr addrspace(1) undef
  ret void
}
; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
; DEFAULTSIZE: ; ScratchSize: 4112
; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 16
; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
; DEFAULTSIZE-V5: ; ScratchSize: 16

; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
; ASSUME1024: ; ScratchSize: 1040

define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; MUBUF-NEXT: s_add_u32 s0, s0, s17
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: s_mov_b32 s33, 0
; MUBUF-NEXT: s_movk_i32 s32, 0x1000
; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
; MUBUF-NEXT: s_cmp_lg_u32 s4, 0
; MUBUF-NEXT: s_cbranch_scc1 .LBB1_2
; MUBUF-NEXT: ; %bb.1: ; %bb.0
; MUBUF-NEXT: s_add_i32 s4, s32, 0xfff
; MUBUF-NEXT: s_and_b32 s4, s4, 0xfffff000
; MUBUF-NEXT: s_lshl_b32 s5, s5, 2
; MUBUF-NEXT: s_add_i32 s32, s4, 0x1000
; MUBUF-NEXT: v_mov_b32_e32 v1, 0
; MUBUF-NEXT: v_mov_b32_e32 v2, s4
; MUBUF-NEXT: v_mov_b32_e32 v3, 1
; MUBUF-NEXT: s_add_i32 s4, s4, s5
; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; MUBUF-NEXT: v_mov_b32_e32 v2, s4
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_add_u32_e32 v0, v2, v0
; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
; MUBUF-NEXT: global_store_dword v1, v0, s[4:5]
; MUBUF-NEXT: .LBB1_2: ; %bb.1
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: global_store_dword v[0:1], v0, off
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_endpgm
;
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; FLATSCR-NEXT: s_mov_b32 s33, 0
; FLATSCR-NEXT: s_mov_b32 s32, 64
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: s_cmp_lg_u32 s0, 0
; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_2
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
; FLATSCR-NEXT: s_add_i32 s0, s32, 0xfff
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
; FLATSCR-NEXT: s_and_b32 s0, s0, 0xfffff000
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
; FLATSCR-NEXT: s_lshl_b32 s1, s1, 2
; FLATSCR-NEXT: s_add_i32 s32, s0, 0x1000
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
; FLATSCR-NEXT: s_add_i32 s0, s0, s1
; FLATSCR-NEXT: scratch_load_dword v2, off, s0
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v0, v2, v0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: global_store_dword v1, v0, s[0:1]
; FLATSCR-NEXT: .LBB1_2: ; %bb.1
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_endpgm
entry:
  %cond = icmp eq i32 %arg.cond, 0
  br i1 %cond, label %bb.0, label %bb.1

bb.0:
  %alloca = alloca [16 x i32], align 64, addrspace(5)
  %gep1 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
  store i32 0, ptr addrspace(5) %alloca
  store i32 1, ptr addrspace(5) %gep1
  %gep2 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %in
  %load = load i32, ptr addrspace(5) %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, ptr addrspace(1) %out
  br label %bb.1

bb.1:
  store volatile i32 0, ptr addrspace(1) undef
  ret void
}

; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
; DEFAULTSIZE: ; ScratchSize: 4160
; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 64
; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
; DEFAULTSIZE-V5: ; ScratchSize: 64

; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
; ASSUME1024: ; ScratchSize: 1088


define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
; MUBUF-LABEL: func_non_entry_block_static_alloca_align4:
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT: s_mov_b32 s7, s33
; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; MUBUF-NEXT: s_mov_b32 s33, s32
; MUBUF-NEXT: s_addk_i32 s32, 0x400
; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc
; MUBUF-NEXT: s_cbranch_execz .LBB2_3
; MUBUF-NEXT: ; %bb.1: ; %bb.0
; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; MUBUF-NEXT: s_and_b64 exec, exec, vcc
; MUBUF-NEXT: s_cbranch_execz .LBB2_3
; MUBUF-NEXT: ; %bb.2: ; %bb.1
; MUBUF-NEXT: s_mov_b32 s6, s32
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
; MUBUF-NEXT: global_store_dword v[0:1], v2, off
; MUBUF-NEXT: .LBB2_3: ; %bb.2
; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5]
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: global_store_dword v[0:1], v0, off
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: s_mov_b32 s33, s7
; MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: s_mov_b32 s3, s33
; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; FLATSCR-NEXT: s_mov_b32 s33, s32
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT: s_cbranch_execz .LBB2_3
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; FLATSCR-NEXT: s_and_b64 exec, exec, vcc
; FLATSCR-NEXT: s_cbranch_execz .LBB2_3
; FLATSCR-NEXT: ; %bb.2: ; %bb.1
; FLATSCR-NEXT: s_mov_b32 s2, s32
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
; FLATSCR-NEXT: v_mov_b32_e32 v3, 1
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2
; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s2
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; FLATSCR-NEXT: .LBB2_3: ; %bb.2
; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1]
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: s_mov_b32 s33, s3
; FLATSCR-NEXT: s_setpc_b64 s[30:31]

entry:
  %cond0 = icmp eq i32 %arg.cond0, 0
  br i1 %cond0, label %bb.0, label %bb.2

bb.0:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep1 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
  %cond1 = icmp eq i32 %arg.cond1, 0
  br i1 %cond1, label %bb.1, label %bb.2

bb.1:
  ; Use the alloca outside of the defining block.
  store i32 0, ptr addrspace(5) %alloca
  store i32 1, ptr addrspace(5) %gep1
  %gep2 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %in
  %load = load i32, ptr addrspace(5) %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, ptr addrspace(1) %out
  br label %bb.2

bb.2:
  store volatile i32 0, ptr addrspace(1) undef
  ret void
}

define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
; MUBUF-LABEL: func_non_entry_block_static_alloca_align64:
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT: s_mov_b32 s7, s33
; MUBUF-NEXT: s_add_i32 s33, s32, 0xfc0
; MUBUF-NEXT: s_mov_b32 s8, s34
; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; MUBUF-NEXT: s_and_b32 s33, s33, 0xfffff000
; MUBUF-NEXT: s_mov_b32 s34, s32
; MUBUF-NEXT: s_addk_i32 s32, 0x2000
; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc
; MUBUF-NEXT: s_cbranch_execz .LBB3_2
; MUBUF-NEXT: ; %bb.1: ; %bb.0
; MUBUF-NEXT: s_add_i32 s6, s32, 0xfff
; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
; MUBUF-NEXT: v_mov_b32_e32 v4, s6
; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
; MUBUF-NEXT: v_lshl_add_u32 v2, v3, 2, s6
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
; MUBUF-NEXT: global_store_dword v[0:1], v2, off
; MUBUF-NEXT: .LBB3_2: ; %bb.1
; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5]
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: global_store_dword v[0:1], v0, off
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_mov_b32 s32, s34
; MUBUF-NEXT: s_mov_b32 s34, s8
; MUBUF-NEXT: s_mov_b32 s33, s7
; MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align64:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: s_mov_b32 s3, s33
; FLATSCR-NEXT: s_add_i32 s33, s32, 63
; FLATSCR-NEXT: s_mov_b32 s4, s34
; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; FLATSCR-NEXT: s_andn2_b32 s33, s33, 63
; FLATSCR-NEXT: s_mov_b32 s34, s32
; FLATSCR-NEXT: s_addk_i32 s32, 0x80
; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT: s_cbranch_execz .LBB3_2
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
; FLATSCR-NEXT: s_add_i32 s2, s32, 0xfff
; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
; FLATSCR-NEXT: v_mov_b32_e32 v4, 0
; FLATSCR-NEXT: v_mov_b32_e32 v5, 1
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s2
; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
; FLATSCR-NEXT: .LBB3_2: ; %bb.1
; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1]
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_mov_b32 s32, s34
; FLATSCR-NEXT: s_mov_b32 s34, s4
; FLATSCR-NEXT: s_mov_b32 s33, s3
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
  %cond = icmp eq i32 %arg.cond, 0
  br i1 %cond, label %bb.0, label %bb.1

bb.0:
  %alloca = alloca [16 x i32], align 64, addrspace(5)
  %gep1 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
  store i32 0, ptr addrspace(5) %alloca
  store i32 1, ptr addrspace(5) %gep1
  %gep2 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %in
  %load = load i32, ptr addrspace(5) %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, ptr addrspace(1) %out
  br label %bb.1

bb.1:
  store volatile i32 0, ptr addrspace(1) undef
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ASSUME1024: {{.*}}
; DEFAULTSIZE: {{.*}}
; DEFAULTSIZE-V5: {{.*}}