; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,G_GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,G_GFX10 %s

; Test case looks at the allocated offset of @used_by_both. It's at zero when
; allocated by itself, but at 8 when allocated in combination with the double.
; Redundantly also checks LDSByteSize.
@used_by_both = addrspace(3) global i32 undef
@used_by_kernel = addrspace(3) global i32 undef
@used_by_function = addrspace(3) global double undef

; kernel that calls no functions and uses an LDS variable allocates only that
; variable, so accesses are at offset 0 and LDSByteSize is 4
define amdgpu_kernel void @nocall_ideal() {
; CHECK-LABEL: nocall_ideal:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    ds_write_b32 v0, v0
; CHECK-NEXT:    s_endpgm
  store i32 0, ptr addrspace(3) @used_by_kernel
  ret void
}
; CHECK: ; LDSByteSize: 4 bytes

; Non-kernel function that stores to both @used_by_both and @used_by_function.
; The expected output writes the i32 at offset 8 and the double at offset 0.
define void @nonkernel() {
; GFX9-LABEL: nonkernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    ds_write_b32 v0, v0 offset:8
; GFX9-NEXT:    ds_write_b64 v0, v[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: nonkernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    ds_write_b32 v0, v0 offset:8
; GFX10-NEXT:    ds_write_b64 v0, v[0:1]
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; G_GFX9-LABEL: nonkernel:
; G_GFX9:       ; %bb.0:
; G_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; G_GFX9-NEXT:    v_mov_b32_e32 v2, 0
; G_GFX9-NEXT:    v_mov_b32_e32 v3, 8
; G_GFX9-NEXT:    v_mov_b32_e32 v0, 0
; G_GFX9-NEXT:    v_mov_b32_e32 v1, 0
; G_GFX9-NEXT:    ds_write_b32 v3, v2
; G_GFX9-NEXT:    ds_write_b64 v2, v[0:1]
; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; G_GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; G_GFX10-LABEL: nonkernel:
; G_GFX10:       ; %bb.0:
; G_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; G_GFX10-NEXT:    v_mov_b32_e32 v2, 0
; G_GFX10-NEXT:    v_mov_b32_e32 v3, 8
; G_GFX10-NEXT:    v_mov_b32_e32 v0, 0
; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0
; G_GFX10-NEXT:    ds_write_b32 v3, v2
; G_GFX10-NEXT:    ds_write_b64 v2, v[0:1]
; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT:    s_setpc_b64 s[30:31]
  store i32 0, ptr addrspace(3) @used_by_both
  store double 0.0, ptr addrspace(3) @used_by_function
  ret void
}

; Needs to allocate both variables, store to used_by_both is at sizeof(double)
define amdgpu_kernel void @withcall() {
; GFX9-LABEL: withcall:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s22, -1
; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
; GFX9-NEXT:    s_add_u32 s20, s20, s11
; GFX9-NEXT:    s_addc_u32 s21, s21, 0
; GFX9-NEXT:    s_mov_b32 s12, s8
; GFX9-NEXT:    s_add_u32 s8, s4, 36
; GFX9-NEXT:    s_mov_b32 s13, s9
; GFX9-NEXT:    s_addc_u32 s9, s5, 0
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, nonkernel@gotpcrel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, nonkernel@gotpcrel32@hi+12
; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX9-NEXT:    s_mov_b32 s14, s10
; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[20:21]
; GFX9-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT:    s_mov_b64 s[2:3], s[22:23]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    ds_write_b32 v3, v3 offset:8
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: withcall:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX10-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX10-NEXT:    s_mov_b32 s22, -1
; GFX10-NEXT:    s_mov_b32 s23, 0x31c16000
; GFX10-NEXT:    s_add_u32 s20, s20, s11
; GFX10-NEXT:    s_addc_u32 s21, s21, 0
; GFX10-NEXT:    s_mov_b32 s12, s8
; GFX10-NEXT:    s_add_u32 s8, s4, 36
; GFX10-NEXT:    s_mov_b32 s13, s9
; GFX10-NEXT:    s_addc_u32 s9, s5, 0
; GFX10-NEXT:    s_getpc_b64 s[4:5]
; GFX10-NEXT:    s_add_u32 s4, s4, nonkernel@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s5, s5, nonkernel@gotpcrel32@hi+12
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX10-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX10-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-NEXT:    s_mov_b32 s14, s10
; GFX10-NEXT:    s_mov_b64 s[10:11], s[6:7]
; GFX10-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX10-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX10-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX10-NEXT:    s_mov_b64 s[0:1], s[20:21]
; GFX10-NEXT:    s_mov_b64 s[2:3], s[22:23]
; GFX10-NEXT:    s_mov_b32 s32, 0
; GFX10-NEXT:    ds_write_b32 v3, v3 offset:8
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX10-NEXT:    s_endpgm
;
; G_GFX9-LABEL: withcall:
; G_GFX9:       ; %bb.0:
; G_GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; G_GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; G_GFX9-NEXT:    s_mov_b32 s22, -1
; G_GFX9-NEXT:    s_mov_b32 s23, 0xe00000
; G_GFX9-NEXT:    s_add_u32 s20, s20, s11
; G_GFX9-NEXT:    s_addc_u32 s21, s21, 0
; G_GFX9-NEXT:    s_mov_b32 s16, s8
; G_GFX9-NEXT:    s_add_u32 s8, s4, 36
; G_GFX9-NEXT:    s_mov_b32 s15, s9
; G_GFX9-NEXT:    s_addc_u32 s9, s5, 0
; G_GFX9-NEXT:    s_mov_b64 s[12:13], s[0:1]
; G_GFX9-NEXT:    s_getpc_b64 s[0:1]
; G_GFX9-NEXT:    s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
; G_GFX9-NEXT:    s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
; G_GFX9-NEXT:    s_load_dwordx2 s[18:19], s[0:1], 0x0
; G_GFX9-NEXT:    s_mov_b32 s14, s10
; G_GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
; G_GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
; G_GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; G_GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; G_GFX9-NEXT:    s_mov_b64 s[0:1], s[20:21]
; G_GFX9-NEXT:    v_mov_b32_e32 v3, 0
; G_GFX9-NEXT:    v_mov_b32_e32 v4, 8
; G_GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
; G_GFX9-NEXT:    s_mov_b64 s[2:3], s[22:23]
; G_GFX9-NEXT:    s_mov_b64 s[4:5], s[12:13]
; G_GFX9-NEXT:    s_mov_b32 s12, s16
; G_GFX9-NEXT:    s_mov_b32 s13, s15
; G_GFX9-NEXT:    s_mov_b32 s32, 0
; G_GFX9-NEXT:    ds_write_b32 v4, v3
; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; G_GFX9-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; G_GFX9-NEXT:    s_endpgm
;
; G_GFX10-LABEL: withcall:
; G_GFX10:       ; %bb.0:
; G_GFX10-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; G_GFX10-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; G_GFX10-NEXT:    s_mov_b32 s22, -1
; G_GFX10-NEXT:    s_mov_b32 s23, 0x31c16000
; G_GFX10-NEXT:    s_add_u32 s20, s20, s11
; G_GFX10-NEXT:    s_addc_u32 s21, s21, 0
; G_GFX10-NEXT:    s_mov_b32 s16, s8
; G_GFX10-NEXT:    s_add_u32 s8, s4, 36
; G_GFX10-NEXT:    s_mov_b32 s15, s9
; G_GFX10-NEXT:    s_addc_u32 s9, s5, 0
; G_GFX10-NEXT:    s_mov_b64 s[12:13], s[0:1]
; G_GFX10-NEXT:    s_getpc_b64 s[0:1]
; G_GFX10-NEXT:    s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
; G_GFX10-NEXT:    s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
; G_GFX10-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; G_GFX10-NEXT:    s_load_dwordx2 s[18:19], s[0:1], 0x0
; G_GFX10-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; G_GFX10-NEXT:    v_mov_b32_e32 v3, 0
; G_GFX10-NEXT:    v_mov_b32_e32 v4, 8
; G_GFX10-NEXT:    s_mov_b32 s14, s10
; G_GFX10-NEXT:    s_mov_b64 s[10:11], s[6:7]
; G_GFX10-NEXT:    v_or3_b32 v31, v0, v1, v2
; G_GFX10-NEXT:    s_mov_b64 s[6:7], s[2:3]
; G_GFX10-NEXT:    s_mov_b64 s[0:1], s[20:21]
; G_GFX10-NEXT:    s_mov_b64 s[2:3], s[22:23]
; G_GFX10-NEXT:    s_mov_b64 s[4:5], s[12:13]
; G_GFX10-NEXT:    s_mov_b32 s12, s16
; G_GFX10-NEXT:    s_mov_b32 s13, s15
; G_GFX10-NEXT:    s_mov_b32 s32, 0
; G_GFX10-NEXT:    ds_write_b32 v4, v3
; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; G_GFX10-NEXT:    s_endpgm
  store i32 0, ptr addrspace(3) @used_by_both
  call void @nonkernel()
  ret void
}
; CHECK: ; LDSByteSize: 16 bytes

; Previous lowering was less efficient here than necessary as the i32 used
; by the kernel is also used by an unrelated non-kernel function. Codegen
; is now the same as nocall_ideal.
define amdgpu_kernel void @nocall_false_sharing() {
; CHECK-LABEL: nocall_false_sharing:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    ds_write_b32 v0, v0
; CHECK-NEXT:    s_endpgm
  store i32 0, ptr addrspace(3) @used_by_both
  ret void
}
; CHECK: ; LDSByteSize: 4 bytes

; Module flag setting amdhsa_code_object_version to 500.
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}