; LLVM IR codegen test: flat (generic) address-space loads/stores on AMDGPU.
; Verifies instruction selection of flat_load_*/flat_store_* across CI/VI,
; GFX9, and GFX10+ targets, including immediate-offset folding limits
; (GFX9 allows offset:0..4095 on flat ops; CI/VI and negative offsets do not).
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,CIVI %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,CIVI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,CIVI,CIVI-HSA %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX11 %s

; GCN-LABEL: {{^}}store_flat_i32:
; GCN-DAG: s_load_{{dwordx2|b64}} s[[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]],
; GCN-DAG: s_load_{{dword|b32}} s[[SDATA:[0-9]+]],
; GCN: s_waitcnt lgkmcnt(0)
; GCN-DAG: v_mov_b32_e32 v[[DATA:[0-9]+]], s[[SDATA]]
; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
; GCN: flat_store_{{dword|b32}} v[[[LO_VREG]]:[[HI_VREG]]], v[[DATA]]
define amdgpu_kernel void @store_flat_i32(ptr addrspace(1) %gptr, i32 %x) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  store volatile i32 %x, ptr %fptr, align 4
  ret void
}

; GCN-LABEL: {{^}}store_flat_i64:
; GCN: flat_store_{{dword|b64}}
define amdgpu_kernel void @store_flat_i64(ptr addrspace(1) %gptr, i64 %x) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  store volatile i64 %x, ptr %fptr, align 8
  ret void
}

; GCN-LABEL: {{^}}store_flat_v4i32:
; GCN: flat_store_{{dword|b128}}
define amdgpu_kernel void @store_flat_v4i32(ptr addrspace(1) %gptr, <4 x i32> %x) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  store volatile <4 x i32> %x, ptr %fptr, align 16
  ret void
}

; GCN-LABEL: {{^}}store_flat_trunc_i16:
; GCN: flat_store_{{short|b16}}
define amdgpu_kernel void @store_flat_trunc_i16(ptr addrspace(1) %gptr, i32 %x) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  %y = trunc i32 %x to i16
  store volatile i16 %y, ptr %fptr, align 2
  ret void
}

; GCN-LABEL: {{^}}store_flat_trunc_i8:
; GCN: flat_store_{{byte|b8}}
define amdgpu_kernel void @store_flat_trunc_i8(ptr addrspace(1) %gptr, i32 %x) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  %y = trunc i32 %x to i8
  store volatile i8 %y, ptr %fptr, align 2
  ret void
}



; GCN-LABEL: load_flat_i32:
; GCN: flat_load_{{dword|b32}}
define amdgpu_kernel void @load_flat_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  %fload = load volatile i32, ptr %fptr, align 4
  store i32 %fload, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: load_flat_i64:
; GCN: flat_load_{{dword|b64}}
define amdgpu_kernel void @load_flat_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  %fload = load volatile i64, ptr %fptr, align 8
  store i64 %fload, ptr addrspace(1) %out, align 8
  ret void
}

; GCN-LABEL: load_flat_v4i32:
; GCN: flat_load_{{dword|b128}}
define amdgpu_kernel void @load_flat_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  %fload = load volatile <4 x i32>, ptr %fptr, align 32
  store <4 x i32> %fload, ptr addrspace(1) %out, align 8
  ret void
}

; GCN-LABEL: sextload_flat_i8:
; GCN: flat_load_{{sbyte|i8}}
define amdgpu_kernel void @sextload_flat_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  %fload = load volatile i8, ptr %fptr, align 4
  %ext = sext i8 %fload to i32
  store i32 %ext, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: zextload_flat_i8:
; GCN: flat_load_{{ubyte|u8}}
define amdgpu_kernel void @zextload_flat_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  %fload = load volatile i8, ptr %fptr, align 4
  %ext = zext i8 %fload to i32
  store i32 %ext, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: sextload_flat_i16:
; GCN: flat_load_{{sshort|i16}}
define amdgpu_kernel void @sextload_flat_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  %fload = load volatile i16, ptr %fptr, align 4
  %ext = sext i16 %fload to i32
  store i32 %ext, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: zextload_flat_i16:
; GCN: flat_load_{{ushort|u16}}
define amdgpu_kernel void @zextload_flat_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
  %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
  %fload = load volatile i16, ptr %fptr, align 4
  %ext = zext i16 %fload to i32
  store i32 %ext, ptr addrspace(1) %out, align 4
  ret void
}

; Leaking the private pointer through LDS forces a true flat access
; (the compiler cannot prove the pointer is scratch-only).
; GCN-LABEL: flat_scratch_unaligned_load:
; GFX9: flat_load_dword
; GFX10PLUS: flat_load_{{dword|b32}}
define amdgpu_kernel void @flat_scratch_unaligned_load() {
  %scratch = alloca i32, addrspace(5)
  %fptr = addrspacecast ptr addrspace(5) %scratch to ptr
  store volatile ptr %fptr, ptr addrspace(3) null
  %ld = load volatile i32, ptr %fptr, align 1
  ret void
}

; GCN-LABEL: flat_scratch_unaligned_store:
; GFX9: flat_store_dword
; GFX10PLUS: flat_store_{{dword|b32}}
define amdgpu_kernel void @flat_scratch_unaligned_store() {
  %scratch = alloca i32, addrspace(5)
  %fptr = addrspacecast ptr addrspace(5) %scratch to ptr
  store volatile ptr %fptr, ptr addrspace(3) null
  store volatile i32 0, ptr %fptr, align 1
  ret void
}

; CI/VI HSA split multi-dword flat scratch accesses; GFX9+ keep them wide.
; GCN-LABEL: flat_scratch_multidword_load_kernel:
; CIVI-HSA: flat_load_dword v
; CIVI-HSA: flat_load_dword v
; GFX9: flat_load_dwordx2
; GFX10PLUS: flat_load_{{dwordx2|b64}}
; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr
define amdgpu_kernel void @flat_scratch_multidword_load_kernel() {
  %scratch = alloca <2 x i32>, addrspace(5)
  %fptr = addrspacecast ptr addrspace(5) %scratch to ptr
  %ld = load volatile <2 x i32>, ptr %fptr
  ret void
}
;
; GCN-LABEL: flat_scratch_multidword_load_func:
; CIVI-HSA: flat_load_dword v
; CIVI-HSA: flat_load_dword v
; GFX9: flat_load_dwordx2
; GFX10PLUS: flat_load_{{dwordx2|b64}}
; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr
define <2 x i32> @flat_scratch_multidword_load_func(ptr %maybe.scratch) {
  %load = load <2 x i32>, ptr %maybe.scratch
  ret <2 x i32> %load
}

; GCN-LABEL: flat_scratch_multidword_store_kernel:
; CIVI-HSA: flat_store_dword v
; CIVI-HSA: flat_store_dword v
; GFX9: flat_store_dwordx2
; GFX10PLUS: flat_store_{{dwordx2|b64}}
; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr
define amdgpu_kernel void @flat_scratch_multidword_store_kernel() {
  %scratch = alloca <2 x i32>, addrspace(5)
  %fptr = addrspacecast ptr addrspace(5) %scratch to ptr
  store volatile <2 x i32> zeroinitializer, ptr %fptr
  ret void
}

; GCN-LABEL: flat_scratch_multidword_store_func:
; CIVI-HSA: flat_store_dword v
; CIVI-HSA: flat_store_dword v
; GFX9: flat_store_dwordx2
; GFX10PLUS: flat_store_{{dwordx2|b64}}
define void @flat_scratch_multidword_store_func(ptr %maybe.scratch) {
  store <2 x i32> zeroinitializer, ptr %maybe.scratch
  ret void
}

; GFX9 can fold a 0..4095 immediate offset into flat instructions; CI/VI cannot.
; GCN-LABEL: {{^}}store_flat_i8_max_offset:
; CIVI: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:4095{{$}}
define amdgpu_kernel void @store_flat_i8_max_offset(ptr %fptr, i8 %x) #0 {
  %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 4095
  store volatile i8 %x, ptr %fptr.offset
  ret void
}

; Offset 4096 exceeds the 12-bit immediate range, so no offset is folded.
; GCN-LABEL: {{^}}store_flat_i8_max_offset_p1:
; GCN: flat_store_{{byte|b8}} v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{( dlc)?}}{{$}}
define amdgpu_kernel void @store_flat_i8_max_offset_p1(ptr %fptr, i8 %x) #0 {
  %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 4096
  store volatile i8 %x, ptr %fptr.offset
  ret void
}

; Negative offsets are not encodable on flat ops; expect explicit 64-bit add.
; GCN-LABEL: {{^}}store_flat_i8_neg_offset:
; CIVI: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}

; GFX9: v_add_co_u32_e64 v{{[0-9]+}}, vcc, -2, s
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1,
; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @store_flat_i8_neg_offset(ptr %fptr, i8 %x) #0 {
  %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 -2
  store volatile i8 %x, ptr %fptr.offset
  ret void
}

; GCN-LABEL: {{^}}load_flat_i8_max_offset:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc{{$}}
; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
; GFX11: flat_load_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc dlc{{$}}
define amdgpu_kernel void @load_flat_i8_max_offset(ptr %fptr) #0 {
  %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 4095
  %val = load volatile i8, ptr %fptr.offset
  ret void
}

; GCN-LABEL: {{^}}load_flat_i8_max_offset_p1:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX10PLUS: flat_load_{{ubyte|u8}} v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
define amdgpu_kernel void @load_flat_i8_max_offset_p1(ptr %fptr) #0 {
  %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 4096
  %val = load volatile i8, ptr %fptr.offset
  ret void
}

; GCN-LABEL: {{^}}load_flat_i8_neg_offset:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}

; GFX9: v_add_co_u32_e64 v{{[0-9]+}}, vcc, -2, s
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1,
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @load_flat_i8_neg_offset(ptr %fptr) #0 {
  %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 -2
  %val = load volatile i8, ptr %fptr.offset
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }