; RUN: llc -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads=true -verify-machineinstrs < %s | FileCheck %s

; uniform loads
; CHECK-LABEL: @uniform_load
; CHECK: s_load_dwordx4
; CHECK-NOT: flat_load_dword

define amdgpu_kernel void @uniform_load(ptr addrspace(1) %arg, [8 x i32], ptr addrspace(1) %arg1) {
bb:
  %tmp2 = load float, ptr addrspace(1) %arg, align 4, !tbaa !8
  %tmp3 = fadd float %tmp2, 0.000000e+00
  %tmp4 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 1
  %tmp5 = load float, ptr addrspace(1) %tmp4, align 4, !tbaa !8
  %tmp6 = fadd float %tmp3, %tmp5
  %tmp7 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 2
  %tmp8 = load float, ptr addrspace(1) %tmp7, align 4, !tbaa !8
  %tmp9 = fadd float %tmp6, %tmp8
  %tmp10 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 3
  %tmp11 = load float, ptr addrspace(1) %tmp10, align 4, !tbaa !8
  %tmp12 = fadd float %tmp9, %tmp11
  %tmp13 = getelementptr inbounds float, ptr addrspace(1) %arg1
  store float %tmp12, ptr addrspace(1) %tmp13, align 4, !tbaa !8
  ret void
}

; uniform loads before and after an aliasing store
; CHECK-LABEL: @uniform_load_store_load
; CHECK: s_load_dwordx4
; CHECK: s_load_dword
; CHECK: flat_store_dword
; CHECK: flat_load_dword
; CHECK: flat_store_dword

define amdgpu_kernel void @uniform_load_store_load(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
bb:
  %tmp2 = load float, ptr addrspace(1) %arg0, !tbaa !8
  store float %tmp2, ptr addrspace(1) %arg1, !tbaa !8
  %tmp3 = load float, ptr addrspace(1) %arg0, !tbaa !8
  store float %tmp3, ptr addrspace(1) %arg1, !tbaa !8
  ret void
}

; non-uniform loads
; CHECK-LABEL: @non-uniform_load
; CHECK: flat_load_dword
; CHECK-NOT: s_load_dwordx4

define amdgpu_kernel void @non-uniform_load(ptr addrspace(1) %arg, [8 x i32], ptr addrspace(1) %arg1) #0 {
bb:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp2 = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %tmp
  %tmp3 = load float, ptr addrspace(1) %tmp2, align 4, !tbaa !8
  %tmp4 = fadd float %tmp3, 0.000000e+00
  %tmp5 = add i32 %tmp, 1
  %tmp6 = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %tmp5
  %tmp7 = load float, ptr addrspace(1) %tmp6, align 4, !tbaa !8
  %tmp8 = fadd float %tmp4, %tmp7
  %tmp9 = add i32 %tmp, 2
  %tmp10 = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %tmp9
  %tmp11 = load float, ptr addrspace(1) %tmp10, align 4, !tbaa !8
  %tmp12 = fadd float %tmp8, %tmp11
  %tmp13 = add i32 %tmp, 3
  %tmp14 = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %tmp13
  %tmp15 = load float, ptr addrspace(1) %tmp14, align 4, !tbaa !8
  %tmp16 = fadd float %tmp12, %tmp15
  %tmp17 = getelementptr inbounds float, ptr addrspace(1) %arg1, i32 %tmp
  store float %tmp16, ptr addrspace(1) %tmp17, align 4, !tbaa !8
  ret void
}


; uniform load dominated by no-alias store - scalarize
; CHECK-LABEL: @no_memdep_alias_arg
; CHECK: s_load_dwordx2 s[[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]], s[8:9], 0x0
; CHECK: s_load_dword [[SVAL:s[0-9]+]], s[[[IN_LO]]:[[IN_HI]]], 0x0
; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]

define amdgpu_kernel void @no_memdep_alias_arg(ptr addrspace(1) noalias %in, [8 x i32], ptr addrspace(1) %out0, [8 x i32], ptr addrspace(1) %out1) {
  store i32 0, ptr addrspace(1) %out0
  %val = load i32, ptr addrspace(1) %in
  store i32 %val, ptr addrspace(1) %out1
  ret void
}

; uniform load dominated by alias store - vector
; CHECK-LABEL: {{^}}memdep:
; CHECK: flat_store_dword
; CHECK: flat_load_dword [[VVAL:v[0-9]+]]
; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
define amdgpu_kernel void @memdep(ptr addrspace(1) %in, [8 x i32], ptr addrspace(1) %out0, [8 x i32], ptr addrspace(1) %out1) {
  store i32 0, ptr addrspace(1) %out0
  %val = load i32, ptr addrspace(1) %in
  store i32 %val, ptr addrspace(1) %out1
  ret void
}

; uniform load from global array
; CHECK-LABEL: @global_array
; CHECK: s_getpc_b64 [[GET_PC:s\[[0-9]+:[0-9]+\]]]
; CHECK-DAG: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]], [[GET_PC]], 0x0
; CHECK-DAG: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0
; CHECK-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[8:9], 0x0
; CHECK-DAG: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0
; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
@A = common local_unnamed_addr addrspace(1) global ptr addrspace(1) null, align 4

define amdgpu_kernel void @global_array(ptr addrspace(1) nocapture %out) {
entry:
  %load0 = load ptr addrspace(1), ptr addrspace(1) @A, align 4
  %load1 = load i32, ptr addrspace(1) %load0, align 4
  store i32 %load1, ptr addrspace(1) %out, align 4
  ret void
}


; uniform load from global array dominated by alias store
; CHECK-LABEL: @global_array_alias_store
; CHECK: flat_store_dword
; CHECK: v_mov_b32_e32 v[[ADDR_LO:[0-9]+]], s{{[0-9]+}}
; CHECK: v_mov_b32_e32 v[[ADDR_HI:[0-9]+]], s{{[0-9]+}}
; CHECK: flat_load_dwordx2 [[A_ADDR:v\[[0-9]+:[0-9]+\]]], v[[[ADDR_LO]]:[[ADDR_HI]]]
; CHECK: flat_load_dword [[VVAL:v[0-9]+]], [[A_ADDR]]
; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
define amdgpu_kernel void @global_array_alias_store(ptr addrspace(1) nocapture %out, [8 x i32], i32 %n) {
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i32 %n
  store i32 12, ptr addrspace(1) %gep
  %load0 = load ptr addrspace(1), ptr addrspace(1) @A, align 4
  %load1 = load i32, ptr addrspace(1) %load0, align 4
  store i32 %load1, ptr addrspace(1) %out, align 4
  ret void
}


declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #1 = { nounwind readnone }

!8 = !{!9, !9, i64 0}
!9 = !{!"float", !10, i64 0}
!10 = !{!"omnipotent char", !11, i64 0}
!11 = !{!"Simple C/C++ TBAA"}