; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GCN %s

define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_flat:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_b32 s6, s[4:5], 0x34
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    s_cmp_eq_u32 s6, 0
; GCN-NEXT:    s_cbranch_scc1 .LBB0_3
; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
; GCN-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GCN-NEXT:  .LBB0_2: ; %for.body
; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN-NEXT:    s_wait_alu 0xfffe
; GCN-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GCN-NEXT:    s_prefetch_data s[2:3], 0x0, null, 0
; GCN-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GCN-NEXT:    s_add_co_i32 s6, s6, -1
; GCN-NEXT:    flat_load_b128 v[0:3], v[0:1] offset:-176
; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
; GCN-NEXT:    s_cmp_lg_u32 s6, 0
; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
; GCN-NEXT:    flat_store_b128 v[4:5], v[0:3]
; GCN-NEXT:    s_cbranch_scc1 .LBB0_2
; GCN-NEXT:  .LBB0_3: ; %for.end
; GCN-NEXT:    s_endpgm
entry:
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.end, label %for.body

for.body: ; preds = %entry, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %idxprom = zext i32 %i.07 to i64
  %arrayidx = getelementptr inbounds <4 x i32>, ptr %s, i64 %idxprom
  %ld = load <4 x i32>, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr %d, i64 %idxprom
  store <4 x i32> %ld, ptr %arrayidx2, align 4
  %inc = add nuw i32 %i.07, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.end, label %for.body

for.end: ; preds = %for.body, %entry
  ret void
}

define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_global:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_b32 s6, s[4:5], 0x34
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    s_cmp_eq_u32 s6, 0
; GCN-NEXT:    s_cbranch_scc1 .LBB1_3
; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
; GCN-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GCN-NEXT:  .LBB1_2: ; %for.body
; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN-NEXT:    global_load_b128 v[1:4], v0, s[2:3] offset:-176
; GCN-NEXT:    s_prefetch_data s[2:3], 0x0, null, 0
; GCN-NEXT:    s_add_co_i32 s6, s6, -1
; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
; GCN-NEXT:    s_cmp_lg_u32 s6, 0
; GCN-NEXT:    s_wait_loadcnt 0x0
; GCN-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
; GCN-NEXT:    s_cbranch_scc1 .LBB1_2
; GCN-NEXT:  .LBB1_3: ; %for.end
; GCN-NEXT:    s_endpgm
entry:
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.end, label %for.body

for.body: ; preds = %entry, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %idxprom = zext i32 %i.07 to i64
  %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(1) %s, i64 %idxprom
  %ld = load <4 x i32>, ptr addrspace(1) %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom
  store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4
  %inc = add nuw i32 %i.07, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.end, label %for.body

for.end: ; preds = %for.body, %entry
  ret void
}

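; For a constant (addrspace(4)) source the loads become scalar s_load_b128, so the
; checks below expect s_prefetch_data with an immediate offset of 0xb0 against the
; running base, rather than the pre-advanced pointer used in the two cases above.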
define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addrspace(4) nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_constant:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_b32 s6, s[4:5], 0x34
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    s_cmp_eq_u32 s6, 0
; GCN-NEXT:    s_cbranch_scc1 .LBB2_3
; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
; GCN-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:  .LBB2_2: ; %for.body
; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    s_load_b128 s[8:11], s[2:3], 0x0
; GCN-NEXT:    s_prefetch_data s[2:3], 0xb0, null, 0
; GCN-NEXT:    s_add_co_i32 s6, s6, -1
; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
; GCN-NEXT:    s_cmp_lg_u32 s6, 0
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
; GCN-NEXT:    v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
; GCN-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
; GCN-NEXT:    s_cbranch_scc1 .LBB2_2
; GCN-NEXT:  .LBB2_3: ; %for.end
; GCN-NEXT:    s_endpgm
entry:
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.end, label %for.body

for.body: ; preds = %entry, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %idxprom = zext i32 %i.07 to i64
  %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(4) %s, i64 %idxprom
  %ld = load <4 x i32>, ptr addrspace(4) %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom
  store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4
  %inc = add nuw i32 %i.07, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.end, label %for.body

for.end: ; preds = %for.body, %entry
  ret void
}

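; The LDS (addrspace(3)) copy is not prefetched: s_prefetch_data targets flat,
; global, and constant addresses only, so the checks below contain no prefetch.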
define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspace(3) nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_local:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    s_cmp_eq_u32 s2, 0
; GCN-NEXT:    s_cbranch_scc1 .LBB3_2
; GCN-NEXT:  .LBB3_1: ; %for.body
; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN-NEXT:    s_wait_alu 0xfffe
; GCN-NEXT:    v_mov_b32_e32 v2, s1
; GCN-NEXT:    v_mov_b32_e32 v4, s0
; GCN-NEXT:    s_add_co_i32 s2, s2, -1
; GCN-NEXT:    s_add_co_i32 s0, s0, 16
; GCN-NEXT:    s_add_co_i32 s1, s1, 16
; GCN-NEXT:    ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
; GCN-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset1:1
; GCN-NEXT:    s_cmp_lg_u32 s2, 0
; GCN-NEXT:    s_wait_dscnt 0x1
; GCN-NEXT:    ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
; GCN-NEXT:    s_wait_dscnt 0x1
; GCN-NEXT:    ds_store_2addr_b32 v4, v2, v3 offset1:1
; GCN-NEXT:    s_cbranch_scc1 .LBB3_1
; GCN-NEXT:  .LBB3_2: ; %for.end
; GCN-NEXT:    s_endpgm
entry:
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.end, label %for.body

for.body: ; preds = %entry, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %idxprom = zext i32 %i.07 to i64
  %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(3) %s, i64 %idxprom
  %ld = load <4 x i32>, ptr addrspace(3) %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(3) %d, i64 %idxprom
  store <4 x i32> %ld, ptr addrspace(3) %arrayidx2, align 4
  %inc = add nuw i32 %i.07, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.end, label %for.body

for.end: ; preds = %for.body, %entry
  ret void
}