; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GCN %s

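; This file exercises the -amdgpu-loop-prefetch pass on gfx12: for loads with a
; strided address inside a loop, the pass is expected to emit s_prefetch_data
; ahead of the load.

; Flat source pointer: s_prefetch_data is expected in the loop body. The
; prefetch address runs 0xb0 bytes ahead of the load (the preheader biases the
; pointer and the load compensates with offset:-176).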
define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_flat:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_b32 s6, s[4:5], 0x34
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    s_cmp_eq_u32 s6, 0
; GCN-NEXT:    s_cbranch_scc1 .LBB0_3
; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
; GCN-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GCN-NEXT:  .LBB0_2: ; %for.body
; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN-NEXT:    s_wait_alu 0xfffe
; GCN-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GCN-NEXT:    s_prefetch_data s[2:3], 0x0, null, 0
; GCN-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GCN-NEXT:    s_add_co_i32 s6, s6, -1
; GCN-NEXT:    flat_load_b128 v[0:3], v[0:1] offset:-176
; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
; GCN-NEXT:    s_cmp_lg_u32 s6, 0
; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
; GCN-NEXT:    flat_store_b128 v[4:5], v[0:3]
; GCN-NEXT:    s_cbranch_scc1 .LBB0_2
; GCN-NEXT:  .LBB0_3: ; %for.end
; GCN-NEXT:    s_endpgm
entry:
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %idxprom = zext i32 %i.07 to i64
  %arrayidx = getelementptr inbounds <4 x i32>, ptr %s, i64 %idxprom
  %ld = load <4 x i32>, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr %d, i64 %idxprom
  store <4 x i32> %ld, ptr %arrayidx2, align 4
  %inc = add nuw i32 %i.07, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}

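; Global source pointer (addrspace(1)): same prefetch pattern as the flat case,
; with the scalar pointer biased ahead by 0xb0 and the load using offset:-176.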
define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_global:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_b32 s6, s[4:5], 0x34
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    s_cmp_eq_u32 s6, 0
; GCN-NEXT:    s_cbranch_scc1 .LBB1_3
; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
; GCN-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GCN-NEXT:  .LBB1_2: ; %for.body
; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN-NEXT:    global_load_b128 v[1:4], v0, s[2:3] offset:-176
; GCN-NEXT:    s_prefetch_data s[2:3], 0x0, null, 0
; GCN-NEXT:    s_add_co_i32 s6, s6, -1
; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
; GCN-NEXT:    s_cmp_lg_u32 s6, 0
; GCN-NEXT:    s_wait_loadcnt 0x0
; GCN-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
; GCN-NEXT:    s_cbranch_scc1 .LBB1_2
; GCN-NEXT:  .LBB1_3: ; %for.end
; GCN-NEXT:    s_endpgm
entry:
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %idxprom = zext i32 %i.07 to i64
  %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(1) %s, i64 %idxprom
  %ld = load <4 x i32>, ptr addrspace(1) %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom
  store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4
  %inc = add nuw i32 %i.07, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}

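; Constant source pointer (addrspace(4)): the copy is done through scalar
; s_load_b128, and the prefetch uses an immediate 0xb0 offset instead of a
; pre-biased pointer.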
define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addrspace(4) nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_constant:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_b32 s6, s[4:5], 0x34
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    s_cmp_eq_u32 s6, 0
; GCN-NEXT:    s_cbranch_scc1 .LBB2_3
; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
; GCN-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:  .LBB2_2: ; %for.body
; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    s_load_b128 s[8:11], s[2:3], 0x0
; GCN-NEXT:    s_prefetch_data s[2:3], 0xb0, null, 0
; GCN-NEXT:    s_add_co_i32 s6, s6, -1
; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
; GCN-NEXT:    s_cmp_lg_u32 s6, 0
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
; GCN-NEXT:    v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
; GCN-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
; GCN-NEXT:    s_cbranch_scc1 .LBB2_2
; GCN-NEXT:  .LBB2_3: ; %for.end
; GCN-NEXT:    s_endpgm
entry:
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %idxprom = zext i32 %i.07 to i64
  %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(4) %s, i64 %idxprom
  %ld = load <4 x i32>, ptr addrspace(4) %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom
  store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4
  %inc = add nuw i32 %i.07, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}

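; LDS pointers (addrspace(3)): no s_prefetch_data is expected; local memory is
; not prefetched.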
define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspace(3) nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_local:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    s_cmp_eq_u32 s2, 0
; GCN-NEXT:    s_cbranch_scc1 .LBB3_2
; GCN-NEXT:  .LBB3_1: ; %for.body
; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN-NEXT:    s_wait_alu 0xfffe
; GCN-NEXT:    v_mov_b32_e32 v2, s1
; GCN-NEXT:    v_mov_b32_e32 v4, s0
; GCN-NEXT:    s_add_co_i32 s2, s2, -1
; GCN-NEXT:    s_add_co_i32 s0, s0, 16
; GCN-NEXT:    s_add_co_i32 s1, s1, 16
; GCN-NEXT:    ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
; GCN-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset1:1
; GCN-NEXT:    s_cmp_lg_u32 s2, 0
; GCN-NEXT:    s_wait_dscnt 0x1
; GCN-NEXT:    ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
; GCN-NEXT:    s_wait_dscnt 0x1
; GCN-NEXT:    ds_store_2addr_b32 v4, v2, v3 offset1:1
; GCN-NEXT:    s_cbranch_scc1 .LBB3_1
; GCN-NEXT:  .LBB3_2: ; %for.end
; GCN-NEXT:    s_endpgm
entry:
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %idxprom = zext i32 %i.07 to i64
  %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(3) %s, i64 %idxprom
  %ld = load <4 x i32>, ptr addrspace(3) %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(3) %d, i64 %idxprom
  store <4 x i32> %ld, ptr addrspace(3) %arrayidx2, align 4
  %inc = add nuw i32 %i.07, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}