xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
3; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
4; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
5
; dpp_test: i32 case. Passes the two scalar kernel arguments %in1 and %in2 to
; llvm.amdgcn.update.dpp.i32 with the immediates (1, 1, 1, false) and stores
; the result to %out. On every target the assertions expect one v_mov_b32_dpp
; with quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1; the tonga run also
; expects an s_nop 1 immediately before that instruction.
6define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
7; GFX8-LABEL: dpp_test:
8; GFX8:       ; %bb.0:
9; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
10; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
11; GFX8-NEXT:    v_mov_b32_e32 v2, s2
12; GFX8-NEXT:    v_mov_b32_e32 v0, s3
13; GFX8-NEXT:    s_nop 1
14; GFX8-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
15; GFX8-NEXT:    v_mov_b32_e32 v0, s0
16; GFX8-NEXT:    v_mov_b32_e32 v1, s1
17; GFX8-NEXT:    flat_store_dword v[0:1], v2
18; GFX8-NEXT:    s_endpgm
19;
20; GFX10-LABEL: dpp_test:
21; GFX10:       ; %bb.0:
22; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
23; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
24; GFX10-NEXT:    v_mov_b32_e32 v0, s2
25; GFX10-NEXT:    v_mov_b32_e32 v1, s3
26; GFX10-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
27; GFX10-NEXT:    v_mov_b32_e32 v1, 0
28; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
29; GFX10-NEXT:    s_endpgm
30;
31; GFX11-LABEL: dpp_test:
32; GFX11:       ; %bb.0:
33; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
34; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
35; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
36; GFX11-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
37; GFX11-NEXT:    v_mov_b32_e32 v1, 0
38; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
39; GFX11-NEXT:    s_endpgm
40  %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false)
41  store i32 %tmp0, ptr addrspace(1) %out
42  ret void
43}
; update_dppi64_test: i64 case. Each work-item loads an i64 from
; %arg[workitem.id.x], calls llvm.amdgcn.update.dpp.i64 with %in1 as the first
; operand and the loaded value as the second, and stores the result back to the
; same address. The assertions expect the 64-bit operation to be emitted as two
; v_mov_b32_dpp instructions, one per 32-bit half of the register pair.
; NOTE(review): %in2 is never referenced in the body.
44define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) {
45; GFX8-LABEL: update_dppi64_test:
46; GFX8:       ; %bb.0:
47; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
48; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
49; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
50; GFX8-NEXT:    v_mov_b32_e32 v0, s0
51; GFX8-NEXT:    v_mov_b32_e32 v1, s1
52; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
53; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
54; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
55; GFX8-NEXT:    v_mov_b32_e32 v4, s2
56; GFX8-NEXT:    v_mov_b32_e32 v5, s3
57; GFX8-NEXT:    s_waitcnt vmcnt(0)
58; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
59; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
60; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
61; GFX8-NEXT:    s_endpgm
62;
63; GFX10-LABEL: update_dppi64_test:
64; GFX10:       ; %bb.0:
65; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
66; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
67; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
69; GFX10-NEXT:    v_mov_b32_e32 v2, s2
70; GFX10-NEXT:    v_mov_b32_e32 v3, s3
71; GFX10-NEXT:    s_waitcnt vmcnt(0)
72; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
73; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
74; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
75; GFX10-NEXT:    s_endpgm
76;
77; GFX11-LABEL: update_dppi64_test:
78; GFX11:       ; %bb.0:
79; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
80; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
81; GFX11-NEXT:    v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
82; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
83; GFX11-NEXT:    v_mov_b32_e32 v2, s2
84; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
85; GFX11-NEXT:    s_waitcnt vmcnt(0)
86; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
87; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
88; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
89; GFX11-NEXT:    s_endpgm
90  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
91  %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
92  %load = load i64, ptr addrspace(1) %gep
93  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 1, i32 1, i1 false) #1
94  store i64 %tmp0, ptr addrspace(1) %gep
95  ret void
96}
97
; update_dppf64_test: double case. Same shape as the i64 test: load a double
; from %arg[workitem.id.x], call llvm.amdgcn.update.dpp.f64 with %in1 as the
; first operand and the loaded value as the second, store the result back to
; the same address. The assertions expect two v_mov_b32_dpp instructions, one
; per 32-bit half of the register pair.
; NOTE(review): %in2 is never referenced in the body.
98define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1, double %in2) {
99; GFX8-LABEL: update_dppf64_test:
100; GFX8:       ; %bb.0:
101; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
102; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
103; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
104; GFX8-NEXT:    v_mov_b32_e32 v0, s0
105; GFX8-NEXT:    v_mov_b32_e32 v1, s1
106; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
107; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
108; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
109; GFX8-NEXT:    v_mov_b32_e32 v4, s2
110; GFX8-NEXT:    v_mov_b32_e32 v5, s3
111; GFX8-NEXT:    s_waitcnt vmcnt(0)
112; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
113; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
114; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
115; GFX8-NEXT:    s_endpgm
116;
117; GFX10-LABEL: update_dppf64_test:
118; GFX10:       ; %bb.0:
119; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
120; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
121; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
122; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
123; GFX10-NEXT:    v_mov_b32_e32 v2, s2
124; GFX10-NEXT:    v_mov_b32_e32 v3, s3
125; GFX10-NEXT:    s_waitcnt vmcnt(0)
126; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
127; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
128; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
129; GFX10-NEXT:    s_endpgm
130;
131; GFX11-LABEL: update_dppf64_test:
132; GFX11:       ; %bb.0:
133; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
134; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
135; GFX11-NEXT:    v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
136; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
137; GFX11-NEXT:    v_mov_b32_e32 v2, s2
138; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
139; GFX11-NEXT:    s_waitcnt vmcnt(0)
140; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
141; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
142; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
143; GFX11-NEXT:    s_endpgm
144  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
145  %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
146  %load = load double, ptr addrspace(1) %gep
147  %tmp0 = call double @llvm.amdgcn.update.dpp.f64(double %in1, double %load, i32 1, i32 1, i32 1, i1 false) #1
148  store double %tmp0, ptr addrspace(1) %gep
149  ret void
150}
151
; update_dppv2i32_test: <2 x i32> case. Each work-item loads a <2 x i32> from
; %arg[workitem.id.x], calls llvm.amdgcn.update.dpp.v2i32 with %in1 as the
; first operand and the loaded vector as the second, and stores the result back
; to the same address. The assertions expect two v_mov_b32_dpp instructions,
; one per 32-bit register of the pair holding the vector.
; NOTE(review): %in2 is never referenced in the body.
152define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) {
153; GFX8-LABEL: update_dppv2i32_test:
154; GFX8:       ; %bb.0:
155; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
156; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
157; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
158; GFX8-NEXT:    v_mov_b32_e32 v0, s0
159; GFX8-NEXT:    v_mov_b32_e32 v1, s1
160; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
161; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
162; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
163; GFX8-NEXT:    v_mov_b32_e32 v4, s2
164; GFX8-NEXT:    v_mov_b32_e32 v5, s3
165; GFX8-NEXT:    s_waitcnt vmcnt(0)
166; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
167; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
168; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
169; GFX8-NEXT:    s_endpgm
170;
171; GFX10-LABEL: update_dppv2i32_test:
172; GFX10:       ; %bb.0:
173; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
174; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
175; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
176; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
177; GFX10-NEXT:    v_mov_b32_e32 v2, s2
178; GFX10-NEXT:    v_mov_b32_e32 v3, s3
179; GFX10-NEXT:    s_waitcnt vmcnt(0)
180; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
181; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
182; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
183; GFX10-NEXT:    s_endpgm
184;
185; GFX11-LABEL: update_dppv2i32_test:
186; GFX11:       ; %bb.0:
187; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
188; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
189; GFX11-NEXT:    v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
190; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
191; GFX11-NEXT:    v_mov_b32_e32 v2, s2
192; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
193; GFX11-NEXT:    s_waitcnt vmcnt(0)
194; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
195; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
196; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
197; GFX11-NEXT:    s_endpgm
198  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
199  %gep = getelementptr inbounds <2 x i32>, ptr addrspace(1) %arg, i32 %id
200  %load = load <2 x i32>, ptr addrspace(1) %gep
201  %tmp0 = call <2 x i32> @llvm.amdgcn.update.dpp.v2i32(<2 x i32> %in1, <2 x i32> %load, i32 1, i32 1, i32 1, i1 false) #1
202  store <2 x i32> %tmp0, ptr addrspace(1) %gep
203  ret void
204}
205
; update_dppv2f32_test: <2 x float> case. Each work-item loads a <2 x float>
; from %arg[workitem.id.x], calls llvm.amdgcn.update.dpp.v2f32 with %in1 as the
; first operand and the loaded vector as the second, and stores the result back
; to the same address. The assertions expect two v_mov_b32_dpp instructions,
; one per 32-bit register of the pair holding the vector.
; NOTE(review): %in2 is never referenced in the body.
206define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) {
207; GFX8-LABEL: update_dppv2f32_test:
208; GFX8:       ; %bb.0:
209; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
210; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
211; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
212; GFX8-NEXT:    v_mov_b32_e32 v0, s0
213; GFX8-NEXT:    v_mov_b32_e32 v1, s1
214; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
215; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
216; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
217; GFX8-NEXT:    v_mov_b32_e32 v4, s2
218; GFX8-NEXT:    v_mov_b32_e32 v5, s3
219; GFX8-NEXT:    s_waitcnt vmcnt(0)
220; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
221; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
222; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
223; GFX8-NEXT:    s_endpgm
224;
225; GFX10-LABEL: update_dppv2f32_test:
226; GFX10:       ; %bb.0:
227; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
228; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
229; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
230; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
231; GFX10-NEXT:    v_mov_b32_e32 v2, s2
232; GFX10-NEXT:    v_mov_b32_e32 v3, s3
233; GFX10-NEXT:    s_waitcnt vmcnt(0)
234; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
235; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
236; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
237; GFX10-NEXT:    s_endpgm
238;
239; GFX11-LABEL: update_dppv2f32_test:
240; GFX11:       ; %bb.0:
241; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
242; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
243; GFX11-NEXT:    v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
244; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
245; GFX11-NEXT:    v_mov_b32_e32 v2, s2
246; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
247; GFX11-NEXT:    s_waitcnt vmcnt(0)
248; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
249; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
250; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
251; GFX11-NEXT:    s_endpgm
252  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
253  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id
254  %load = load <2 x float>, ptr addrspace(1) %gep
255  %tmp0 = call <2 x float> @llvm.amdgcn.update.dpp.v2f32(<2 x float> %in1, <2 x float> %load, i32 1, i32 1, i32 1, i1 false) #1
256  store <2 x float> %tmp0, ptr addrspace(1) %gep
257  ret void
258}
259
; update_dpp_p0_test: generic pointer (ptr, address space 0) case. Each
; work-item loads a ptr from %arg[workitem.id.x], calls the p0 overload of
; llvm.amdgcn.update.dpp with %in1 as the first operand and the loaded pointer
; as the second, and stores the result back to the same address. The assertions
; expect the 64-bit pointer to be handled as two v_mov_b32_dpp instructions,
; one per 32-bit half of the register pair.
; The callee carries the .p0 overload suffix so that its name agrees with the
; ptr operand/result types, mirroring the .p3 and .p5 tests in this file.
; NOTE(review): %in2 is never referenced in the body.
260define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) {
261; GFX8-LABEL: update_dpp_p0_test:
262; GFX8:       ; %bb.0:
263; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
264; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
265; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
266; GFX8-NEXT:    v_mov_b32_e32 v0, s0
267; GFX8-NEXT:    v_mov_b32_e32 v1, s1
268; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
269; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
270; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
271; GFX8-NEXT:    v_mov_b32_e32 v4, s2
272; GFX8-NEXT:    v_mov_b32_e32 v5, s3
273; GFX8-NEXT:    s_waitcnt vmcnt(0)
274; GFX8-NEXT:    v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
275; GFX8-NEXT:    v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
276; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
277; GFX8-NEXT:    s_endpgm
278;
279; GFX10-LABEL: update_dpp_p0_test:
280; GFX10:       ; %bb.0:
281; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
282; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
283; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
284; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
285; GFX10-NEXT:    v_mov_b32_e32 v2, s2
286; GFX10-NEXT:    v_mov_b32_e32 v3, s3
287; GFX10-NEXT:    s_waitcnt vmcnt(0)
288; GFX10-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
289; GFX10-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
290; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
291; GFX10-NEXT:    s_endpgm
292;
293; GFX11-LABEL: update_dpp_p0_test:
294; GFX11:       ; %bb.0:
295; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
296; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
297; GFX11-NEXT:    v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
298; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
299; GFX11-NEXT:    v_mov_b32_e32 v2, s2
300; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[0:1]
301; GFX11-NEXT:    s_waitcnt vmcnt(0)
302; GFX11-NEXT:    v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
303; GFX11-NEXT:    v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
304; GFX11-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
305; GFX11-NEXT:    s_endpgm
306  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
307  %gep = getelementptr inbounds ptr, ptr addrspace(1) %arg, i32 %id
308  %load = load ptr, ptr addrspace(1) %gep
309  %tmp0 = call ptr @llvm.amdgcn.update.dpp.p0(ptr %in1, ptr %load, i32 1, i32 1, i32 1, i1 false) #1
310  store ptr %tmp0, ptr addrspace(1) %gep
311  ret void
312}
313
; update_dpp_p3_test: ptr addrspace(3) case. Each work-item loads a
; ptr addrspace(3) from %arg[workitem.id.x] (itself an addrspace(3) address),
; calls llvm.amdgcn.update.dpp.p3 with %in1 as the first operand and the loaded
; pointer as the second, and stores the result back to the same address. The
; assertions expect 32-bit DS accesses (ds_read_b32/ds_write_b32, or
; ds_load_b32/ds_store_b32 on gfx1100) around a single v_mov_b32_dpp; the tonga
; run also expects an s_nop 0 immediately before the DPP instruction.
; NOTE(review): %in2 is never referenced in the body.
314define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspace(3) %in1, ptr %in2) {
315; GFX8-LABEL: update_dpp_p3_test:
316; GFX8:       ; %bb.0:
317; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
318; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
319; GFX8-NEXT:    s_mov_b32 m0, -1
320; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
321; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
322; GFX8-NEXT:    ds_read_b32 v1, v0
323; GFX8-NEXT:    v_mov_b32_e32 v2, s1
324; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
325; GFX8-NEXT:    s_nop 0
326; GFX8-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
327; GFX8-NEXT:    ds_write_b32 v0, v2
328; GFX8-NEXT:    s_endpgm
329;
330; GFX10-LABEL: update_dpp_p3_test:
331; GFX10:       ; %bb.0:
332; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
333; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
334; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
335; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
336; GFX10-NEXT:    v_mov_b32_e32 v2, s1
337; GFX10-NEXT:    ds_read_b32 v1, v0
338; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
339; GFX10-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
340; GFX10-NEXT:    ds_write_b32 v0, v2
341; GFX10-NEXT:    s_endpgm
342;
343; GFX11-LABEL: update_dpp_p3_test:
344; GFX11:       ; %bb.0:
345; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
346; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
347; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
348; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
349; GFX11-NEXT:    v_add_nc_u32_e32 v0, s0, v0
350; GFX11-NEXT:    v_mov_b32_e32 v2, s1
351; GFX11-NEXT:    ds_load_b32 v1, v0
352; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
353; GFX11-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
354; GFX11-NEXT:    ds_store_b32 v0, v2
355; GFX11-NEXT:    s_endpgm
356  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
357  %gep = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %arg, i32 %id
358  %load = load ptr addrspace(3), ptr addrspace(3) %gep
359  %tmp0 = call ptr addrspace(3) @llvm.amdgcn.update.dpp.p3(ptr addrspace(3) %in1, ptr addrspace(3) %load, i32 1, i32 1, i32 1, i1 false) #1
360  store ptr addrspace(3) %tmp0, ptr addrspace(3) %gep
361  ret void
362}
363
; update_dpp_p5_test: ptr addrspace(5) case. Each work-item loads a
; ptr addrspace(5) from %arg[workitem.id.x] (itself an addrspace(5) address),
; calls llvm.amdgcn.update.dpp.p5 with %in1 as the first operand and the loaded
; pointer as the second, and stores the result back to the same address. The
; tonga and gfx1010 assertions expect a SCRATCH_RSRC_DWORD0/1 descriptor setup
; plus buffer_load_dword/buffer_store_dword ... offen, while the gfx1100
; assertions expect scratch_load_b32/scratch_store_b32; all three expect a
; single v_mov_b32_dpp, and the tonga run also expects an s_nop 0 before it.
; NOTE(review): %in2 is never referenced in the body.
364define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspace(5) %in1, ptr %in2) {
365; GFX8-LABEL: update_dpp_p5_test:
366; GFX8:       ; %bb.0:
367; GFX8-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
368; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
369; GFX8-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
370; GFX8-NEXT:    s_mov_b32 s90, -1
371; GFX8-NEXT:    s_mov_b32 s91, 0xe80000
372; GFX8-NEXT:    s_add_u32 s88, s88, s11
373; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
374; GFX8-NEXT:    s_addc_u32 s89, s89, 0
375; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
376; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
377; GFX8-NEXT:    buffer_load_dword v1, v0, s[88:91], 0 offen
378; GFX8-NEXT:    v_mov_b32_e32 v2, s1
379; GFX8-NEXT:    s_waitcnt vmcnt(0)
380; GFX8-NEXT:    s_nop 0
381; GFX8-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
382; GFX8-NEXT:    buffer_store_dword v2, v0, s[88:91], 0 offen
383; GFX8-NEXT:    s_endpgm
384;
385; GFX10-LABEL: update_dpp_p5_test:
386; GFX10:       ; %bb.0:
387; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
388; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
389; GFX10-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
390; GFX10-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
391; GFX10-NEXT:    s_mov_b32 s14, -1
392; GFX10-NEXT:    s_mov_b32 s15, 0x31c16000
393; GFX10-NEXT:    s_add_u32 s12, s12, s11
394; GFX10-NEXT:    s_addc_u32 s13, s13, 0
395; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
396; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
397; GFX10-NEXT:    v_mov_b32_e32 v2, s1
398; GFX10-NEXT:    buffer_load_dword v1, v0, s[12:15], 0 offen
399; GFX10-NEXT:    s_waitcnt vmcnt(0)
400; GFX10-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
401; GFX10-NEXT:    buffer_store_dword v2, v0, s[12:15], 0 offen
402; GFX10-NEXT:    s_endpgm
403;
404; GFX11-LABEL: update_dpp_p5_test:
405; GFX11:       ; %bb.0:
406; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
407; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
408; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
409; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
410; GFX11-NEXT:    v_add_nc_u32_e32 v0, s0, v0
411; GFX11-NEXT:    v_mov_b32_e32 v2, s1
412; GFX11-NEXT:    scratch_load_b32 v1, v0, off
413; GFX11-NEXT:    s_waitcnt vmcnt(0)
414; GFX11-NEXT:    v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
415; GFX11-NEXT:    scratch_store_b32 v0, v2, off
416; GFX11-NEXT:    s_endpgm
417  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
418  %gep = getelementptr inbounds ptr addrspace(5), ptr addrspace(5) %arg, i32 %id
419  %load = load ptr addrspace(5), ptr addrspace(5) %gep
420  %tmp0 = call ptr addrspace(5) @llvm.amdgcn.update.dpp.p5(ptr addrspace(5) %in1, ptr addrspace(5) %load, i32 1, i32 1, i32 1, i1 false) #1
421  store ptr addrspace(5) %tmp0, ptr addrspace(5) %gep
422  ret void
423}
424
; Explicit intrinsic declarations. Only workitem.id.x and the i32 and i64
; overloads of llvm.amdgcn.update.dpp are declared; the other update.dpp
; overloads called by the tests above have no explicit declaration in this file.
425declare i32 @llvm.amdgcn.workitem.id.x() #0
426declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #1
427declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #1
428
; Attribute groups: #0 is attached to the workitem.id.x declaration; #1 is
; attached to the update.dpp declarations and to the update.dpp call sites in
; the update_dpp* tests (the call in dpp_test carries no attribute group).
429attributes #0 = { nounwind readnone speculatable }
430attributes #1 = { convergent nounwind readnone }
431