; xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll (revision 6d7e51de5ec46c1fcc7a7e80135f561a88a1296b)
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-OPT,GCN-OPT %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -O0 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-NOOPT %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GCN-OPT %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-enable-vopd=0 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11,GCN-OPT %s

; i32 old/src taken from kernel arguments, with dpp_ctrl=1, row_mask=1,
; bank_mask=1 and bound_ctrl off. Both values are expected to be copied from
; SGPRs into VGPRs, and the DPP mov must end right after bank_mask (no
; bound_ctrl suffix). On GFX8 two s_mov (optimized) or an s_nop 1 (-O0) are
; expected before the DPP mov.
; GCN-LABEL: {{^}}dpp_test:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
  %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) #0
  store i32 %tmp0, ptr addrspace(1) %out
  ret void
}

; Same shape as dpp_test, but with dpp_ctrl=2 and the bound_ctrl operand set to
; true: the DPP mov must carry quad_perm:[2,0,0,0] and end in bound_ctrl:1.
; GCN-LABEL: {{^}}dpp_test_bc:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_bc(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
  %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 2, i32 1, i32 1, i1 true) #0
  store i32 %tmp0, ptr addrspace(1) %out
  ret void
}


; The DPP source is the result of an add on a value loaded from LDS; old is the
; constant 0 and dpp_ctrl=177 with row_mask=15, bank_mask=15 is expected to
; print as quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf. On GFX8 the DPP mov
; must immediately follow an s_nop 1.
; FileCheck does not accept a comma-separated prefix list inside a directive
; (only the last prefix before the colon is honoured), so the GFX10 and GFX11
; add checks are spelled as two separate lines.
; GCN-LABEL: {{^}}dpp_test1:
; GFX10: v_add_nc_u32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GFX11: v_add_nc_u32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GFX8-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
; GFX8-NOOPT: v_add_u32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX8-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GFX8: s_nop 1
; GFX8-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
@0 = internal unnamed_addr addrspace(3) global [448 x i32] undef, align 4
define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds [448 x i32], ptr addrspace(3) @0, i32 0, i32 %tmp
  %tmp3 = load i32, ptr addrspace(3) %tmp2, align 4
  fence syncscope("workgroup-one-as") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup-one-as") acquire
  %tmp4 = add nsw i32 %tmp3, %tmp3
  %tmp5 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp4, i32 177, i32 15, i32 15, i1 zeroext false)
  %tmp6 = add nsw i32 %tmp5, %tmp4
  %tmp7 = getelementptr inbounds i32, ptr %arg, i64 %tmp1
  store i32 %tmp6, ptr %tmp7, align 4
  ret void
}

; i64 operands: the 64-bit source is loaded from global memory and the
; optimized builds are expected to emit one 32-bit DPP mov per loaded half.
; The GCN-NOOPT lines are inert as written, because no RUN line enables that
; prefix.
; GCN-LABEL: {{^}}update_dppi64_test:
; GCN:     load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
  %load = load i64, ptr addrspace(1) %gep
  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 1, i32 1, i1 false) #0
  store i64 %tmp0, ptr addrspace(1) %gep
  ret void
}

; double operands: same expectations as the i64 variant, i.e. a 64-bit load
; followed (in optimized builds) by one 32-bit DPP mov per loaded half.
; The GCN-NOOPT lines are inert as written, because no RUN line enables that
; prefix.
; GCN-LABEL: {{^}}update_dppf64_test:
; GCN:     load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1, double %in2) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
  %load = load double, ptr addrspace(1) %gep
  %tmp0 = call double @llvm.amdgcn.update.dpp.f64(double %in1, double %load, i32 1, i32 1, i32 1, i1 false) #0
  store double %tmp0, ptr addrspace(1) %gep
  ret void
}

; <2 x i32> operands: a 64-bit load followed (in optimized builds) by one
; 32-bit DPP mov per loaded half.
; The GCN-NOOPT lines are inert as written, because no RUN line enables that
; prefix.
; GCN-LABEL: {{^}}update_dppv2i32_test:
; GCN:     load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x i32>, ptr addrspace(1) %arg, i32 %id
  %load = load <2 x i32>, ptr addrspace(1) %gep
  %tmp0 = call <2 x i32> @llvm.amdgcn.update.dpp.v2i32(<2 x i32> %in1, <2 x i32> %load, i32 1, i32 1, i32 1, i1 false) #0
  store <2 x i32> %tmp0, ptr addrspace(1) %gep
  ret void
}

; <2 x float> operands: a 64-bit load followed (in optimized builds) by one
; 32-bit DPP mov per loaded half.
; The GCN-NOOPT lines are inert as written, because no RUN line enables that
; prefix.
; GCN-LABEL: {{^}}update_dppv2f32_test:
; GCN:     load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep
  %tmp0 = call <2 x float> @llvm.amdgcn.update.dpp.v2f32(<2 x float> %in1, <2 x float> %load, i32 1, i32 1, i32 1, i1 false) #0
  store <2 x float> %tmp0, ptr addrspace(1) %gep
  ret void
}

; Address-space-0 pointer operands: the pointer is loaded with a 64-bit load
; and optimized builds are expected to emit one 32-bit DPP mov per loaded half.
; The GCN-NOOPT lines are inert as written, because no RUN line enables that
; prefix.
; GCN-LABEL: {{^}}update_dpp_p0_test:
; GCN:     load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds ptr, ptr addrspace(1) %arg, i32 %id
  %load = load ptr, ptr addrspace(1) %gep
  %tmp0 = call ptr @llvm.amdgcn.update.dpp.p0(ptr %in1, ptr %load, i32 1, i32 1, i32 1, i1 false) #0
  store ptr %tmp0, ptr addrspace(1) %gep
  ret void
}

; Address-space-3 pointer operands: the pointer is loaded with a single
; 32-bit load/read and is expected to feed exactly one 32-bit DPP mov.
; GCN-LABEL: {{^}}update_dpp_p3_test:
; GCN: {{load|read}}_{{dword|b32}} v[[SRC:[0-9]+]]
; GCN: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspace(3) %in1, ptr %in2) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %arg, i32 %id
  %load = load ptr addrspace(3), ptr addrspace(3) %gep
  %tmp0 = call ptr addrspace(3) @llvm.amdgcn.update.dpp.p3(ptr addrspace(3) %in1, ptr addrspace(3) %load, i32 1, i32 1, i32 1, i1 false) #0
  store ptr addrspace(3) %tmp0, ptr addrspace(3) %gep
  ret void
}

; Address-space-5 pointer operands: the pointer is loaded with a single
; 32-bit load/read and is expected to feed exactly one 32-bit DPP mov.
; GCN-LABEL: {{^}}update_dpp_p5_test:
; GCN: {{load|read}}_{{dword|b32}} v[[SRC:[0-9]+]]
; GCN: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspace(5) %in1, ptr %in2) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds ptr addrspace(5), ptr addrspace(5) %arg, i32 %id
  %load = load ptr addrspace(5), ptr addrspace(5) %gep
  %tmp0 = call ptr addrspace(5) @llvm.amdgcn.update.dpp.p5(ptr addrspace(5) %in1, ptr addrspace(5) %load, i32 1, i32 1, i32 1, i1 false) #0
  store ptr addrspace(5) %tmp0, ptr addrspace(5) %gep
  ret void
}

; i64 with an immediate old value (123451234512345 = 0x7047_3afaedd9) and a
; loaded source. Optimized builds are expected to materialize each half of the
; immediate in a VGPR and then overwrite that VGPR with the DPP mov of the
; matching source half; the -O0 GFX8 build materializes the halves in SGPRs.
; The high-half checks use the GCN-OPT prefix (enabled by every optimized RUN
; line) because FileCheck does not accept a comma-separated prefix list in a
; directive: only the last prefix before the colon would be honoured.
; The GCN-NOOPT lines are inert as written, because no RUN line enables that
; prefix.
; GCN-LABEL: {{^}}update_dppi64_imm_old_test:
; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x3afaedd9
; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x3afaedd9
; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_HI:[0-9]+]], 0x7047
; GCN-DAG: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_LO]], v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @update_dppi64_imm_old_test(ptr addrspace(1) %arg, i64 %in2) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
  %load = load i64, ptr addrspace(1) %gep
  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 123451234512345, i64 %load, i32 1, i32 1, i32 1, i1 false) #0
  store i64 %tmp0, ptr addrspace(1) %gep
  ret void
}

; double with an immediate old value (123.451234512345, whose halves are
; checked as 0x405edce1 / 0x6b8564a) and a loaded source. Optimized builds are
; expected to materialize each half of the immediate in a VGPR and then
; overwrite that VGPR with the DPP mov of the matching source half; the -O0
; GFX8 build materializes the halves in SGPRs.
; The high-half checks use the GCN-OPT prefix (enabled by every optimized RUN
; line) because FileCheck does not accept a comma-separated prefix list in a
; directive: only the last prefix before the colon would be honoured.
; The GCN-NOOPT lines are inert as written, because no RUN line enables that
; prefix.
; GCN-LABEL: {{^}}update_dppf64_imm_old_test:
; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x6b8564a
; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x405edce1
; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x6b8564a
; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_HI:[0-9]+]], 0x405edce1
; GCN-DAG: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_LO]], v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @update_dppf64_imm_old_test(ptr addrspace(1) %arg, double %in2) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; Element type matches the loaded/stored double (same 8-byte stride as i64).
  %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id
  %load = load double, ptr addrspace(1) %gep
  %tmp0 = call double @llvm.amdgcn.update.dpp.f64(double 123.4512345123450, double %load, i32 1, i32 1, i32 1, i1 false) #0
  store double %tmp0, ptr addrspace(1) %gep
  ret void
}

; i64 with an immediate source value (123451234512345 = 0x7047_3afaedd9) and a
; kernel-argument old value. Optimized builds are expected to materialize each
; half of the immediate in a VGPR and use that VGPR as the DPP mov source; the
; -O0 GFX8 build materializes the halves in SGPRs.
; The GCN-NOOPT lines are inert as written (no RUN line enables that prefix);
; they match any source VGPR because this function captures no load result.
; GCN-LABEL: {{^}}update_dppi64_imm_src_test:
; GCN-OPT-DAG: v_mov_b32_e32 v[[IMM_LO:[0-9]+]], 0x3afaedd9
; GCN-OPT-DAG: v_mov_b32_e32 v[[IMM_HI:[0-9]+]], 0x7047
; GFX8-NOOPT-DAG: s_mov_b32 s[[SIMM_LO:[0-9]+]], 0x3afaedd9
; GFX8-NOOPT-DAG: s_mov_b32 s[[SIMM_HI:[0-9]+]], 0x7047
; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[IMM_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[IMM_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @update_dppi64_imm_src_test(ptr addrspace(1) %out, i64 %in1) {
  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 123451234512345, i32 1, i32 1, i32 1, i1 false) #0
  store i64 %tmp0, ptr addrspace(1) %out
  ret void
}

; double with an immediate source value (123.451234512345, whose halves are
; checked as 0x405edce1 / 0x6b8564a) and a kernel-argument old value. Optimized
; builds are expected to materialize each half of the immediate in a VGPR and
; use that VGPR as the DPP mov source; the -O0 GFX8 build materializes the
; halves in SGPRs.
; The GCN-NOOPT lines are inert as written (no RUN line enables that prefix);
; they match any source VGPR because this function captures no load result.
; GCN-LABEL: {{^}}update_dppf64_imm_src_test:
; GCN-OPT-DAG: v_mov_b32_e32 v[[IMM_LO:[0-9]+]], 0x6b8564a
; GCN-OPT-DAG: v_mov_b32_e32 v[[IMM_HI:[0-9]+]], 0x405edce1
; GFX8-NOOPT-DAG: s_mov_b32 s[[SIMM_LO:[0-9]+]], 0x6b8564a
; GFX8-NOOPT-DAG: s_mov_b32 s[[SIMM_HI:[0-9]+]], 0x405edce1
; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[IMM_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[IMM_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @update_dppf64_imm_src_test(ptr addrspace(1) %out, double %in1) {
  %tmp0 = call double @llvm.amdgcn.update.dpp.f64(double %in1, double 123.451234512345, i32 1, i32 1, i32 1, i1 false) #0
  store double %tmp0, ptr addrspace(1) %out
  ret void
}

; float old/src from kernel arguments with dpp_ctrl=1, row_mask=1, bank_mask=1
; and bound_ctrl off: same expected code as the i32 variant, a single
; v_mov_b32_dpp with no bound_ctrl suffix.
; GCN-LABEL: {{^}}dpp_test_f32:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @dpp_test_f32(ptr addrspace(1) %out, float %in1, float %in2) {
  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 1, i32 1, i32 1, i1 false)
  store float %tmp0, ptr addrspace(1) %out
  ret void
}

; float, immediates dpp_ctrl=0, row_mask=0, bank_mask=0, bound_ctrl off:
; expects quad_perm:[0,0,0,0] with zero masks and no bound_ctrl suffix.
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb1:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0{{$}}
define amdgpu_kernel void @dpp_test_f32_imm_comb1(ptr addrspace(1) %out, float %in1, float %in2) {
  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 0, i32 0, i32 0, i1 false)
  store float %tmp0, ptr addrspace(1) %out
  ret void
}

; float, immediates dpp_ctrl=3, row_mask=3, bank_mask=3, bound_ctrl off:
; expects quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 and no bound_ctrl.
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb2:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3{{$}}
define amdgpu_kernel void @dpp_test_f32_imm_comb2(ptr addrspace(1) %out, float %in1, float %in2) {
  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 3, i32 3, i32 3, i1 false)
  store float %tmp0, ptr addrspace(1) %out
  ret void
}

; float, immediates dpp_ctrl=1, row_mask=2, bank_mask=3, bound_ctrl on:
; expects quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1.
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb3:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_f32_imm_comb3(ptr addrspace(1) %out, float %in1, float %in2) {
  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 1, i32 2, i32 3, i1 true)
  store float %tmp0, ptr addrspace(1) %out
  ret void
}

; float, immediates dpp_ctrl=4, row_mask=3, bank_mask=2, bound_ctrl on:
; expects quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1.
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb4:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_f32_imm_comb4(ptr addrspace(1) %out, float %in1, float %in2) {
  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 4, i32 3, i32 2, i1 true)
  store float %tmp0, ptr addrspace(1) %out
  ret void
}

; float, immediates dpp_ctrl=63, row_mask=14, bank_mask=13, bound_ctrl on:
; expects quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1.
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb5:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_f32_imm_comb5(ptr addrspace(1) %out, float %in1, float %in2) {
  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 14, i32 13, i1 true)
  store float %tmp0, ptr addrspace(1) %out
  ret void
}

; float, immediates dpp_ctrl=63, row_mask=15, bank_mask=15, bound_ctrl on:
; expects quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1.
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb6:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_f32_imm_comb6(ptr addrspace(1) %out, float %in1, float %in2) {
  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 15, i32 15, i1 true)
  store float %tmp0, ptr addrspace(1) %out
  ret void
}


; float, immediates dpp_ctrl=64, row_mask=0, bank_mask=0, bound_ctrl on:
; expects quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1.
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb7:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_f32_imm_comb7(ptr addrspace(1) %out, float %in1, float %in2) {
  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 64, i32 0, i32 0, i1 true)
  store float %tmp0, ptr addrspace(1) %out
  ret void
}

; float, immediates dpp_ctrl=31, row_mask=15, bank_mask=0, bound_ctrl on:
; expects quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1.
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb8:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_f32_imm_comb8(ptr addrspace(1) %out, float %in1, float %in2) {
  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 31, i32 15, i32 0, i1 true)
  store float %tmp0, ptr addrspace(1) %out
  ret void
}

; <2 x i16> old/src from kernel arguments with dpp_ctrl=1, row_mask=1,
; bank_mask=1 and bound_ctrl off: expects a single v_mov_b32_dpp with no
; bound_ctrl suffix, as for the 32-bit scalar variants.
; GCN-LABEL: {{^}}dpp_test_v2i16:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @dpp_test_v2i16(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 1, i32 1, i32 1, i1 false)
  store <2 x i16> %tmp0, ptr addrspace(1) %out
  ret void
}

; <2 x i16>, immediates dpp_ctrl=0, row_mask=0, bank_mask=0, bound_ctrl off:
; expects quad_perm:[0,0,0,0] with zero masks and no bound_ctrl suffix.
; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb1:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0{{$}}
define amdgpu_kernel void @dpp_test_v2i16_imm_comb1(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 0, i32 0, i32 0, i1 false)
  store <2 x i16> %tmp0, ptr addrspace(1) %out
  ret void
}

; <2 x i16>, immediates dpp_ctrl=3, row_mask=3, bank_mask=3, bound_ctrl off:
; expects quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 and no bound_ctrl.
; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb2:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3{{$}}
define amdgpu_kernel void @dpp_test_v2i16_imm_comb2(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 3, i32 3, i32 3, i1 false)
  store <2 x i16> %tmp0, ptr addrspace(1) %out
  ret void
}

; <2 x i16>, immediates dpp_ctrl=1, row_mask=2, bank_mask=3, bound_ctrl on:
; expects quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1.
; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb3:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_v2i16_imm_comb3(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 1, i32 2, i32 3, i1 true)
  store <2 x i16> %tmp0, ptr addrspace(1) %out
  ret void
}

; <2 x i16>, immediates dpp_ctrl=4, row_mask=3, bank_mask=2, bound_ctrl on:
; expects quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1.
; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb4:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_v2i16_imm_comb4(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 4, i32 3, i32 2, i1 true)
  store <2 x i16> %tmp0, ptr addrspace(1) %out
  ret void
}

; <2 x i16>, immediates dpp_ctrl=63, row_mask=14, bank_mask=13, bound_ctrl on:
; expects quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1.
; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb5:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_v2i16_imm_comb5(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 14, i32 13, i1 true)
  store <2 x i16> %tmp0, ptr addrspace(1) %out
  ret void
}

; <2 x i16>, immediates dpp_ctrl=63, row_mask=15, bank_mask=15, bound_ctrl on:
; expects quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1.
; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb6:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_v2i16_imm_comb6(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 15, i32 15, i1 true)
  store <2 x i16> %tmp0, ptr addrspace(1) %out
  ret void
}

; <2 x i16>, immediates dpp_ctrl=64, row_mask=0, bank_mask=0, bound_ctrl on:
; expects quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1.
; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb7:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_v2i16_imm_comb7(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 64, i32 0, i32 0, i1 true)
  store <2 x i16> %tmp0, ptr addrspace(1) %out
  ret void
}

; <2 x i16>, immediates dpp_ctrl=31, row_mask=15, bank_mask=0, bound_ctrl on:
; expects quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1.
; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb8:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_v2i16_imm_comb8(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 31, i32 15, i32 0, i1 true)
  store <2 x i16> %tmp0, ptr addrspace(1) %out
  ret void
}

; <2 x half> old/src from kernel arguments with dpp_ctrl=1, row_mask=1,
; bank_mask=1 and bound_ctrl off: expects a single v_mov_b32_dpp with no
; bound_ctrl suffix, as for the 32-bit scalar variants.
; GCN-LABEL: {{^}}dpp_test_v2f16:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @dpp_test_v2f16(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 1, i32 1, i32 1, i1 false)
  store <2 x half> %tmp0, ptr addrspace(1) %out
  ret void
}

; <2 x half>, immediates dpp_ctrl=0, row_mask=0, bank_mask=0, bound_ctrl off:
; expects quad_perm:[0,0,0,0] with zero masks and no bound_ctrl suffix.
; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb1:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0{{$}}
define amdgpu_kernel void @dpp_test_v2f16_imm_comb1(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 0, i32 0, i32 0, i1 false)
  store <2 x half> %tmp0, ptr addrspace(1) %out
  ret void
}

; <2 x half>, immediates dpp_ctrl=3, row_mask=3, bank_mask=3, bound_ctrl off:
; expects quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 and no bound_ctrl.
; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb2:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3{{$}}
define amdgpu_kernel void @dpp_test_v2f16_imm_comb2(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 3, i32 3, i32 3, i1 false)
  store <2 x half> %tmp0, ptr addrspace(1) %out
  ret void
}

; <2 x half>, immediates dpp_ctrl=1, row_mask=2, bank_mask=3, bound_ctrl on:
; expects quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1.
; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb3:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_v2f16_imm_comb3(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 1, i32 2, i32 3, i1 true)
  store <2 x half> %tmp0, ptr addrspace(1) %out
  ret void
}

; <2 x half>, immediates dpp_ctrl=4, row_mask=3, bank_mask=2, bound_ctrl on:
; expects quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1.
; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb4:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_v2f16_imm_comb4(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 4, i32 3, i32 2, i1 true)
  store <2 x half> %tmp0, ptr addrspace(1) %out
  ret void
}

; <2 x half>, immediates dpp_ctrl=63, row_mask=14, bank_mask=13, bound_ctrl on:
; expects quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1.
; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb5:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_v2f16_imm_comb5(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 14, i32 13, i1 true)
  store <2 x half> %tmp0, ptr addrspace(1) %out
  ret void
}

; <2 x half>, immediates dpp_ctrl=63, row_mask=15, bank_mask=15, bound_ctrl on:
; expects quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1.
; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb6:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_v2f16_imm_comb6(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 15, i32 15, i1 true)
  store <2 x half> %tmp0, ptr addrspace(1) %out
  ret void
}

; <2 x half>, immediates dpp_ctrl=64, row_mask=0, bank_mask=0, bound_ctrl on:
; expects quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1.
; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb7:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_v2f16_imm_comb7(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 64, i32 0, i32 0, i1 true)
  store <2 x half> %tmp0, ptr addrspace(1) %out
  ret void
}

; <2 x half>, immediates dpp_ctrl=31, row_mask=15, bank_mask=0, bound_ctrl on:
; expects quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1.
; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb8:
; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_v2f16_imm_comb8(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 31, i32 15, i32 0, i1 true)
  store <2 x half> %tmp0, ptr addrspace(1) %out
  ret void
}

; Intrinsic declarations for every overload called in this file. The f64,
; v2i32, v2f32 and pointer overloads are declared explicitly so the file does
; not depend on the IR parser declaring called intrinsics implicitly.
declare i32 @llvm.amdgcn.workitem.id.x()
declare void @llvm.amdgcn.s.barrier()
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
declare <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16>, <2 x i16>, i32, i32, i32, i1) #0
declare <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half>, <2 x half>, i32, i32, i32, i1) #0
declare float @llvm.amdgcn.update.dpp.f32(float, float, i32, i32, i32, i1) #0
declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32, i32, i32, i1) #0
declare double @llvm.amdgcn.update.dpp.f64(double, double, i32, i32, i32, i1) #0
declare <2 x i32> @llvm.amdgcn.update.dpp.v2i32(<2 x i32>, <2 x i32>, i32, i32, i32, i1) #0
declare <2 x float> @llvm.amdgcn.update.dpp.v2f32(<2 x float>, <2 x float>, i32, i32, i32, i1) #0
declare ptr @llvm.amdgcn.update.dpp.p0(ptr, ptr, i32, i32, i32, i1) #0
declare ptr addrspace(3) @llvm.amdgcn.update.dpp.p3(ptr addrspace(3), ptr addrspace(3), i32, i32, i32, i1) #0
declare ptr addrspace(5) @llvm.amdgcn.update.dpp.p5(ptr addrspace(5), ptr addrspace(5), i32, i32, i32, i1) #0

; Attribute group shared by the update.dpp declarations and call sites.
attributes #0 = { nounwind readnone convergent }