xref: /llvm-project/llvm/test/CodeGen/AMDGPU/idot2.ll (revision 5a3299a684d7d8c40f48d732e5b80a8bd29aa882)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck --check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9-NODL %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefixes=GFX10-DL %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefixes=GFX10-DL %s
8
9; add(mul(S1.x, S2.x),
10;     add (mul (S1.y, S2.y), S3)) -> v_dot2_{I|U}32_{I|U}16(S1, S2, S3)
11
; Unsigned two-lane dot product with accumulate: both i16 lanes of each
; <2 x i16> operand are zero-extended, multiplied lane-wise, and summed with
; the i32 loaded from %dst. The gfx906 and gfx1011/gfx1012 check blocks expect
; a single v_dot2_u32_u16; the remaining targets expect mad or mul+add code.
12define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
13; GFX7-LABEL: udot2:
14; GFX7:       ; %bb.0: ; %entry
15; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
16; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
17; GFX7-NEXT:    s_mov_b32 s7, 0xf000
18; GFX7-NEXT:    s_mov_b32 s10, 0
19; GFX7-NEXT:    s_mov_b32 s11, s7
20; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
21; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
22; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
23; GFX7-NEXT:    v_mov_b32_e32 v1, 0
24; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
25; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
26; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
27; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
28; GFX7-NEXT:    s_mov_b32 s6, -1
29; GFX7-NEXT:    s_waitcnt vmcnt(1)
30; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
31; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
32; GFX7-NEXT:    s_waitcnt vmcnt(0)
33; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
34; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
35; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
36; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, s0
37; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
38; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
39; GFX7-NEXT:    s_endpgm
40;
41; GFX8-LABEL: udot2:
42; GFX8:       ; %bb.0: ; %entry
43; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
44; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
45; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
46; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
47; GFX8-NEXT:    v_mov_b32_e32 v1, s1
48; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
49; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
50; GFX8-NEXT:    flat_load_dword v3, v[0:1]
51; GFX8-NEXT:    v_mov_b32_e32 v1, s3
52; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
53; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
54; GFX8-NEXT:    flat_load_dword v0, v[0:1]
55; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
56; GFX8-NEXT:    s_waitcnt vmcnt(1)
57; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
58; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
59; GFX8-NEXT:    s_waitcnt vmcnt(0)
60; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
61; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
62; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
63; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s0
64; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
65; GFX8-NEXT:    v_mov_b32_e32 v0, s4
66; GFX8-NEXT:    v_mov_b32_e32 v1, s5
67; GFX8-NEXT:    flat_store_dword v[0:1], v2
68; GFX8-NEXT:    s_endpgm
69;
70; GFX9-NODL-LABEL: udot2:
71; GFX9-NODL:       ; %bb.0: ; %entry
72; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
73; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
74; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
75; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
76; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
77; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
78; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
79; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
80; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
81; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
82; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
83; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
84; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
85; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
86; GFX9-NODL-NEXT:    s_endpgm
87;
88; GFX9-DL-LABEL: udot2:
89; GFX9-DL:       ; %bb.0: ; %entry
90; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
91; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
92; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
93; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
94; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
95; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
96; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
97; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
98; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
99; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s0
100; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
101; GFX9-DL-NEXT:    s_endpgm
102;
103; GFX10-DL-LABEL: udot2:
104; GFX10-DL:       ; %bb.0: ; %entry
105; GFX10-DL-NEXT:    s_clause 0x1
106; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
107; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
108; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
109; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
110; GFX10-DL-NEXT:    s_clause 0x1
111; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
112; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
113; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
114; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
115; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
116; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
117; GFX10-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s0
118; GFX10-DL-NEXT:    global_store_dword v0, v1, s[6:7]
119; GFX10-DL-NEXT:    s_endpgm
120                                 ptr addrspace(1) %src2,
121                                 ptr addrspace(1) nocapture %dst) {
122entry:
  ; Each work-item loads the <2 x i16> at its own index from %src1 and %src2.
123  %idx = call i32 @llvm.amdgcn.workitem.id.x()
124  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
125  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
126  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
127  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
128
  ; Element 0 of both vectors: zero-extend to i32 and multiply.
129  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
130  %conv = zext i16 %s1.elt1 to i32
131  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
132  %conv2 = zext i16 %s2.elt1 to i32
133  %mul1 = mul nuw i32 %conv2, %conv
134
  ; Element 1 of both vectors: zero-extend to i32 and multiply.
135  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
136  %conv3 = zext i16 %s1.elt2 to i32
137  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
138  %conv4 = zext i16 %s2.elt2 to i32
139  %mul2 = mul nuw i32 %conv4, %conv3
140
  ; Accumulate as (mul2 + s3) + mul1 and write the sum back to %dst.
141  %s3 = load i32, ptr addrspace(1) %dst, align 4
142  %add = add i32 %mul2, %s3
143  %add6 = add i32 %add, %mul1
144  store i32 %add6, ptr addrspace(1) %dst, align 4
145  ret void
146}
147
148; TODO: Support this pattern
149;      add(S3,
150;          add (mul (S1.y, S2.y), mul (S1.x, S2.x))) -> v_dot2_{I|U}32_{I|U}16(S1, S2, S3)
; Same zero-extended lane products as @udot2, but here the two products are
; added to each other first and the accumulator loaded from %dst is added
; last. None of the check blocks below expects a v_dot2 instruction for this
; shape; every target uses separate multiply/add instructions.
151define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
152; GFX7-LABEL: udot2_MulMul:
153; GFX7:       ; %bb.0: ; %entry
154; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
155; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
156; GFX7-NEXT:    s_mov_b32 s7, 0xf000
157; GFX7-NEXT:    s_mov_b32 s10, 0
158; GFX7-NEXT:    s_mov_b32 s11, s7
159; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
160; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
161; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
162; GFX7-NEXT:    v_mov_b32_e32 v1, 0
163; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
164; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
165; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
166; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
167; GFX7-NEXT:    s_mov_b32 s6, -1
168; GFX7-NEXT:    s_waitcnt vmcnt(1)
169; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
170; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
171; GFX7-NEXT:    s_waitcnt vmcnt(0)
172; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
173; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
174; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
175; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v1, v0
176; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
177; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
178; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
179; GFX7-NEXT:    s_endpgm
180;
181; GFX8-LABEL: udot2_MulMul:
182; GFX8:       ; %bb.0: ; %entry
183; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
184; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
185; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
186; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
187; GFX8-NEXT:    v_mov_b32_e32 v1, s1
188; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
189; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
190; GFX8-NEXT:    flat_load_dword v3, v[0:1]
191; GFX8-NEXT:    v_mov_b32_e32 v1, s3
192; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
193; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
194; GFX8-NEXT:    flat_load_dword v0, v[0:1]
195; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
196; GFX8-NEXT:    s_waitcnt vmcnt(1)
197; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
198; GFX8-NEXT:    s_waitcnt vmcnt(0)
199; GFX8-NEXT:    v_mul_u32_u24_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
200; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
201; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
202; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
203; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
204; GFX8-NEXT:    v_mov_b32_e32 v0, s4
205; GFX8-NEXT:    v_mov_b32_e32 v1, s5
206; GFX8-NEXT:    flat_store_dword v[0:1], v2
207; GFX8-NEXT:    s_endpgm
208;
209; GFX9-NODL-LABEL: udot2_MulMul:
210; GFX9-NODL:       ; %bb.0: ; %entry
211; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
212; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
213; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
214; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
215; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
216; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
217; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
218; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
219; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
220; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
221; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
222; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
223; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, s0
224; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
225; GFX9-NODL-NEXT:    s_endpgm
226;
227; GFX9-DL-LABEL: udot2_MulMul:
228; GFX9-DL:       ; %bb.0: ; %entry
229; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
230; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
231; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
232; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
233; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
234; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
235; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
236; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
237; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
238; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
239; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
240; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
241; GFX9-DL-NEXT:    v_add3_u32 v1, v1, v3, s0
242; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
243; GFX9-DL-NEXT:    s_endpgm
244;
245; GFX10-DL-LABEL: udot2_MulMul:
246; GFX10-DL:       ; %bb.0: ; %entry
247; GFX10-DL-NEXT:    s_clause 0x1
248; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
249; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
250; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
251; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
252; GFX10-DL-NEXT:    s_clause 0x1
253; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
254; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
255; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
256; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
257; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
258; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
259; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
260; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
261; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
262; GFX10-DL-NEXT:    v_add3_u32 v0, v1, v0, s0
263; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
264; GFX10-DL-NEXT:    s_endpgm
265                                        ptr addrspace(1) %src2,
266                                        ptr addrspace(1) nocapture %dst) {
267entry:
  ; Each work-item loads the <2 x i16> at its own index from %src1 and %src2.
268  %idx = call i32 @llvm.amdgcn.workitem.id.x()
269  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
270  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
271  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
272  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
273
  ; Element 0 of both vectors: zero-extend to i32 and multiply.
274  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
275  %conv = zext i16 %s1.elt1 to i32
276  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
277  %conv2 = zext i16 %s2.elt1 to i32
278  %mul1 = mul nuw i32 %conv2, %conv
279
  ; Element 1 of both vectors: zero-extend to i32 and multiply.
280  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
281  %conv3 = zext i16 %s1.elt2 to i32
282  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
283  %conv4 = zext i16 %s2.elt2 to i32
284  %mul2 = mul nuw i32 %conv4, %conv3
  ; Accumulate as (mul2 + mul1) + s3: the products are combined before the
  ; accumulator is added, which is what distinguishes this test from @udot2.
285  %s3 = load i32, ptr addrspace(1) %dst, align 4
286  %add = add i32 %mul2, %mul1
287  %add6 = add i32 %add, %s3
288  store i32 %add6, ptr addrspace(1) %dst, align 4
289  ret void
290}
291
; Signed two-lane dot product with accumulate: both i16 lanes of each
; <2 x i16> operand are sign-extended, multiplied lane-wise, and summed with
; the i32 loaded from %dst. The gfx906 and gfx1011/gfx1012 check blocks expect
; a single v_dot2_i32_i16; the remaining targets expect mad or mul+add code.
292define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
293; GFX7-LABEL: idot2:
294; GFX7:       ; %bb.0: ; %entry
295; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
296; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
297; GFX7-NEXT:    s_mov_b32 s7, 0xf000
298; GFX7-NEXT:    s_mov_b32 s10, 0
299; GFX7-NEXT:    s_mov_b32 s11, s7
300; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
301; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
302; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
303; GFX7-NEXT:    v_mov_b32_e32 v1, 0
304; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
305; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
306; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
307; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
308; GFX7-NEXT:    s_mov_b32 s6, -1
309; GFX7-NEXT:    s_waitcnt vmcnt(1)
310; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 16
311; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
312; GFX7-NEXT:    s_waitcnt vmcnt(0)
313; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 16
314; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
315; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
316; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, s0
317; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
318; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
319; GFX7-NEXT:    s_endpgm
320;
321; GFX8-LABEL: idot2:
322; GFX8:       ; %bb.0: ; %entry
323; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
324; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
325; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
326; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
327; GFX8-NEXT:    v_mov_b32_e32 v1, s1
328; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
329; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
330; GFX8-NEXT:    flat_load_dword v3, v[0:1]
331; GFX8-NEXT:    v_mov_b32_e32 v1, s3
332; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
333; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
334; GFX8-NEXT:    flat_load_dword v0, v[0:1]
335; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
336; GFX8-NEXT:    s_waitcnt vmcnt(1)
337; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
338; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
339; GFX8-NEXT:    s_waitcnt vmcnt(0)
340; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 16
341; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
342; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
343; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, s0
344; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
345; GFX8-NEXT:    v_mov_b32_e32 v0, s4
346; GFX8-NEXT:    v_mov_b32_e32 v1, s5
347; GFX8-NEXT:    flat_store_dword v[0:1], v2
348; GFX8-NEXT:    s_endpgm
349;
350; GFX9-NODL-LABEL: idot2:
351; GFX9-NODL:       ; %bb.0: ; %entry
352; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
353; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
354; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
355; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
356; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
357; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
358; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
359; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
360; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
361; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
362; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
363; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
364; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
365; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
366; GFX9-NODL-NEXT:    s_endpgm
367;
368; GFX9-DL-LABEL: idot2:
369; GFX9-DL:       ; %bb.0: ; %entry
370; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
371; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
372; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
373; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
374; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
375; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
376; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
377; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
378; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
379; GFX9-DL-NEXT:    v_dot2_i32_i16 v1, v2, v1, s0
380; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
381; GFX9-DL-NEXT:    s_endpgm
382;
383; GFX10-DL-LABEL: idot2:
384; GFX10-DL:       ; %bb.0: ; %entry
385; GFX10-DL-NEXT:    s_clause 0x1
386; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
387; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
388; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
389; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
390; GFX10-DL-NEXT:    s_clause 0x1
391; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
392; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
393; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
394; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
395; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
396; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
397; GFX10-DL-NEXT:    v_dot2_i32_i16 v1, v2, v1, s0
398; GFX10-DL-NEXT:    global_store_dword v0, v1, s[6:7]
399; GFX10-DL-NEXT:    s_endpgm
400                                 ptr addrspace(1) %src2,
401                                 ptr addrspace(1) nocapture %dst) {
402entry:
  ; Each work-item loads the <2 x i16> at its own index from %src1 and %src2.
403  %idx = call i32 @llvm.amdgcn.workitem.id.x()
404  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
405  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
406  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
407  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
408
  ; Element 0 of both vectors: sign-extend to i32 and multiply.
  ; NOTE(review): the multiply carries nuw although its operands are
  ; sign-extended - confirm this flag is intentional for the test.
409  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
410  %conv = sext i16 %s1.elt1 to i32
411  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
412  %conv2 = sext i16 %s2.elt1 to i32
413  %mul1 = mul nuw i32 %conv2, %conv
414
  ; Element 1 of both vectors: sign-extend to i32 and multiply.
415  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
416  %conv3 = sext i16 %s1.elt2 to i32
417  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
418  %conv4 = sext i16 %s2.elt2 to i32
419  %mul2 = mul nuw i32 %conv4, %conv3
420
  ; Accumulate as (mul2 + s3) + mul1 and write the sum back to %dst.
421  %s3 = load i32, ptr addrspace(1) %dst, align 4
422  %add = add i32 %mul2, %s3
423  %add6 = add i32 %add, %mul1
424  store i32 %add6, ptr addrspace(1) %dst, align 4
425  ret void
426}
427
; Mixed-signedness variant: the element-0 product uses sign-extended operands
; while the element-1 product uses zero-extended operands. None of the check
; blocks below expects a v_dot2 instruction; every target emits one signed and
; one unsigned multiply (or mad) instead.
428define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
429; GFX7-LABEL: idot2_MixedTypedMul:
430; GFX7:       ; %bb.0: ; %entry
431; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
432; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
433; GFX7-NEXT:    s_mov_b32 s7, 0xf000
434; GFX7-NEXT:    s_mov_b32 s10, 0
435; GFX7-NEXT:    s_mov_b32 s11, s7
436; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
437; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
438; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
439; GFX7-NEXT:    v_mov_b32_e32 v1, 0
440; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
441; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
442; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
443; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
444; GFX7-NEXT:    s_mov_b32 s6, -1
445; GFX7-NEXT:    s_waitcnt vmcnt(1)
446; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
447; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 16
448; GFX7-NEXT:    s_waitcnt vmcnt(0)
449; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
450; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
451; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
452; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, s0
453; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, v1
454; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
455; GFX7-NEXT:    s_endpgm
456;
457; GFX8-LABEL: idot2_MixedTypedMul:
458; GFX8:       ; %bb.0: ; %entry
459; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
460; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
461; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
462; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
463; GFX8-NEXT:    v_mov_b32_e32 v1, s1
464; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
465; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
466; GFX8-NEXT:    flat_load_dword v3, v[0:1]
467; GFX8-NEXT:    v_mov_b32_e32 v1, s3
468; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
469; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
470; GFX8-NEXT:    flat_load_dword v0, v[0:1]
471; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
472; GFX8-NEXT:    s_waitcnt vmcnt(1)
473; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
474; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
475; GFX8-NEXT:    s_waitcnt vmcnt(0)
476; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 16
477; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
478; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
479; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s0
480; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
481; GFX8-NEXT:    v_mov_b32_e32 v0, s4
482; GFX8-NEXT:    v_mov_b32_e32 v1, s5
483; GFX8-NEXT:    flat_store_dword v[0:1], v2
484; GFX8-NEXT:    s_endpgm
485;
486; GFX9-NODL-LABEL: idot2_MixedTypedMul:
487; GFX9-NODL:       ; %bb.0: ; %entry
488; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
489; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
490; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
491; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
492; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
493; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
494; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
495; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
496; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
497; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
498; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
499; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
500; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
501; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
502; GFX9-NODL-NEXT:    s_endpgm
503;
504; GFX9-DL-LABEL: idot2_MixedTypedMul:
505; GFX9-DL:       ; %bb.0: ; %entry
506; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
507; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
508; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
509; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
510; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
511; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
512; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
513; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
514; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
515; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
516; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
517; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
518; GFX9-DL-NEXT:    v_add3_u32 v1, v1, s0, v3
519; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
520; GFX9-DL-NEXT:    s_endpgm
521;
522; GFX10-DL-LABEL: idot2_MixedTypedMul:
523; GFX10-DL:       ; %bb.0: ; %entry
524; GFX10-DL-NEXT:    s_clause 0x1
525; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
526; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
527; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
528; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
529; GFX10-DL-NEXT:    s_clause 0x1
530; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
531; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
532; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
533; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
534; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
535; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v0, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
536; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
537; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
538; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
539; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s0, v0
540; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
541; GFX10-DL-NEXT:    s_endpgm
542                                               ptr addrspace(1) %src2,
543                                               ptr addrspace(1) nocapture %dst) {
544entry:
  ; Each work-item loads the <2 x i16> at its own index from %src1 and %src2.
545  %idx = call i32 @llvm.amdgcn.workitem.id.x()
546  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
547  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
548  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
549  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
550
  ; Element 0 of both vectors: sign-extend to i32 and multiply.
551  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
552  %conv = sext i16 %s1.elt1 to i32
553  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
554  %conv2 = sext i16 %s2.elt1 to i32
555  %mul1 = mul nuw i32 %conv2, %conv
556
  ; Element 1 of both vectors: zero-extend to i32 and multiply, so the two
  ; products disagree on signedness.
557  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
558  %conv3 = zext i16 %s1.elt2 to i32
559  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
560  %conv4 = zext i16 %s2.elt2 to i32
561  %mul2 = mul nuw i32 %conv4, %conv3
562
  ; Accumulate as (mul2 + s3) + mul1 and write the sum back to %dst.
563  %s3 = load i32, ptr addrspace(1) %dst, align 4
564  %add = add i32 %mul2, %s3
565  %add6 = add i32 %add, %mul1
566  store i32 %add6, ptr addrspace(1) %dst, align 4
567  ret void
568}
569
; Same zero-extended lane products and accumulator as @udot2, but with the
; operands of both adds written in the opposite order (s3 + mul2, then
; mul1 + add). The gfx906 and gfx1011/gfx1012 check blocks still expect a
; single v_dot2_u32_u16; the remaining targets expect v_mad_u32_u24 pairs.
570define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
571; GFX7-LABEL: udot2_alt_AddOperands:
572; GFX7:       ; %bb.0: ; %entry
573; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
574; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
575; GFX7-NEXT:    s_mov_b32 s7, 0xf000
576; GFX7-NEXT:    s_mov_b32 s10, 0
577; GFX7-NEXT:    s_mov_b32 s11, s7
578; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
579; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
580; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
581; GFX7-NEXT:    v_mov_b32_e32 v1, 0
582; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
583; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
584; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
585; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
586; GFX7-NEXT:    s_mov_b32 s6, -1
587; GFX7-NEXT:    s_waitcnt vmcnt(1)
588; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
589; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
590; GFX7-NEXT:    s_waitcnt vmcnt(0)
591; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
592; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
593; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
594; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, s0
595; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
596; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
597; GFX7-NEXT:    s_endpgm
598;
599; GFX8-LABEL: udot2_alt_AddOperands:
600; GFX8:       ; %bb.0: ; %entry
601; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
602; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
603; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
604; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
605; GFX8-NEXT:    v_mov_b32_e32 v1, s1
606; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
607; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
608; GFX8-NEXT:    flat_load_dword v3, v[0:1]
609; GFX8-NEXT:    v_mov_b32_e32 v1, s3
610; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
611; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
612; GFX8-NEXT:    flat_load_dword v0, v[0:1]
613; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
614; GFX8-NEXT:    s_waitcnt vmcnt(1)
615; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
616; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
617; GFX8-NEXT:    s_waitcnt vmcnt(0)
618; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
619; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
620; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
621; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s0
622; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
623; GFX8-NEXT:    v_mov_b32_e32 v0, s4
624; GFX8-NEXT:    v_mov_b32_e32 v1, s5
625; GFX8-NEXT:    flat_store_dword v[0:1], v2
626; GFX8-NEXT:    s_endpgm
627;
628; GFX9-NODL-LABEL: udot2_alt_AddOperands:
629; GFX9-NODL:       ; %bb.0: ; %entry
630; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
631; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
632; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
633; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
634; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
635; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
636; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
637; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
638; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
639; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
640; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
641; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xffff, v2
642; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
643; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
644; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
645; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
646; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v4, v3, v1
647; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
648; GFX9-NODL-NEXT:    s_endpgm
649;
650; GFX9-DL-LABEL: udot2_alt_AddOperands:
651; GFX9-DL:       ; %bb.0: ; %entry
652; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
653; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
654; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
655; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
656; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
657; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
658; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
659; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
660; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
661; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s0
662; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
663; GFX9-DL-NEXT:    s_endpgm
664;
665; GFX10-DL-LABEL: udot2_alt_AddOperands:
666; GFX10-DL:       ; %bb.0: ; %entry
667; GFX10-DL-NEXT:    s_clause 0x1
668; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
669; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
670; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
671; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
672; GFX10-DL-NEXT:    s_clause 0x1
673; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
674; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
675; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
676; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
677; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
678; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
679; GFX10-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s0
680; GFX10-DL-NEXT:    global_store_dword v0, v1, s[6:7]
681; GFX10-DL-NEXT:    s_endpgm
682                                                 ptr addrspace(1) %src2,
683                                                 ptr addrspace(1) nocapture %dst) {
684entry:
  ; Each work-item loads the <2 x i16> at its own index from %src1 and %src2.
685  %idx = call i32 @llvm.amdgcn.workitem.id.x()
686  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
687  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
688  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
689  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
690
  ; Element 0 of both vectors: zero-extend to i32 and multiply.
691  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
692  %conv = zext i16 %s1.elt1 to i32
693  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
694  %conv2 = zext i16 %s2.elt1 to i32
695  %mul1 = mul nuw i32 %conv2, %conv
696
  ; Element 1 of both vectors: zero-extend to i32 and multiply.
697  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
698  %conv3 = zext i16 %s1.elt2 to i32
699  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
700  %conv4 = zext i16 %s2.elt2 to i32
701  %mul2 = mul nuw i32 %conv4, %conv3
702
  ; Accumulate as mul1 + (s3 + mul2): same sum as @udot2 with each add's
  ; operands swapped, then write the result back to %dst.
703  %s3 = load i32, ptr addrspace(1) %dst, align 4
704  %add = add i32 %s3, %mul2
705  %add6 = add i32 %mul1, %add
706  store i32 %add6, ptr addrspace(1) %dst, align 4
707  ret void
708}
709
; Mixed-extension variant of the dot2 pattern: the element-0 product
; multiplies a sign-extended lane of %vec1 by a zero-extended lane of %vec2,
; while the element-1 product sign-extends both lanes. None of the expected
; outputs below contains a v_dot2 instruction; each one uses two separate
; multiplies (or multiply-adds) followed by an add.
710define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
711; GFX7-LABEL: idot2_MixedExt:
712; GFX7:       ; %bb.0: ; %entry
713; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
714; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
715; GFX7-NEXT:    s_mov_b32 s7, 0xf000
716; GFX7-NEXT:    s_mov_b32 s10, 0
717; GFX7-NEXT:    s_mov_b32 s11, s7
718; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
719; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
720; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
721; GFX7-NEXT:    v_mov_b32_e32 v1, 0
722; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
723; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
724; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
725; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
726; GFX7-NEXT:    s_mov_b32 s6, -1
727; GFX7-NEXT:    s_waitcnt vmcnt(1)
728; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 16
729; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
730; GFX7-NEXT:    s_waitcnt vmcnt(0)
731; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v0
732; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
733; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
734; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, s0
735; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
736; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
737; GFX7-NEXT:    s_endpgm
738;
739; GFX8-LABEL: idot2_MixedExt:
740; GFX8:       ; %bb.0: ; %entry
741; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
742; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
743; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
744; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
745; GFX8-NEXT:    v_mov_b32_e32 v1, s1
746; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
747; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
748; GFX8-NEXT:    flat_load_dword v3, v[0:1]
749; GFX8-NEXT:    v_mov_b32_e32 v1, s3
750; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
751; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
752; GFX8-NEXT:    flat_load_dword v0, v[0:1]
753; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
754; GFX8-NEXT:    s_waitcnt vmcnt(1)
755; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
756; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
757; GFX8-NEXT:    s_waitcnt vmcnt(0)
758; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
759; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
760; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
761; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, s0
762; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
763; GFX8-NEXT:    v_mov_b32_e32 v0, s4
764; GFX8-NEXT:    v_mov_b32_e32 v1, s5
765; GFX8-NEXT:    flat_store_dword v[0:1], v2
766; GFX8-NEXT:    s_endpgm
767;
768; GFX9-NODL-LABEL: idot2_MixedExt:
769; GFX9-NODL:       ; %bb.0: ; %entry
770; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
771; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
772; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
773; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
774; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
775; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
776; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
777; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
778; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
779; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
780; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
781; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
782; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
783; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
784; GFX9-NODL-NEXT:    s_endpgm
785;
786; GFX9-DL-LABEL: idot2_MixedExt:
787; GFX9-DL:       ; %bb.0: ; %entry
788; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
789; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
790; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
791; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
792; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
793; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
794; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
795; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
796; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
797; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
798; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
799; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
800; GFX9-DL-NEXT:    v_add3_u32 v1, v1, s0, v3
801; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
802; GFX9-DL-NEXT:    s_endpgm
803;
804; GFX10-DL-LABEL: idot2_MixedExt:
805; GFX10-DL:       ; %bb.0: ; %entry
806; GFX10-DL-NEXT:    s_clause 0x1
807; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
808; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
809; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
810; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
811; GFX10-DL-NEXT:    s_clause 0x1
812; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
813; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
814; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
815; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
816; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
817; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v0, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
818; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
819; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
820; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
821; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s0, v0
822; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
823; GFX10-DL-NEXT:    s_endpgm
824                                          ptr addrspace(1) %src2,
825                                          ptr addrspace(1) nocapture %dst) {
826entry:
  ; Each work-item loads the <2 x i16> at index workitem.id.x from both sources.
827  %idx = call i32 @llvm.amdgcn.workitem.id.x()
828  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
829  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
830  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
831  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
832
  ; Element 0: sext(%vec1[0]) * zext(%vec2[0]) -- the extensions differ.
833  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
834  %conv = sext i16 %s1.elt1 to i32
835  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
836  %conv2 = zext i16 %s2.elt1 to i32
837  %mul1 = mul nuw i32 %conv2, %conv
838
  ; Element 1: sext(%vec1[1]) * sext(%vec2[1]).
839  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
840  %conv3 = sext i16 %s1.elt2 to i32
841  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
842  %conv4 = sext i16 %s2.elt2 to i32
843  %mul2 = mul nuw i32 %conv4, %conv3
844
  ; Add both products to the i32 read from %dst and store the sum back.
845  %s3 = load i32, ptr addrspace(1) %dst, align 4
846  %add = add i32 %mul2, %s3
847  %add6 = add i32 %add, %mul1
848  store i32 %add6, ptr addrspace(1) %dst, align 4
849  ret void
850}
851
; Same-vector variant: each product multiplies a lane by itself instead of
; pairing a lane of %vec1 with a lane of %vec2 (mul1 uses %vec1[0] twice,
; mul2 uses %vec2[1] twice). None of the expected outputs below contains a
; v_dot2 instruction; they use separate multiplies (or multiply-adds) and an
; add.
852define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1,
853; GFX7-LABEL: notudot2_SameVec:
854; GFX7:       ; %bb.0: ; %entry
855; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
856; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
857; GFX7-NEXT:    s_mov_b32 s7, 0xf000
858; GFX7-NEXT:    s_mov_b32 s10, 0
859; GFX7-NEXT:    s_mov_b32 s11, s7
860; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
861; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
862; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
863; GFX7-NEXT:    v_mov_b32_e32 v1, 0
864; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
865; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
866; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
867; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
868; GFX7-NEXT:    s_mov_b32 s6, -1
869; GFX7-NEXT:    s_waitcnt vmcnt(1)
870; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
871; GFX7-NEXT:    s_waitcnt vmcnt(0)
872; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
873; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
874; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v0, s0
875; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v1, v0
876; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
877; GFX7-NEXT:    s_endpgm
878;
879; GFX8-LABEL: notudot2_SameVec:
880; GFX8:       ; %bb.0: ; %entry
881; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
882; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
883; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
884; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
885; GFX8-NEXT:    v_mov_b32_e32 v1, s1
886; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
887; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
888; GFX8-NEXT:    flat_load_dword v3, v[0:1]
889; GFX8-NEXT:    v_mov_b32_e32 v1, s3
890; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
891; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
892; GFX8-NEXT:    flat_load_dword v0, v[0:1]
893; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
894; GFX8-NEXT:    s_waitcnt vmcnt(1)
895; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
896; GFX8-NEXT:    s_waitcnt vmcnt(0)
897; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
898; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
899; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v0, s0
900; GFX8-NEXT:    v_mad_u32_u24 v2, v1, v1, v0
901; GFX8-NEXT:    v_mov_b32_e32 v0, s4
902; GFX8-NEXT:    v_mov_b32_e32 v1, s5
903; GFX8-NEXT:    flat_store_dword v[0:1], v2
904; GFX8-NEXT:    s_endpgm
905;
906; GFX9-NODL-LABEL: notudot2_SameVec:
907; GFX9-NODL:       ; %bb.0: ; %entry
908; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
909; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
910; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
911; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
912; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
913; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
914; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
915; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
916; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
917; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
918; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
919; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
920; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
921; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, s0, v1
922; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
923; GFX9-NODL-NEXT:    s_endpgm
924;
925; GFX9-DL-LABEL: notudot2_SameVec:
926; GFX9-DL:       ; %bb.0: ; %entry
927; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
928; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
929; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
930; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
931; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
932; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
933; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
934; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
935; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
936; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
937; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
938; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
939; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
940; GFX9-DL-NEXT:    v_add3_u32 v1, v2, s0, v1
941; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
942; GFX9-DL-NEXT:    s_endpgm
943;
944; GFX10-DL-LABEL: notudot2_SameVec:
945; GFX10-DL:       ; %bb.0: ; %entry
946; GFX10-DL-NEXT:    s_clause 0x1
947; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
948; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
949; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
950; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
951; GFX10-DL-NEXT:    s_clause 0x1
952; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
953; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
954; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
955; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
956; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
957; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
958; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
959; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
960; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
961; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
962; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s0, v0
963; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
964; GFX10-DL-NEXT:    s_endpgm
965                                            ptr addrspace(1) %src2,
966                                            ptr addrspace(1) nocapture %dst) {
967entry:
  ; Each work-item loads the <2 x i16> at index workitem.id.x from both sources.
968  %idx = call i32 @llvm.amdgcn.workitem.id.x()
969  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
970  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
971  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
972  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
973
  ; First product: both factors are zext(%vec1[0]); %s2.elt1 also reads %vec1.
974  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
975  %conv = zext i16 %s1.elt1 to i32
976  %s2.elt1 = extractelement <2 x i16> %vec1, i64 0
977  %conv2 = zext i16 %s2.elt1 to i32
978  %mul1 = mul i32 %conv2, %conv
979
  ; Second product: both factors are zext(%vec2[1]); %s1.elt2 also reads %vec2.
980  %s1.elt2 = extractelement <2 x i16> %vec2, i64 1
981  %conv3 = zext i16 %s1.elt2 to i32
982  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
983  %conv4 = zext i16 %s2.elt2 to i32
984  %mul2 = mul i32 %conv4, %conv3
985
  ; Add both products to the i32 read from %dst and store the sum back.
986  %s3 = load i32, ptr addrspace(1) %dst, align 4
987  %add = add i32 %mul2, %s3
988  %add6 = add i32 %add, %mul1
989  store i32 %add6, ptr addrspace(1) %dst, align 4
990  ret void
991}
992
; Unsigned dot2 pattern taken from lanes 0 and 1 of <4 x i16> vectors (all
; lanes zero-extended). The GFX9-DL and GFX10-DL expectations below contain
; v_dot2_u32_u16; the other expectations use multiply-adds or multiplies plus
; an add. Every expected output loads a single dword from each source, at a
; byte offset of workitem.id.x shifted left by 3.
993define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
994; GFX7-LABEL: udot2_v4i16:
995; GFX7:       ; %bb.0: ; %entry
996; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
997; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
998; GFX7-NEXT:    s_mov_b32 s7, 0xf000
999; GFX7-NEXT:    s_mov_b32 s10, 0
1000; GFX7-NEXT:    s_mov_b32 s11, s7
1001; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1002; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
1003; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1004; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1005; GFX7-NEXT:    s_mov_b64 s[0:1], s[2:3]
1006; GFX7-NEXT:    s_mov_b64 s[2:3], s[10:11]
1007; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1008; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
1009; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
1010; GFX7-NEXT:    s_mov_b32 s6, -1
1011; GFX7-NEXT:    s_waitcnt vmcnt(1)
1012; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
1013; GFX7-NEXT:    s_waitcnt vmcnt(0)
1014; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v0
1015; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1016; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1017; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1018; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, s0
1019; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v1, v0
1020; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1021; GFX7-NEXT:    s_endpgm
1022;
1023; GFX8-LABEL: udot2_v4i16:
1024; GFX8:       ; %bb.0: ; %entry
1025; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1026; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1027; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1028; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1029; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1030; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1031; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1032; GFX8-NEXT:    v_mov_b32_e32 v3, s3
1033; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
1034; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1035; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1036; GFX8-NEXT:    flat_load_dword v1, v[2:3]
1037; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
1038; GFX8-NEXT:    s_waitcnt vmcnt(1)
1039; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
1040; GFX8-NEXT:    s_waitcnt vmcnt(0)
1041; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff, v1
1042; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1043; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1044; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1045; GFX8-NEXT:    v_mad_u32_u24 v0, v1, v0, s0
1046; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v2, v0
1047; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1048; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1049; GFX8-NEXT:    flat_store_dword v[0:1], v2
1050; GFX8-NEXT:    s_endpgm
1051;
1052; GFX9-NODL-LABEL: udot2_v4i16:
1053; GFX9-NODL:       ; %bb.0: ; %entry
1054; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1055; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1056; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1057; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1058; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
1059; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
1060; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
1061; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1062; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1063; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1064; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1065; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1066; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
1067; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
1068; GFX9-NODL-NEXT:    s_endpgm
1069;
1070; GFX9-DL-LABEL: udot2_v4i16:
1071; GFX9-DL:       ; %bb.0: ; %entry
1072; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1073; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1074; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1075; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1076; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1077; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1078; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1079; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1080; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1081; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s0
1082; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
1083; GFX9-DL-NEXT:    s_endpgm
1084;
1085; GFX10-DL-LABEL: udot2_v4i16:
1086; GFX10-DL:       ; %bb.0: ; %entry
1087; GFX10-DL-NEXT:    s_clause 0x1
1088; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1089; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1090; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1091; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1092; GFX10-DL-NEXT:    s_clause 0x1
1093; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1094; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1095; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
1096; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1097; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1098; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1099; GFX10-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s0
1100; GFX10-DL-NEXT:    global_store_dword v0, v1, s[6:7]
1101; GFX10-DL-NEXT:    s_endpgm
1102                                       ptr addrspace(1) %src2,
1103                                       ptr addrspace(1) nocapture %dst) {
1104entry:
  ; Each work-item loads the <4 x i16> at index workitem.id.x from both sources.
1105  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1106  %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %src1, i32 %idx
1107  %vec1 = load <4 x i16>, ptr addrspace(1) %gep1
1108  %gep2 = getelementptr <4 x i16>, ptr addrspace(1) %src2, i32 %idx
1109  %vec2 = load <4 x i16>, ptr addrspace(1) %gep2
1110
  ; Lane 0: zext(%vec1[0]) * zext(%vec2[0]).
1111  %s1.elt1 = extractelement <4 x i16> %vec1, i64 0
1112  %conv = zext i16 %s1.elt1 to i32
1113  %s2.elt1 = extractelement <4 x i16> %vec2, i64 0
1114  %conv2 = zext i16 %s2.elt1 to i32
1115  %mul1 = mul i32 %conv2, %conv
1116
  ; Lane 1: zext(%vec1[1]) * zext(%vec2[1]).
1117  %s1.elt2 = extractelement <4 x i16> %vec1, i64 1
1118  %conv3 = zext i16 %s1.elt2 to i32
1119  %s2.elt2 = extractelement <4 x i16> %vec2, i64 1
1120  %conv4 = zext i16 %s2.elt2 to i32
1121  %mul2 = mul i32 %conv4, %conv3
1122
  ; Add both products to the i32 read from %dst and store the sum back.
1123  %s3 = load i32, ptr addrspace(1) %dst, align 4
1124  %add = add i32 %mul2, %s3
1125  %add6 = add i32 %add, %mul1
1126  store i32 %add6, ptr addrspace(1) %dst, align 4
1127  ret void
1128}
1129
; Unsigned dot2 pattern taken from lanes 2 and 3 (the upper half) of
; <4 x i16> vectors, all lanes zero-extended. The GFX9-DL and GFX10-DL
; expectations below contain v_dot2_u32_u16; the others use multiply-adds or
; multiplies plus an add. The expected loads read one dword per source at a
; 4-byte displacement (an offset:4 operand, or an explicit add of 4 on GFX8).
1130define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
1131; GFX7-LABEL: udot2_v4i16_Hi:
1132; GFX7:       ; %bb.0: ; %entry
1133; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1134; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1135; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1136; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1137; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1138; GFX7-NEXT:    s_mov_b32 s10, 0
1139; GFX7-NEXT:    s_mov_b32 s11, s7
1140; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1141; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
1142; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
1143; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
1144; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4
1145; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
1146; GFX7-NEXT:    s_mov_b32 s6, -1
1147; GFX7-NEXT:    s_waitcnt vmcnt(1)
1148; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
1149; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1150; GFX7-NEXT:    s_waitcnt vmcnt(0)
1151; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v0
1152; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1153; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1154; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, s0
1155; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v1, v0
1156; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1157; GFX7-NEXT:    s_endpgm
1158;
1159; GFX8-LABEL: udot2_v4i16_Hi:
1160; GFX8:       ; %bb.0: ; %entry
1161; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1162; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1163; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1164; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1165; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1166; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
1167; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1168; GFX8-NEXT:    v_mov_b32_e32 v3, s3
1169; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s2, v0
1170; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1171; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v2
1172; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1173; GFX8-NEXT:    flat_load_dword v2, v[0:1]
1174; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v4
1175; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
1176; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1177; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
1178; GFX8-NEXT:    s_waitcnt vmcnt(1)
1179; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v2
1180; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1181; GFX8-NEXT:    s_waitcnt vmcnt(0)
1182; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff, v0
1183; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1184; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1185; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v2, s0
1186; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v1, v0
1187; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1188; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1189; GFX8-NEXT:    flat_store_dword v[0:1], v2
1190; GFX8-NEXT:    s_endpgm
1191;
1192; GFX9-NODL-LABEL: udot2_v4i16_Hi:
1193; GFX9-NODL:       ; %bb.0: ; %entry
1194; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1195; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1196; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1197; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1198; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1] offset:4
1199; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3] offset:4
1200; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
1201; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1202; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1203; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1204; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1205; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1206; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
1207; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
1208; GFX9-NODL-NEXT:    s_endpgm
1209;
1210; GFX9-DL-LABEL: udot2_v4i16_Hi:
1211; GFX9-DL:       ; %bb.0: ; %entry
1212; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1213; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1214; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1215; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1216; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1] offset:4
1217; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3] offset:4
1218; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1219; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1220; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1221; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s0
1222; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
1223; GFX9-DL-NEXT:    s_endpgm
1224;
1225; GFX10-DL-LABEL: udot2_v4i16_Hi:
1226; GFX10-DL:       ; %bb.0: ; %entry
1227; GFX10-DL-NEXT:    s_clause 0x1
1228; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1229; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1230; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1231; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1232; GFX10-DL-NEXT:    s_clause 0x1
1233; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1] offset:4
1234; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3] offset:4
1235; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
1236; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1237; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1238; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1239; GFX10-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s0
1240; GFX10-DL-NEXT:    global_store_dword v0, v1, s[6:7]
1241; GFX10-DL-NEXT:    s_endpgm
1242                                          ptr addrspace(1) %src2,
1243                                          ptr addrspace(1) nocapture %dst) {
1244entry:
  ; Each work-item loads the <4 x i16> at index workitem.id.x from both sources.
1245  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1246  %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %src1, i32 %idx
1247  %vec1 = load <4 x i16>, ptr addrspace(1) %gep1
1248  %gep2 = getelementptr <4 x i16>, ptr addrspace(1) %src2, i32 %idx
1249  %vec2 = load <4 x i16>, ptr addrspace(1) %gep2
1250
  ; Lane 2: zext(%vec1[2]) * zext(%vec2[2]).
1251  %s1.elt1 = extractelement <4 x i16> %vec1, i64 2
1252  %conv = zext i16 %s1.elt1 to i32
1253  %s2.elt1 = extractelement <4 x i16> %vec2, i64 2
1254  %conv2 = zext i16 %s2.elt1 to i32
1255  %mul1 = mul i32 %conv2, %conv
1256
  ; Lane 3: zext(%vec1[3]) * zext(%vec2[3]).
1257  %s1.elt2 = extractelement <4 x i16> %vec1, i64 3
1258  %conv3 = zext i16 %s1.elt2 to i32
1259  %s2.elt2 = extractelement <4 x i16> %vec2, i64 3
1260  %conv4 = zext i16 %s2.elt2 to i32
1261  %mul2 = mul i32 %conv4, %conv3
1262
  ; Add both products to the i32 read from %dst and store the sum back.
1263  %s3 = load i32, ptr addrspace(1) %dst, align 4
1264  %add = add i32 %mul2, %s3
1265  %add6 = add i32 %add, %mul1
1266  store i32 %add6, ptr addrspace(1) %dst, align 4
1267  ret void
1268}
1269
; Even-lane variant: the two products use lanes 0 and 2 of the <4 x i16>
; vectors, which are not neighbouring lanes. None of the expected outputs
; below contains a v_dot2 instruction; they load both dwords of each vector
; (dwordx2 loads), take the low 16 bits of each dword, and combine the
; products with multiply-adds or multiplies plus an add.
1270define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
1271; GFX7-LABEL: notudot2_v4i16_Even:
1272; GFX7:       ; %bb.0: ; %entry
1273; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1274; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1275; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1276; GFX7-NEXT:    s_mov_b32 s10, 0
1277; GFX7-NEXT:    s_mov_b32 s11, s7
1278; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1279; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
1280; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1281; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1282; GFX7-NEXT:    s_mov_b64 s[0:1], s[2:3]
1283; GFX7-NEXT:    s_mov_b64 s[2:3], s[10:11]
1284; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
1285; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
1286; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
1287; GFX7-NEXT:    s_mov_b32 s6, -1
1288; GFX7-NEXT:    s_waitcnt vmcnt(1)
1289; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
1290; GFX7-NEXT:    s_waitcnt vmcnt(0)
1291; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1292; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1293; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1294; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1295; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, s0
1296; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
1297; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1298; GFX7-NEXT:    s_endpgm
1299;
1300; GFX8-LABEL: notudot2_v4i16_Even:
1301; GFX8:       ; %bb.0: ; %entry
1302; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1303; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1304; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1305; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1306; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1307; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1308; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1309; GFX8-NEXT:    v_mov_b32_e32 v3, s3
1310; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
1311; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1312; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1313; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1314; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
1315; GFX8-NEXT:    s_waitcnt vmcnt(1)
1316; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1317; GFX8-NEXT:    s_waitcnt vmcnt(0)
1318; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff, v3
1319; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1320; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1321; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1322; GFX8-NEXT:    v_mad_u32_u24 v1, v3, v1, s0
1323; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v0, v1
1324; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1325; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1326; GFX8-NEXT:    flat_store_dword v[0:1], v2
1327; GFX8-NEXT:    s_endpgm
1328;
1329; GFX9-NODL-LABEL: notudot2_v4i16_Even:
1330; GFX9-NODL:       ; %bb.0: ; %entry
1331; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1332; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1333; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1334; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1335; GFX9-NODL-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
1336; GFX9-NODL-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3]
1337; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
1338; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, 0
1339; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1340; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1341; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1342; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1343; GFX9-NODL-NEXT:    v_add3_u32 v0, v1, s0, v0
1344; GFX9-NODL-NEXT:    global_store_dword v4, v0, s[6:7]
1345; GFX9-NODL-NEXT:    s_endpgm
1346;
1347; GFX9-DL-LABEL: notudot2_v4i16_Even:
1348; GFX9-DL:       ; %bb.0: ; %entry
1349; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1350; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1351; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1352; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1353; GFX9-DL-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
1354; GFX9-DL-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3]
1355; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1356; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 0
1357; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1358; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1359; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1360; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1361; GFX9-DL-NEXT:    v_add3_u32 v0, v1, s0, v0
1362; GFX9-DL-NEXT:    global_store_dword v4, v0, s[6:7]
1363; GFX9-DL-NEXT:    s_endpgm
1364;
1365; GFX10-DL-LABEL: notudot2_v4i16_Even:
1366; GFX10-DL:       ; %bb.0: ; %entry
1367; GFX10-DL-NEXT:    s_clause 0x1
1368; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1369; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1370; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1371; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1372; GFX10-DL-NEXT:    s_clause 0x1
1373; GFX10-DL-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
1374; GFX10-DL-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3]
1375; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
1376; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1377; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1378; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1379; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1380; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1381; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1382; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s0, v0
1383; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
1384; GFX10-DL-NEXT:    s_endpgm
1385                                               ptr addrspace(1) %src2,
1386                                               ptr addrspace(1) nocapture %dst) {
1387entry:
  ; Each work-item loads the <4 x i16> at index workitem.id.x from both sources.
1388  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1389  %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %src1, i32 %idx
1390  %vec1 = load <4 x i16>, ptr addrspace(1) %gep1
1391  %gep2 = getelementptr <4 x i16>, ptr addrspace(1) %src2, i32 %idx
1392  %vec2 = load <4 x i16>, ptr addrspace(1) %gep2
1393
  ; Lane 0: zext(%vec1[0]) * zext(%vec2[0]).
1394  %s1.elt1 = extractelement <4 x i16> %vec1, i64 0
1395  %conv = zext i16 %s1.elt1 to i32
1396  %s2.elt1 = extractelement <4 x i16> %vec2, i64 0
1397  %conv2 = zext i16 %s2.elt1 to i32
1398  %mul1 = mul i32 %conv2, %conv
1399
  ; Lane 2 (not lane 1): zext(%vec1[2]) * zext(%vec2[2]).
1400  %s1.elt2 = extractelement <4 x i16> %vec1, i64 2
1401  %conv3 = zext i16 %s1.elt2 to i32
1402  %s2.elt2 = extractelement <4 x i16> %vec2, i64 2
1403  %conv4 = zext i16 %s2.elt2 to i32
1404  %mul2 = mul i32 %conv4, %conv3
1405
  ; Add both products to the i32 read from %dst and store the sum back.
1406  %s3 = load i32, ptr addrspace(1) %dst, align 4
1407  %add = add i32 %mul2, %s3
1408  %add6 = add i32 %add, %mul1
1409  store i32 %add6, ptr addrspace(1) %dst, align 4
1410  ret void
1411}
1412
; Negative test: the two products are formed from elements 1 and 2 of the
; loaded <4 x i16> vectors (zero-extended to i32) and are added to the i32
; loaded from %dst. The autogenerated checks below expect v_mad_u32_u24, or
; v_mul_u32_u24_sdwa followed by v_add3_u32, and contain no v_dot2 instruction.
1413define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
1414; GFX7-LABEL: notudot2_v4i16_Middle:
1415; GFX7:       ; %bb.0: ; %entry
1416; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1417; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1418; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1419; GFX7-NEXT:    s_mov_b32 s10, 0
1420; GFX7-NEXT:    s_mov_b32 s11, s7
1421; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1422; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
1423; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1424; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1425; GFX7-NEXT:    s_mov_b64 s[0:1], s[2:3]
1426; GFX7-NEXT:    s_mov_b64 s[2:3], s[10:11]
1427; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
1428; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
1429; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
1430; GFX7-NEXT:    s_mov_b32 s6, -1
1431; GFX7-NEXT:    s_waitcnt vmcnt(1)
1432; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
1433; GFX7-NEXT:    s_waitcnt vmcnt(0)
1434; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1435; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1436; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1437; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1438; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, s0
1439; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
1440; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1441; GFX7-NEXT:    s_endpgm
1442;
1443; GFX8-LABEL: notudot2_v4i16_Middle:
1444; GFX8:       ; %bb.0: ; %entry
1445; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1446; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1447; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1448; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1449; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1450; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1451; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1452; GFX8-NEXT:    v_mov_b32_e32 v3, s3
1453; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
1454; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1455; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1456; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1457; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
1458; GFX8-NEXT:    s_waitcnt vmcnt(1)
1459; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1460; GFX8-NEXT:    s_waitcnt vmcnt(0)
1461; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff, v3
1462; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1463; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1464; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1465; GFX8-NEXT:    v_mad_u32_u24 v1, v3, v1, s0
1466; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v0, v1
1467; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1468; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1469; GFX8-NEXT:    flat_store_dword v[0:1], v2
1470; GFX8-NEXT:    s_endpgm
1471;
1472; GFX9-NODL-LABEL: notudot2_v4i16_Middle:
1473; GFX9-NODL:       ; %bb.0: ; %entry
1474; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1475; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1476; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1477; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1478; GFX9-NODL-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
1479; GFX9-NODL-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3]
1480; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
1481; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, 0
1482; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1483; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1484; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1485; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1486; GFX9-NODL-NEXT:    v_add3_u32 v0, v1, s0, v0
1487; GFX9-NODL-NEXT:    global_store_dword v4, v0, s[6:7]
1488; GFX9-NODL-NEXT:    s_endpgm
1489;
1490; GFX9-DL-LABEL: notudot2_v4i16_Middle:
1491; GFX9-DL:       ; %bb.0: ; %entry
1492; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1493; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1494; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1495; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1496; GFX9-DL-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
1497; GFX9-DL-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3]
1498; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1499; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 0
1500; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1501; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1502; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1503; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1504; GFX9-DL-NEXT:    v_add3_u32 v0, v1, s0, v0
1505; GFX9-DL-NEXT:    global_store_dword v4, v0, s[6:7]
1506; GFX9-DL-NEXT:    s_endpgm
1507;
1508; GFX10-DL-LABEL: notudot2_v4i16_Middle:
1509; GFX10-DL:       ; %bb.0: ; %entry
1510; GFX10-DL-NEXT:    s_clause 0x1
1511; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1512; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1513; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1514; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1515; GFX10-DL-NEXT:    s_clause 0x1
1516; GFX10-DL-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1]
1517; GFX10-DL-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3]
1518; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
1519; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1520; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1521; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1522; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1523; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1524; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1525; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s0, v0
1526; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
1527; GFX10-DL-NEXT:    s_endpgm
1528                                                 ptr addrspace(1) %src2,
1529                                                 ptr addrspace(1) nocapture %dst) {
1530entry:
1531  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1532  %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %src1, i32 %idx
1533  %vec1 = load <4 x i16>, ptr addrspace(1) %gep1
1534  %gep2 = getelementptr <4 x i16>, ptr addrspace(1) %src2, i32 %idx
1535  %vec2 = load <4 x i16>, ptr addrspace(1) %gep2
1536
  ; First product: element 1 of each vector, zero-extended to i32.
1537  %s1.elt1 = extractelement <4 x i16> %vec1, i64 1
1538  %conv = zext i16 %s1.elt1 to i32
1539  %s2.elt1 = extractelement <4 x i16> %vec2, i64 1
1540  %conv2 = zext i16 %s2.elt1 to i32
1541  %mul1 = mul i32 %conv2, %conv
1542
  ; Second product: element 2 of each vector, zero-extended to i32.
1543  %s1.elt2 = extractelement <4 x i16> %vec1, i64 2
1544  %conv3 = zext i16 %s1.elt2 to i32
1545  %s2.elt2 = extractelement <4 x i16> %vec2, i64 2
1546  %conv4 = zext i16 %s2.elt2 to i32
1547  %mul2 = mul i32 %conv4, %conv3
1548
  ; Accumulate both products onto the i32 read from %dst and store it back.
1549  %s3 = load i32, ptr addrspace(1) %dst, align 4
1550  %add = add i32 %mul2, %s3
1551  %add6 = add i32 %add, %mul1
1552  store i32 %add6, ptr addrspace(1) %dst, align 4
1553  ret void
1554}
1555
; Negative test: the products pair different element indices of the two
; <2 x i16> inputs (element 0 of %vec1 with element 1 of %vec2, and element 1
; of %vec1 with element 0 of %vec2), all zero-extended to i32. The autogenerated
; checks below expect v_mad_u32_u24, or v_mul_u32_u24_sdwa followed by
; v_add3_u32, and contain no v_dot2 instruction.
1556define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
1557; GFX7-LABEL: notudot2_DiffIndex:
1558; GFX7:       ; %bb.0: ; %entry
1559; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1560; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1561; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1562; GFX7-NEXT:    s_mov_b32 s10, 0
1563; GFX7-NEXT:    s_mov_b32 s11, s7
1564; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1565; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
1566; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1567; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1568; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1569; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
1570; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1571; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
1572; GFX7-NEXT:    s_mov_b32 s6, -1
1573; GFX7-NEXT:    s_waitcnt vmcnt(1)
1574; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1575; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1576; GFX7-NEXT:    s_waitcnt vmcnt(0)
1577; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
1578; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1579; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1580; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v1, s0
1581; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v2, v0
1582; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1583; GFX7-NEXT:    s_endpgm
1584;
1585; GFX8-LABEL: notudot2_DiffIndex:
1586; GFX8:       ; %bb.0: ; %entry
1587; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1588; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1589; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1590; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1591; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1592; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1593; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1594; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1595; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1596; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1597; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1598; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1599; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
1600; GFX8-NEXT:    s_waitcnt vmcnt(1)
1601; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
1602; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1603; GFX8-NEXT:    s_waitcnt vmcnt(0)
1604; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
1605; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1606; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1607; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s0
1608; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
1609; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1610; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1611; GFX8-NEXT:    flat_store_dword v[0:1], v2
1612; GFX8-NEXT:    s_endpgm
1613;
1614; GFX9-NODL-LABEL: notudot2_DiffIndex:
1615; GFX9-NODL:       ; %bb.0: ; %entry
1616; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1617; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1618; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1619; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1620; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
1621; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
1622; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
1623; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1624; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1625; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
1626; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
1627; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1628; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
1629; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
1630; GFX9-NODL-NEXT:    s_endpgm
1631;
1632; GFX9-DL-LABEL: notudot2_DiffIndex:
1633; GFX9-DL:       ; %bb.0: ; %entry
1634; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1635; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1636; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1637; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1638; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1639; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1640; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1641; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1642; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1643; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
1644; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
1645; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1646; GFX9-DL-NEXT:    v_add3_u32 v1, v1, s0, v3
1647; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
1648; GFX9-DL-NEXT:    s_endpgm
1649;
1650; GFX10-DL-LABEL: notudot2_DiffIndex:
1651; GFX10-DL:       ; %bb.0: ; %entry
1652; GFX10-DL-NEXT:    s_clause 0x1
1653; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1654; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1655; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1656; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1657; GFX10-DL-NEXT:    s_clause 0x1
1658; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1659; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1660; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
1661; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1662; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1663; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
1664; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
1665; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1666; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1667; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s0, v0
1668; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
1669; GFX10-DL-NEXT:    s_endpgm
1670                                              ptr addrspace(1) %src2,
1671                                              ptr addrspace(1) nocapture %dst) {
1672entry:
1673  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1674  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
1675  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
1676  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
1677  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
1678
  ; First product: element 0 of %vec1 times element 1 of %vec2.
1679  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1680  %conv = zext i16 %s1.elt1 to i32
1681  %s2.elt1 = extractelement <2 x i16> %vec2, i64 1
1682  %conv2 = zext i16 %s2.elt1 to i32
1683  %mul1 = mul i32 %conv2, %conv
1684
  ; Second product: element 1 of %vec1 times element 0 of %vec2.
1685  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1686  %conv3 = zext i16 %s1.elt2 to i32
1687  %s2.elt2 = extractelement <2 x i16> %vec2, i64 0
1688  %conv4 = zext i16 %s2.elt2 to i32
1689  %mul2 = mul i32 %conv4, %conv3
1690
  ; Accumulate both products onto the i32 read from %dst and store it back.
1691  %s3 = load i32, ptr addrspace(1) %dst, align 4
1692  %add = add i32 %mul2, %s3
1693  %add6 = add i32 %add, %mul1
1694  store i32 %add6, ptr addrspace(1) %dst, align 4
1695  ret void
1696}
1697
; Unsigned (zext) two-element multiply-accumulate over <2 x i16> inputs in which
; the partial sum %add1 (element-1 product plus the i32 loaded from %dst) has a
; second use in %res. The autogenerated checks below expect v_mad_u32_u24
; (with v_mul_u32_u24_sdwa on some targets) plus a separate add, and contain no
; v_dot2 instruction.
1698define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
1699; GFX7-LABEL: udot2_MultipleUses_add1:
1700; GFX7:       ; %bb.0: ; %entry
1701; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1702; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1703; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1704; GFX7-NEXT:    s_mov_b32 s10, 0
1705; GFX7-NEXT:    s_mov_b32 s11, s7
1706; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1707; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
1708; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1709; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1710; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1711; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
1712; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1713; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
1714; GFX7-NEXT:    s_mov_b32 s6, -1
1715; GFX7-NEXT:    s_waitcnt vmcnt(1)
1716; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1717; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1718; GFX7-NEXT:    s_waitcnt vmcnt(0)
1719; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
1720; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1721; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1722; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, s0
1723; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
1724; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1725; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1726; GFX7-NEXT:    s_endpgm
1727;
1728; GFX8-LABEL: udot2_MultipleUses_add1:
1729; GFX8:       ; %bb.0: ; %entry
1730; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1731; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1732; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1733; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1734; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1735; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1736; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1737; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1738; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1739; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1740; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1741; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1742; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
1743; GFX8-NEXT:    s_waitcnt vmcnt(1)
1744; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
1745; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1746; GFX8-NEXT:    s_waitcnt vmcnt(0)
1747; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
1748; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1749; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1750; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s0
1751; GFX8-NEXT:    v_mad_u32_u24 v1, v2, v1, v0
1752; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v1, v0
1753; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1754; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1755; GFX8-NEXT:    flat_store_dword v[0:1], v2
1756; GFX8-NEXT:    s_endpgm
1757;
1758; GFX9-NODL-LABEL: udot2_MultipleUses_add1:
1759; GFX9-NODL:       ; %bb.0: ; %entry
1760; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1761; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1762; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1763; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1764; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
1765; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
1766; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
1767; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1768; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1769; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1770; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1771; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1772; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1773; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
1774; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, v1
1775; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
1776; GFX9-NODL-NEXT:    s_endpgm
1777;
1778; GFX9-DL-LABEL: udot2_MultipleUses_add1:
1779; GFX9-DL:       ; %bb.0: ; %entry
1780; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1781; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1782; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1783; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1784; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1785; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1786; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1787; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1788; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1789; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1790; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1791; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1792; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1793; GFX9-DL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
1794; GFX9-DL-NEXT:    v_add3_u32 v1, v1, v3, v1
1795; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
1796; GFX9-DL-NEXT:    s_endpgm
1797;
1798; GFX10-DL-LABEL: udot2_MultipleUses_add1:
1799; GFX10-DL:       ; %bb.0: ; %entry
1800; GFX10-DL-NEXT:    s_clause 0x1
1801; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1802; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1803; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1804; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1805; GFX10-DL-NEXT:    s_clause 0x1
1806; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1807; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1808; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
1809; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1810; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1811; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
1812; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1813; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
1814; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1815; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1816; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1817; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v3, v0, s0
1818; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v0
1819; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
1820; GFX10-DL-NEXT:    s_endpgm
1821                                                   ptr addrspace(1) %src2,
1822                                                   ptr addrspace(1) nocapture %dst) {
1823entry:
1824  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1825  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
1826  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
1827  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
1828  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
1829
  ; First product: element 0 of each vector, zero-extended to i32.
1830  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1831  %conv = zext i16 %s1.elt1 to i32
1832  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1833  %conv2 = zext i16 %s2.elt1 to i32
1834  %mul1 = mul i32 %conv2, %conv
1835
  ; Second product: element 1 of each vector, zero-extended to i32.
1836  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1837  %conv3 = zext i16 %s1.elt2 to i32
1838  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1839  %conv4 = zext i16 %s2.elt2 to i32
1840  %mul2 = mul i32 %conv4, %conv3
1841
  ; %add1 is consumed twice: by %add2 and again by %res below.
1842  %s3 = load i32, ptr addrspace(1) %dst, align 4
1843  %add1 = add i32 %mul2, %s3
1844  %add2 = add i32 %add1, %mul1
1845
1846  %res = add i32 %add2, %add1
1847  store i32 %res, ptr addrspace(1) %dst, align 4
1848  ret void
1849}
1850
; Signed (sext) two-element multiply-accumulate over <2 x i16> inputs in which
; the partial sum %add1 (element-1 product plus the i32 loaded from %dst) has a
; second use in %res. The autogenerated checks below expect v_mad_i32_i24
; (with v_mul_i32_i24_sdwa on some targets) plus a separate add, and contain no
; v_dot2 instruction.
1851define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
1852; GFX7-LABEL: idot2_MultipleUses_add1:
1853; GFX7:       ; %bb.0: ; %entry
1854; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1855; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1856; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1857; GFX7-NEXT:    s_mov_b32 s10, 0
1858; GFX7-NEXT:    s_mov_b32 s11, s7
1859; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1860; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
1861; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1862; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1863; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1864; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
1865; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1866; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
1867; GFX7-NEXT:    s_mov_b32 s6, -1
1868; GFX7-NEXT:    s_waitcnt vmcnt(1)
1869; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 16
1870; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
1871; GFX7-NEXT:    s_waitcnt vmcnt(0)
1872; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 16
1873; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
1874; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1875; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, s0
1876; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v1, v0
1877; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
1878; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1879; GFX7-NEXT:    s_endpgm
1880;
1881; GFX8-LABEL: idot2_MultipleUses_add1:
1882; GFX8:       ; %bb.0: ; %entry
1883; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1884; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1885; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1886; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1887; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1888; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1889; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1890; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1891; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1892; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1893; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1894; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1895; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
1896; GFX8-NEXT:    s_waitcnt vmcnt(1)
1897; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
1898; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
1899; GFX8-NEXT:    s_waitcnt vmcnt(0)
1900; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 16
1901; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
1902; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1903; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, s0
1904; GFX8-NEXT:    v_mad_i32_i24 v1, v2, v1, v0
1905; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v1, v0
1906; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1907; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1908; GFX8-NEXT:    flat_store_dword v[0:1], v2
1909; GFX8-NEXT:    s_endpgm
1910;
1911; GFX9-NODL-LABEL: idot2_MultipleUses_add1:
1912; GFX9-NODL:       ; %bb.0: ; %entry
1913; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1914; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1915; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1916; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1917; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
1918; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
1919; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
1920; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1921; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1922; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1923; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
1924; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
1925; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1926; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, v2, v1, s0
1927; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, v1
1928; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
1929; GFX9-NODL-NEXT:    s_endpgm
1930;
1931; GFX9-DL-LABEL: idot2_MultipleUses_add1:
1932; GFX9-DL:       ; %bb.0: ; %entry
1933; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1934; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1935; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1936; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1937; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1938; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1939; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1940; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1941; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1942; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1943; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
1944; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
1945; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1946; GFX9-DL-NEXT:    v_mad_i32_i24 v1, v2, v1, s0
1947; GFX9-DL-NEXT:    v_add3_u32 v1, v1, v3, v1
1948; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
1949; GFX9-DL-NEXT:    s_endpgm
1950;
1951; GFX10-DL-LABEL: idot2_MultipleUses_add1:
1952; GFX10-DL:       ; %bb.0: ; %entry
1953; GFX10-DL-NEXT:    s_clause 0x1
1954; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1955; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1956; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1957; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1958; GFX10-DL-NEXT:    s_clause 0x1
1959; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1960; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1961; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
1962; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1963; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1964; GFX10-DL-NEXT:    v_ashrrev_i32_e32 v0, 16, v1
1965; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1966; GFX10-DL-NEXT:    v_ashrrev_i32_e32 v3, 16, v2
1967; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1968; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1969; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1970; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v3, v0, s0
1971; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v0
1972; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
1973; GFX10-DL-NEXT:    s_endpgm
1974                                                   ptr addrspace(1) %src2,
1975                                                   ptr addrspace(1) nocapture %dst) {
1976entry:
1977  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1978  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
1979  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
1980  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
1981  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
1982
  ; First product: element 0 of each vector, sign-extended to i32.
1983  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1984  %conv = sext i16 %s1.elt1 to i32
1985  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1986  %conv2 = sext i16 %s2.elt1 to i32
1987  %mul1 = mul i32 %conv2, %conv
1988
  ; Second product: element 1 of each vector, sign-extended to i32.
1989  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1990  %conv3 = sext i16 %s1.elt2 to i32
1991  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1992  %conv4 = sext i16 %s2.elt2 to i32
1993  %mul2 = mul i32 %conv4, %conv3
1994
  ; %add1 is consumed twice: by %add2 and again by %res below.
1995  %s3 = load i32, ptr addrspace(1) %dst, align 4
1996  %add1 = add i32 %mul2, %s3
1997  %add2 = add i32 %add1, %mul1
1998
1999  %res = add i32 %add2, %add1
2000  store i32 %res, ptr addrspace(1) %dst, align 4
2001  ret void
2002}
2003
2004define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
2005; GFX7-LABEL: udot2_MultipleUses_mul1:
2006; GFX7:       ; %bb.0: ; %entry
2007; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2008; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
2009; GFX7-NEXT:    s_mov_b32 s7, 0xf000
2010; GFX7-NEXT:    s_mov_b32 s10, 0
2011; GFX7-NEXT:    s_mov_b32 s11, s7
2012; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2013; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
2014; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2015; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2016; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2017; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
2018; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2019; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
2020; GFX7-NEXT:    s_mov_b32 s6, -1
2021; GFX7-NEXT:    s_waitcnt vmcnt(1)
2022; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
2023; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2024; GFX7-NEXT:    s_waitcnt vmcnt(0)
2025; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
2026; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2027; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2028; GFX7-NEXT:    v_mad_u32_u24 v4, v0, v2, s0
2029; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, v4
2030; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
2031; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2032; GFX7-NEXT:    s_endpgm
2033;
2034; GFX8-LABEL: udot2_MultipleUses_mul1:
2035; GFX8:       ; %bb.0: ; %entry
2036; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2037; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
2038; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2039; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2040; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2041; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2042; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2043; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2044; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2045; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2046; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2047; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2048; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
2049; GFX8-NEXT:    s_waitcnt vmcnt(1)
2050; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
2051; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2052; GFX8-NEXT:    s_waitcnt vmcnt(0)
2053; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
2054; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2055; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2056; GFX8-NEXT:    v_mad_u32_u24 v4, v2, v1, s0
2057; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, v4
2058; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
2059; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2060; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2061; GFX8-NEXT:    flat_store_dword v[0:1], v2
2062; GFX8-NEXT:    s_endpgm
2063;
2064; GFX9-NODL-LABEL: udot2_MultipleUses_mul1:
2065; GFX9-NODL:       ; %bb.0: ; %entry
2066; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2067; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2068; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2069; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2070; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
2071; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
2072; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
2073; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2074; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
2075; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
2076; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2077; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xffff, v2
2078; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2079; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v2, v4, v3
2080; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2081; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, v4, v3, s0
2082; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, v2
2083; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
2084; GFX9-NODL-NEXT:    s_endpgm
2085;
2086; GFX9-DL-LABEL: udot2_MultipleUses_mul1:
2087; GFX9-DL:       ; %bb.0: ; %entry
2088; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2089; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2090; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2091; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2092; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2093; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2094; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
2095; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2096; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2097; GFX9-DL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
2098; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2099; GFX9-DL-NEXT:    v_and_b32_e32 v4, 0xffff, v2
2100; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2101; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v2, v4, v3
2102; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2103; GFX9-DL-NEXT:    v_mad_u32_u24 v3, v4, v3, s0
2104; GFX9-DL-NEXT:    v_add3_u32 v1, v1, v3, v2
2105; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
2106; GFX9-DL-NEXT:    s_endpgm
2107;
2108; GFX10-DL-LABEL: udot2_MultipleUses_mul1:
2109; GFX10-DL:       ; %bb.0: ; %entry
2110; GFX10-DL-NEXT:    s_clause 0x1
2111; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2112; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2113; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2114; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2115; GFX10-DL-NEXT:    s_clause 0x1
2116; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2117; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2118; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
2119; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
2120; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2121; GFX10-DL-NEXT:    v_and_b32_e32 v0, 0xffff, v1
2122; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2123; GFX10-DL-NEXT:    v_and_b32_e32 v3, 0xffff, v2
2124; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2125; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v2, v3, v0
2126; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2127; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v3, v0, s0
2128; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
2129; GFX10-DL-NEXT:    v_add3_u32 v0, v1, v0, v2
2130; GFX10-DL-NEXT:    global_store_dword v3, v0, s[6:7]
2131; GFX10-DL-NEXT:    s_endpgm
2132                                                   ptr addrspace(1) %src2,
2133                                                   ptr addrspace(1) nocapture %dst) {
2134entry:
2135  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2136  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
2137  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
2138  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
2139  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
2140
2141  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2142  %conv = zext i16 %s1.elt1 to i32
2143  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2144  %conv2 = zext i16 %s2.elt1 to i32
2145  %mul1 = mul i32 %conv2, %conv
2146
2147  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2148  %conv3 = zext i16 %s1.elt2 to i32
2149  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2150  %conv4 = zext i16 %s2.elt2 to i32
2151  %mul2 = mul i32 %conv4, %conv3
2152
2153  %s3 = load i32, ptr addrspace(1) %dst, align 4
2154  %add0 = add i32 %mul1, %s3
2155
2156  %add1 = add i32 %mul2, %add0
2157  %add2 = add i32 %add1, %mul1
2158
2159  store i32 %add2, ptr addrspace(1) %dst, align 4
2160  ret void
2161}
2162
; Signed two-element i16 dot-product shape in which %mul1 (the product of the
; element-0 lanes) has two uses: it is added to the accumulator in %add0 and
; added again in %add2. None of the autogenerated check blocks below contains
; a v_dot2 instruction; they expect 24-bit multiplies / multiply-adds instead.
; NOTE(review): the second use of %mul1 is presumably what keeps the dot2
; combine from applying here -- confirm against the combine's use checks.
2163define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
2164; GFX7-LABEL: idot2_MultipleUses_mul1:
2165; GFX7:       ; %bb.0: ; %entry
2166; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2167; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
2168; GFX7-NEXT:    s_mov_b32 s7, 0xf000
2169; GFX7-NEXT:    s_mov_b32 s10, 0
2170; GFX7-NEXT:    s_mov_b32 s11, s7
2171; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2172; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
2173; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2174; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2175; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2176; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
2177; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2178; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
2179; GFX7-NEXT:    s_mov_b32 s6, -1
2180; GFX7-NEXT:    s_waitcnt vmcnt(1)
2181; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 16
2182; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
2183; GFX7-NEXT:    s_waitcnt vmcnt(0)
2184; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 16
2185; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2186; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2187; GFX7-NEXT:    v_mad_i32_i24 v4, v3, v1, s0
2188; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, v4
2189; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
2190; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2191; GFX7-NEXT:    s_endpgm
2192;
2193; GFX8-LABEL: idot2_MultipleUses_mul1:
2194; GFX8:       ; %bb.0: ; %entry
2195; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2196; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
2197; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2198; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2199; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2200; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2201; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2202; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2203; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2204; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2205; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2206; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2207; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
2208; GFX8-NEXT:    s_waitcnt vmcnt(1)
2209; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
2210; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
2211; GFX8-NEXT:    s_waitcnt vmcnt(0)
2212; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 16
2213; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2214; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2215; GFX8-NEXT:    v_mad_i32_i24 v4, v2, v1, s0
2216; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, v4
2217; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
2218; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2219; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2220; GFX8-NEXT:    flat_store_dword v[0:1], v2
2221; GFX8-NEXT:    s_endpgm
2222;
2223; GFX9-NODL-LABEL: idot2_MultipleUses_mul1:
2224; GFX9-NODL:       ; %bb.0: ; %entry
2225; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2226; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2227; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2228; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2229; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
2230; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
2231; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
2232; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2233; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
2234; GFX9-NODL-NEXT:    v_bfe_i32 v3, v1, 0, 16
2235; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2236; GFX9-NODL-NEXT:    v_bfe_i32 v4, v2, 0, 16
2237; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2238; GFX9-NODL-NEXT:    v_mul_i32_i24_e32 v2, v4, v3
2239; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2240; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, v4, v3, s0
2241; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, v2
2242; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
2243; GFX9-NODL-NEXT:    s_endpgm
2244;
2245; GFX9-DL-LABEL: idot2_MultipleUses_mul1:
2246; GFX9-DL:       ; %bb.0: ; %entry
2247; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2248; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2249; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2250; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2251; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2252; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2253; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
2254; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2255; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2256; GFX9-DL-NEXT:    v_bfe_i32 v3, v1, 0, 16
2257; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2258; GFX9-DL-NEXT:    v_bfe_i32 v4, v2, 0, 16
2259; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2260; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v2, v4, v3
2261; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2262; GFX9-DL-NEXT:    v_mad_i32_i24 v3, v4, v3, s0
2263; GFX9-DL-NEXT:    v_add3_u32 v1, v1, v3, v2
2264; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
2265; GFX9-DL-NEXT:    s_endpgm
2266;
2267; GFX10-DL-LABEL: idot2_MultipleUses_mul1:
2268; GFX10-DL:       ; %bb.0: ; %entry
2269; GFX10-DL-NEXT:    s_clause 0x1
2270; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2271; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2272; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2273; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2274; GFX10-DL-NEXT:    s_clause 0x1
2275; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2276; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2277; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
2278; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
2279; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2280; GFX10-DL-NEXT:    v_bfe_i32 v0, v1, 0, 16
2281; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2282; GFX10-DL-NEXT:    v_bfe_i32 v3, v2, 0, 16
2283; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2284; GFX10-DL-NEXT:    v_mul_i32_i24_e32 v2, v3, v0
2285; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2286; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v3, v0, s0
2287; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
2288; GFX10-DL-NEXT:    v_add3_u32 v0, v1, v0, v2
2289; GFX10-DL-NEXT:    global_store_dword v3, v0, s[6:7]
2290; GFX10-DL-NEXT:    s_endpgm
2291                                                   ptr addrspace(1) %src2,
2292                                                   ptr addrspace(1) nocapture %dst) {
2293entry:
  ; One <2 x i16> is loaded from each source, indexed by workitem.id.x.
2294  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2295  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
2296  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
2297  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
2298  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
2299
  ; %mul1 = sext(vec2[0]) * sext(vec1[0])
2300  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2301  %conv = sext i16 %s1.elt1 to i32
2302  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2303  %conv2 = sext i16 %s2.elt1 to i32
2304  %mul1 = mul i32 %conv2, %conv
2305
  ; %mul2 = sext(vec2[1]) * sext(vec1[1])
2306  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2307  %conv3 = sext i16 %s1.elt2 to i32
2308  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2309  %conv4 = sext i16 %s2.elt2 to i32
2310  %mul2 = mul i32 %conv4, %conv3
2311
  ; The i32 accumulator is read from %dst; first use of %mul1.
2312  %s3 = load i32, ptr addrspace(1) %dst, align 4
2313  %add0 = add i32 %mul1, %s3
2314
  ; %add2 is the second use of %mul1, so the stored value is
  ; s3 + mul2 + 2 * mul1.
2315  %add1 = add i32 %mul2, %add0
2316  %add2 = add i32 %add1, %mul1
2317
2318  store i32 %add2, ptr addrspace(1) %dst, align 4
2319  ret void
2320}
2321
; Unsigned two-element i16 dot-product shape in which %mul2 (the product of
; the element-1 lanes) has two uses: it is added to the accumulator in %add0
; and added again in %add1. None of the autogenerated check blocks below
; contains a v_dot2 instruction; they expect 24-bit multiplies / multiply-adds.
; NOTE(review): the second use of %mul2 is presumably what keeps the dot2
; combine from applying here -- confirm against the combine's use checks.
2322define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
2323; GFX7-LABEL: udot2_MultipleUses_mul2:
2324; GFX7:       ; %bb.0: ; %entry
2325; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2326; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
2327; GFX7-NEXT:    s_mov_b32 s7, 0xf000
2328; GFX7-NEXT:    s_mov_b32 s10, 0
2329; GFX7-NEXT:    s_mov_b32 s11, s7
2330; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2331; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
2332; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2333; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2334; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2335; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
2336; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2337; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
2338; GFX7-NEXT:    s_mov_b32 s6, -1
2339; GFX7-NEXT:    s_waitcnt vmcnt(1)
2340; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
2341; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2342; GFX7-NEXT:    s_waitcnt vmcnt(0)
2343; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
2344; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2345; GFX7-NEXT:    v_mad_u32_u24 v4, v3, v1, s0
2346; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2347; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, v4
2348; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
2349; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2350; GFX7-NEXT:    s_endpgm
2351;
2352; GFX8-LABEL: udot2_MultipleUses_mul2:
2353; GFX8:       ; %bb.0: ; %entry
2354; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2355; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
2356; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2357; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2358; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2359; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2360; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2361; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2362; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2363; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2364; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2365; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2366; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
2367; GFX8-NEXT:    s_waitcnt vmcnt(1)
2368; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
2369; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2370; GFX8-NEXT:    s_waitcnt vmcnt(0)
2371; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
2372; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2373; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2374; GFX8-NEXT:    v_mad_u32_u24 v4, v0, v3, s0
2375; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, v4
2376; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
2377; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2378; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2379; GFX8-NEXT:    flat_store_dword v[0:1], v2
2380; GFX8-NEXT:    s_endpgm
2381;
2382; GFX9-NODL-LABEL: udot2_MultipleUses_mul2:
2383; GFX9-NODL:       ; %bb.0: ; %entry
2384; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2385; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2386; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2387; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2388; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
2389; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
2390; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
2391; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2392; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2393; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2394; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2395; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
2396; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v4, v2, v1
2397; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2398; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
2399; GFX9-NODL-NEXT:    v_add3_u32 v1, v4, v1, v3
2400; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
2401; GFX9-NODL-NEXT:    s_endpgm
2402;
2403; GFX9-DL-LABEL: udot2_MultipleUses_mul2:
2404; GFX9-DL:       ; %bb.0: ; %entry
2405; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2406; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2407; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2408; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2409; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2410; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2411; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
2412; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2413; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2414; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2415; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2416; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
2417; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v4, v2, v1
2418; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2419; GFX9-DL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
2420; GFX9-DL-NEXT:    v_add3_u32 v1, v4, v1, v3
2421; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
2422; GFX9-DL-NEXT:    s_endpgm
2423;
2424; GFX10-DL-LABEL: udot2_MultipleUses_mul2:
2425; GFX10-DL:       ; %bb.0: ; %entry
2426; GFX10-DL-NEXT:    s_clause 0x1
2427; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2428; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2429; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2430; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2431; GFX10-DL-NEXT:    s_clause 0x1
2432; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2433; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2434; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
2435; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
2436; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2437; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
2438; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2439; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2440; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2441; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v2, v3, v0
2442; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2443; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v3, v0, s0
2444; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
2445; GFX10-DL-NEXT:    v_add3_u32 v0, v2, v0, v1
2446; GFX10-DL-NEXT:    global_store_dword v3, v0, s[6:7]
2447; GFX10-DL-NEXT:    s_endpgm
2448                                                   ptr addrspace(1) %src2,
2449                                                   ptr addrspace(1) nocapture %dst) {
2450entry:
  ; One <2 x i16> is loaded from each source, indexed by workitem.id.x.
2451  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2452  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
2453  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
2454  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
2455  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
2456
  ; %mul1 = zext(vec2[0]) * zext(vec1[0])
2457  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2458  %conv = zext i16 %s1.elt1 to i32
2459  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2460  %conv2 = zext i16 %s2.elt1 to i32
2461  %mul1 = mul i32 %conv2, %conv
2462
  ; %mul2 = zext(vec2[1]) * zext(vec1[1])
2463  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2464  %conv3 = zext i16 %s1.elt2 to i32
2465  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2466  %conv4 = zext i16 %s2.elt2 to i32
2467  %mul2 = mul i32 %conv4, %conv3
2468
  ; The i32 accumulator is read from %dst; first use of %mul2.
2469  %s3 = load i32, ptr addrspace(1) %dst, align 4
2470  %add0 = add i32 %mul2, %s3
2471
  ; %add1 is the second use of %mul2, so the stored value is
  ; s3 + 2 * mul2 + mul1.
2472  %add1 = add i32 %mul2, %add0
2473  %add2 = add i32 %add1, %mul1
2474
2475  store i32 %add2, ptr addrspace(1) %dst, align 4
2476  ret void
2477}
2478
; Signed two-element i16 dot-product shape in which %mul2 (the product of the
; element-1 lanes) has two uses: it is added to the accumulator in %add0 and
; added again in %add1. None of the autogenerated check blocks below contains
; a v_dot2 instruction; they expect 24-bit multiplies / multiply-adds instead.
; NOTE(review): the second use of %mul2 is presumably what keeps the dot2
; combine from applying here -- confirm against the combine's use checks.
2479define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
2480; GFX7-LABEL: idot2_MultipleUses_mul2:
2481; GFX7:       ; %bb.0: ; %entry
2482; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2483; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
2484; GFX7-NEXT:    s_mov_b32 s7, 0xf000
2485; GFX7-NEXT:    s_mov_b32 s10, 0
2486; GFX7-NEXT:    s_mov_b32 s11, s7
2487; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2488; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
2489; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2490; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2491; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2492; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
2493; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2494; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
2495; GFX7-NEXT:    s_mov_b32 s6, -1
2496; GFX7-NEXT:    s_waitcnt vmcnt(1)
2497; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 16
2498; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
2499; GFX7-NEXT:    s_waitcnt vmcnt(0)
2500; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 16
2501; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2502; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2503; GFX7-NEXT:    v_mad_i32_i24 v4, v0, v2, s0
2504; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, v4
2505; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
2506; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2507; GFX7-NEXT:    s_endpgm
2508;
2509; GFX8-LABEL: idot2_MultipleUses_mul2:
2510; GFX8:       ; %bb.0: ; %entry
2511; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2512; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
2513; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2514; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2515; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2516; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2517; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2518; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2519; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2520; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2521; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2522; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2523; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
2524; GFX8-NEXT:    s_waitcnt vmcnt(1)
2525; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
2526; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
2527; GFX8-NEXT:    s_waitcnt vmcnt(0)
2528; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 16
2529; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2530; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2531; GFX8-NEXT:    v_mad_i32_i24 v4, v0, v3, s0
2532; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, v4
2533; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
2534; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2535; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2536; GFX8-NEXT:    flat_store_dword v[0:1], v2
2537; GFX8-NEXT:    s_endpgm
2538;
2539; GFX9-NODL-LABEL: idot2_MultipleUses_mul2:
2540; GFX9-NODL:       ; %bb.0: ; %entry
2541; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2542; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2543; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2544; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2545; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
2546; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
2547; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
2548; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2549; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2550; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2551; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
2552; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
2553; GFX9-NODL-NEXT:    v_mul_i32_i24_e32 v4, v2, v1
2554; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2555; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, v2, v1, s0
2556; GFX9-NODL-NEXT:    v_add3_u32 v1, v4, v1, v3
2557; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
2558; GFX9-NODL-NEXT:    s_endpgm
2559;
2560; GFX9-DL-LABEL: idot2_MultipleUses_mul2:
2561; GFX9-DL:       ; %bb.0: ; %entry
2562; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2563; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2564; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2565; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2566; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2567; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2568; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
2569; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2570; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2571; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2572; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
2573; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
2574; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v4, v2, v1
2575; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2576; GFX9-DL-NEXT:    v_mad_i32_i24 v1, v2, v1, s0
2577; GFX9-DL-NEXT:    v_add3_u32 v1, v4, v1, v3
2578; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
2579; GFX9-DL-NEXT:    s_endpgm
2580;
2581; GFX10-DL-LABEL: idot2_MultipleUses_mul2:
2582; GFX10-DL:       ; %bb.0: ; %entry
2583; GFX10-DL-NEXT:    s_clause 0x1
2584; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2585; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2586; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2587; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2588; GFX10-DL-NEXT:    s_clause 0x1
2589; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2590; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2591; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
2592; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
2593; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2594; GFX10-DL-NEXT:    v_ashrrev_i32_e32 v0, 16, v1
2595; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2596; GFX10-DL-NEXT:    v_ashrrev_i32_e32 v3, 16, v2
2597; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2598; GFX10-DL-NEXT:    v_mul_i32_i24_e32 v2, v3, v0
2599; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2600; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v3, v0, s0
2601; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
2602; GFX10-DL-NEXT:    v_add3_u32 v0, v2, v0, v1
2603; GFX10-DL-NEXT:    global_store_dword v3, v0, s[6:7]
2604; GFX10-DL-NEXT:    s_endpgm
2605                                                   ptr addrspace(1) %src2,
2606                                                   ptr addrspace(1) nocapture %dst) {
2607entry:
  ; One <2 x i16> is loaded from each source, indexed by workitem.id.x.
2608  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2609  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
2610  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
2611  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
2612  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2
2613
  ; %mul1 = sext(vec2[0]) * sext(vec1[0])
2614  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2615  %conv = sext i16 %s1.elt1 to i32
2616  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2617  %conv2 = sext i16 %s2.elt1 to i32
2618  %mul1 = mul i32 %conv2, %conv
2619
  ; %mul2 = sext(vec2[1]) * sext(vec1[1])
2620  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2621  %conv3 = sext i16 %s1.elt2 to i32
2622  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2623  %conv4 = sext i16 %s2.elt2 to i32
2624  %mul2 = mul i32 %conv4, %conv3
2625
  ; The i32 accumulator is read from %dst; first use of %mul2.
2626  %s3 = load i32, ptr addrspace(1) %dst, align 4
2627  %add0 = add i32 %mul2, %s3
2628
  ; %add1 is the second use of %mul2, so the stored value is
  ; s3 + 2 * mul2 + mul1.
2629  %add1 = add i32 %mul2, %add0
2630  %add2 = add i32 %add1, %mul1
2631
2632  store i32 %add2, ptr addrspace(1) %dst, align 4
2633  ret void
2634}
2635
; Two-element dot-product shape computed entirely in i16: the lanes are
; multiplied as i16 without any extension, the accumulator is an i16 loaded
; from %dst, and the i16 sum is stored back to %dst. None of the autogenerated
; check blocks below contains a v_dot2 instruction; they expect two chained
; multiply-adds followed by a 16-bit (short) store.
; NOTE(review): presumably the 16-bit accumulator is why the 32-bit dot2
; instruction is not selected -- confirm against the combine's type checks.
2636define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1,
2637; GFX7-LABEL: udot2_acc16:
2638; GFX7:       ; %bb.0: ; %entry
2639; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2640; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
2641; GFX7-NEXT:    s_mov_b32 s7, 0xf000
2642; GFX7-NEXT:    s_mov_b32 s10, 0
2643; GFX7-NEXT:    s_mov_b32 s11, s7
2644; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2645; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
2646; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2647; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2648; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2649; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
2650; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2651; GFX7-NEXT:    s_mov_b32 s6, -1
2652; GFX7-NEXT:    buffer_load_ushort v1, off, s[4:7], 0
2653; GFX7-NEXT:    s_waitcnt vmcnt(2)
2654; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2655; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2656; GFX7-NEXT:    s_waitcnt vmcnt(1)
2657; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
2658; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2659; GFX7-NEXT:    s_waitcnt vmcnt(0)
2660; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
2661; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2662; GFX7-NEXT:    buffer_store_short v0, off, s[4:7], 0
2663; GFX7-NEXT:    s_endpgm
2664;
2665; GFX8-LABEL: udot2_acc16:
2666; GFX8:       ; %bb.0: ; %entry
2667; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2668; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
2669; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2670; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2671; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2672; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2673; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2674; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2675; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2676; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2677; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2678; GFX8-NEXT:    flat_load_dword v2, v[0:1]
2679; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2680; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2681; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
2682; GFX8-NEXT:    s_waitcnt vmcnt(2)
2683; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
2684; GFX8-NEXT:    s_waitcnt vmcnt(1)
2685; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
2686; GFX8-NEXT:    s_waitcnt vmcnt(0)
2687; GFX8-NEXT:    v_mad_u16 v4, v5, v6, v4
2688; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
2689; GFX8-NEXT:    flat_store_short v[0:1], v2
2690; GFX8-NEXT:    s_endpgm
2691;
2692; GFX9-NODL-LABEL: udot2_acc16:
2693; GFX9-NODL:       ; %bb.0: ; %entry
2694; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2695; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2696; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2697; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, 0
2698; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2699; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[0:1]
2700; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[2:3]
2701; GFX9-NODL-NEXT:    global_load_ushort v4, v1, s[6:7]
2702; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
2703; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
2704; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
2705; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
2706; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2707; GFX9-NODL-NEXT:    v_mad_legacy_u16 v0, v0, v5, v4
2708; GFX9-NODL-NEXT:    v_mad_legacy_u16 v0, v2, v3, v0
2709; GFX9-NODL-NEXT:    global_store_short v1, v0, s[6:7]
2710; GFX9-NODL-NEXT:    s_endpgm
2711;
2712; GFX9-DL-LABEL: udot2_acc16:
2713; GFX9-DL:       ; %bb.0: ; %entry
2714; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2715; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2716; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2717; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
2718; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2719; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
2720; GFX9-DL-NEXT:    global_load_dword v3, v0, s[2:3]
2721; GFX9-DL-NEXT:    global_load_ushort v4, v1, s[6:7]
2722; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
2723; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
2724; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2725; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
2726; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2727; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v0, v5, v4
2728; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v2, v3, v0
2729; GFX9-DL-NEXT:    global_store_short v1, v0, s[6:7]
2730; GFX9-DL-NEXT:    s_endpgm
2731;
2732; GFX10-DL-LABEL: udot2_acc16:
2733; GFX10-DL:       ; %bb.0: ; %entry
2734; GFX10-DL-NEXT:    s_clause 0x1
2735; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2736; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2737; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2738; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
2739; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2740; GFX10-DL-NEXT:    s_clause 0x1
2741; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
2742; GFX10-DL-NEXT:    global_load_dword v3, v0, s[2:3]
2743; GFX10-DL-NEXT:    global_load_ushort v4, v1, s[6:7]
2744; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
2745; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
2746; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2747; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
2748; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2749; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
2750; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
2751; GFX10-DL-NEXT:    global_store_short v1, v0, s[6:7]
2752; GFX10-DL-NEXT:    s_endpgm
2753                                       ptr addrspace(1) %src2,
2754                                       ptr addrspace(1) nocapture %dst) {
2755entry:
  ; One <2 x i16> is loaded from each source, indexed by workitem.id.x.
2756  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2757  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
2758  %v1 = load <2 x i16>, ptr addrspace(1) %gep1
2759  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
2760  %v2 = load <2 x i16>, ptr addrspace(1) %gep2
2761
  ; %mul1 = v1[0] * v2[0], kept in i16 (no zext/sext before the multiply).
2762  %v1e1 = extractelement <2 x i16> %v1, i64 0
2763  %v2e1 = extractelement <2 x i16> %v2, i64 0
2764  %mul1 = mul i16 %v1e1, %v2e1
2765
  ; %mul2 = v1[1] * v2[1], kept in i16.
2766  %v1e2 = extractelement <2 x i16> %v1, i64 1
2767  %v2e2 = extractelement <2 x i16> %v2, i64 1
2768  %mul2 = mul i16 %v1e2, %v2e2
2769
  ; i16 accumulator read from %dst; the stored value is s2 + mul2 + mul1.
2770  %s2 = load i16, ptr addrspace(1) %dst, align 2
2771  %add1 = add i16 %mul2, %s2
2772  %add2 = add i16 %add1, %mul1
2773  store i16 %add2, ptr addrspace(1) %dst, align 2
2774  ret void
2775}
2776
; notsdot2_sext8: two-element i8 dot product with an i32 accumulator.
; Both lanes of the <2 x i8> operands are sign-extended to i32, multiplied
; lane-wise, and the two products are added to the i32 value loaded from %dst.
; None of the expected outputs below contains a v_dot2 instruction: the GFX7 and
; GFX8 checks use v_mad_i32_i24, the GFX9-NODL checks use v_mul_i32_i24_sdwa plus
; v_add3_u32, and the GFX9-DL / GFX10-DL checks use v_perm_b32 followed by a
; v_dot4 variant.
; NOTE(review): the multiplies carry the nuw flag although their operands are
; sign-extended; presumably that is why the name says "not sdot2" -- confirm.
2777define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
2778; GFX7-LABEL: notsdot2_sext8:
2779; GFX7:       ; %bb.0: ; %entry
2780; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2781; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
2782; GFX7-NEXT:    s_mov_b32 s7, 0xf000
2783; GFX7-NEXT:    s_mov_b32 s10, 0
2784; GFX7-NEXT:    s_mov_b32 s11, s7
2785; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2786; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
2787; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2788; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2789; GFX7-NEXT:    buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64
2790; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
2791; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
2792; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
2793; GFX7-NEXT:    s_mov_b32 s6, -1
2794; GFX7-NEXT:    s_waitcnt vmcnt(1)
2795; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
2796; GFX7-NEXT:    v_bfe_i32 v2, v2, 8, 8
2797; GFX7-NEXT:    s_waitcnt vmcnt(0)
2798; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 8
2799; GFX7-NEXT:    v_bfe_i32 v0, v0, 8, 8
2800; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2801; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, s0
2802; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
2803; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2804; GFX7-NEXT:    s_endpgm
2805;
2806; GFX8-LABEL: notsdot2_sext8:
2807; GFX8:       ; %bb.0: ; %entry
2808; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2809; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
2810; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
2811; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2812; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2813; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2814; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2815; GFX8-NEXT:    flat_load_ushort v3, v[0:1]
2816; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2817; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2818; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2819; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
2820; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
2821; GFX8-NEXT:    s_waitcnt vmcnt(1)
2822; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
2823; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 8, v3
2824; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
2825; GFX8-NEXT:    s_waitcnt vmcnt(0)
2826; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
2827; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
2828; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
2829; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2830; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, s0
2831; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
2832; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2833; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2834; GFX8-NEXT:    flat_store_dword v[0:1], v2
2835; GFX8-NEXT:    s_endpgm
2836;
2837; GFX9-NODL-LABEL: notsdot2_sext8:
2838; GFX9-NODL:       ; %bb.0: ; %entry
2839; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2840; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2841; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2842; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2843; GFX9-NODL-NEXT:    global_load_ushort v1, v0, s[0:1]
2844; GFX9-NODL-NEXT:    global_load_ushort v2, v0, s[2:3]
2845; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
2846; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2847; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2848; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2849; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
2850; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
2851; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2852; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2853; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
2854; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
2855; GFX9-NODL-NEXT:    s_endpgm
2856;
2857; GFX9-DL-LABEL: notsdot2_sext8:
2858; GFX9-DL:       ; %bb.0: ; %entry
2859; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2860; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2861; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2862; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2863; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[0:1]
2864; GFX9-DL-NEXT:    global_load_ushort v2, v0, s[2:3]
2865; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
2866; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0001
2867; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2868; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2869; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
2870; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2871; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
2872; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2873; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s0
2874; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
2875; GFX9-DL-NEXT:    s_endpgm
2876;
2877; GFX10-DL-LABEL: notsdot2_sext8:
2878; GFX10-DL:       ; %bb.0: ; %entry
2879; GFX10-DL-NEXT:    s_clause 0x1
2880; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2881; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2882; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2883; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
2884; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2885; GFX10-DL-NEXT:    s_clause 0x1
2886; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[0:1]
2887; GFX10-DL-NEXT:    global_load_ushort v2, v0, s[2:3]
2888; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
2889; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
2890; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2891; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc0c0001
2892; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2893; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc0c0001
2894; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2895; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s0
2896; GFX10-DL-NEXT:    v_dot4c_i32_i8 v2, v1, v0
2897; GFX10-DL-NEXT:    global_store_dword v3, v2, s[6:7]
2898; GFX10-DL-NEXT:    s_endpgm
2899                                          ptr addrspace(1) %src2,
2900                                          ptr addrspace(1) nocapture %dst) {
2901entry:
; Use the workitem id as the <2 x i8> element index into %src1 and %src2.
2902  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2903  %gep1 = getelementptr <2 x i8>, ptr addrspace(1) %src1, i32 %idx
2904  %vec1 = load <2 x i8>, ptr addrspace(1) %gep1
2905  %gep2 = getelementptr <2 x i8>, ptr addrspace(1) %src2, i32 %idx
2906  %vec2 = load <2 x i8>, ptr addrspace(1) %gep2
2907
; Lane 0: sign-extend both i8 elements to i32, then multiply.
2908  %s1.elt1 = extractelement <2 x i8> %vec1, i64 0
2909  %conv = sext i8 %s1.elt1 to i32
2910  %s2.elt1 = extractelement <2 x i8> %vec2, i64 0
2911  %conv2 = sext i8 %s2.elt1 to i32
2912  %mul1 = mul nuw i32 %conv2, %conv
2913
; Lane 1: same sign-extend-and-multiply sequence as lane 0.
2914  %s1.elt2 = extractelement <2 x i8> %vec1, i64 1
2915  %conv3 = sext i8 %s1.elt2 to i32
2916  %s2.elt2 = extractelement <2 x i8> %vec2, i64 1
2917  %conv4 = sext i8 %s2.elt2 to i32
2918  %mul2 = mul nuw i32 %conv4, %conv3
2919
; Add both products to the i32 accumulator read from %dst and write it back.
2920  %s3 = load i32, ptr addrspace(1) %dst, align 4
2921  %add = add i32 %mul2, %s3
2922  %add6 = add i32 %add, %mul1
2923  store i32 %add6, ptr addrspace(1) %dst, align 4
2924  ret void
2925}
2926
; Intrinsic declaration; the kernels above call it to obtain %idx, the i32
; index used in their per-workitem getelementptr/load sequences.
2927declare i32 @llvm.amdgcn.workitem.id.x()
2928