xref: /llvm-project/llvm/test/CodeGen/AMDGPU/idot4u.ll (revision bfd9bc274586b0261e16e22ac50d50586a0152e2)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-NODL %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefixes=GFX10-DL %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GFX10-DL %s
8; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-DL %s
9
; udot4_acc32 - unsigned 4 x i8 dot product accumulated into an i32.
; Each work-item loads one <4 x i8> from %src1 and one from %src2 (both indexed
; by workitem.id.x), zero-extends every byte to i32, multiplies matching lanes
; and adds the four products to the i32 read from %dst, then stores the sum
; back to %dst.
; The autogenerated assertions below show a single v_dot4_u32_u8 for the three
; "-DL" check prefixes, and v_mad_u32_u24 or v_mul_u32_u24_sdwa + v_add3_u32
; sequences for the remaining prefixes. Do not edit the assertions by hand;
; regenerate them with utils/update_llc_test_checks.py.
10define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
11; GFX7-LABEL: udot4_acc32:
12; GFX7:       ; %bb.0: ; %entry
13; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
14; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
15; GFX7-NEXT:    s_mov_b32 s3, 0xf000
16; GFX7-NEXT:    s_mov_b32 s6, 0
17; GFX7-NEXT:    s_mov_b32 s7, s3
18; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
19; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
20; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
21; GFX7-NEXT:    v_mov_b32_e32 v1, 0
22; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
23; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
24; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
25; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
26; GFX7-NEXT:    s_mov_b32 s2, -1
27; GFX7-NEXT:    s_waitcnt vmcnt(1)
28; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
29; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
30; GFX7-NEXT:    s_waitcnt vmcnt(0)
31; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
32; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
33; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
34; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, s4
35; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
36; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
37; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
38; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
39; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
40; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
41; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
42; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
43; GFX7-NEXT:    s_endpgm
44;
45; GFX8-LABEL: udot4_acc32:
46; GFX8:       ; %bb.0: ; %entry
47; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
48; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
49; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
50; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
51; GFX8-NEXT:    v_mov_b32_e32 v1, s1
52; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
53; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
54; GFX8-NEXT:    flat_load_dword v3, v[0:1]
55; GFX8-NEXT:    v_mov_b32_e32 v1, s3
56; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
57; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
58; GFX8-NEXT:    flat_load_dword v0, v[0:1]
59; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
60; GFX8-NEXT:    s_waitcnt vmcnt(1)
61; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
62; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
63; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 8
64; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
65; GFX8-NEXT:    s_waitcnt vmcnt(0)
66; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
67; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
68; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s0
70; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 8
71; GFX8-NEXT:    v_mad_u32_u24 v1, v4, v5, v1
72; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
73; GFX8-NEXT:    v_mad_u32_u24 v1, v6, v7, v1
74; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
75; GFX8-NEXT:    v_mov_b32_e32 v0, s4
76; GFX8-NEXT:    v_mov_b32_e32 v1, s5
77; GFX8-NEXT:    flat_store_dword v[0:1], v2
78; GFX8-NEXT:    s_endpgm
79;
80; GFX9-NODL-LABEL: udot4_acc32:
81; GFX9-NODL:       ; %bb.0: ; %entry
82; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
83; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
84; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
85; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
86; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
87; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
88; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
89; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
90; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
91; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
92; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
93; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
94; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
95; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
96; GFX9-NODL-NEXT:    v_add3_u32 v2, v3, s0, v4
97; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
98; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
99; GFX9-NODL-NEXT:    s_endpgm
100;
101; GFX9-DL-LABEL: udot4_acc32:
102; GFX9-DL:       ; %bb.0: ; %entry
103; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
104; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
105; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
106; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
107; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
108; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
109; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
110; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
111; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
112; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s0
113; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
114; GFX9-DL-NEXT:    s_endpgm
115;
116; GFX10-DL-LABEL: udot4_acc32:
117; GFX10-DL:       ; %bb.0: ; %entry
118; GFX10-DL-NEXT:    s_clause 0x1
119; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
120; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
121; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
122; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
123; GFX10-DL-NEXT:    s_clause 0x1
124; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
125; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
126; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
127; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
128; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
129; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
130; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s0
131; GFX10-DL-NEXT:    global_store_dword v0, v1, s[6:7]
132; GFX10-DL-NEXT:    s_endpgm
133;
134; GFX11-DL-LABEL: udot4_acc32:
135; GFX11-DL:       ; %bb.0: ; %entry
136; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
137; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
138; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
139; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
140; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
141; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
142; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
143; GFX11-DL-NEXT:    s_clause 0x1
144; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
145; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
146; GFX11-DL-NEXT:    s_load_b32 s0, s[4:5], 0x0
147; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
148; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s0
149; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[4:5]
150; GFX11-DL-NEXT:    s_endpgm
151                                       ptr addrspace(1) %src2,
152                                       ptr addrspace(1) nocapture %dst) {
153entry:
  ; Per-work-item operands: element %idx of each <4 x i8> source array.
154  %idx = call i32 @llvm.amdgcn.workitem.id.x()
155  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
156  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
157  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
158  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
159
  ; Lanes 0..3: zero-extend both bytes to i32 and multiply.
160  %v1e0 = extractelement <4 x i8> %vec1, i64 0
161  %cv1e0 = zext i8 %v1e0 to i32
162  %v2e0 = extractelement <4 x i8> %vec2, i64 0
163  %cv2e0 = zext i8 %v2e0 to i32
164  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
165
166  %v1e1 = extractelement <4 x i8> %vec1, i64 1
167  %cv1e1 = zext i8 %v1e1 to i32
168  %v2e1 = extractelement <4 x i8> %vec2, i64 1
169  %cv2e1 = zext i8 %v2e1 to i32
170  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
171
172  %v1e2 = extractelement <4 x i8> %vec1, i64 2
173  %cv1e2 = zext i8 %v1e2 to i32
174  %v2e2 = extractelement <4 x i8> %vec2, i64 2
175  %cv2e2 = zext i8 %v2e2 to i32
176  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
177
178  %v1e3 = extractelement <4 x i8> %vec1, i64 3
179  %cv1e3 = zext i8 %v1e3 to i32
180  %v2e3 = extractelement <4 x i8> %vec2, i64 3
181  %cv2e3 = zext i8 %v2e3 to i32
182  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
183
  ; Accumulate: the running sum starts from the i32 already stored at %dst and
  ; the products are added in lane order 0, 1, 2, 3.
184  %acc = load i32, ptr addrspace(1) %dst, align 4
185  %mad1 = add i32 %mul1, %acc
186  %mad2 = add i32 %mad1, %mul2
187  %mad3 = add i32 %mad2, %mul3
188  %mad4 = add i32 %mad3, %mul4
189
190  store i32 %mad4, ptr addrspace(1) %dst, align 4
191  ret void
192}
193
; udot4_acc16 - unsigned 4 x i8 dot product accumulated into an i16.
; Same shape as the i32 variant, but every byte is zero-extended to i16, the
; multiplies and adds are done in i16, and the accumulator at %dst is loaded
; and stored as i16 (align 2).
; The autogenerated assertions below still show v_dot4_u32_u8 for the three
; "-DL" check prefixes (with a 16-bit accumulator load and a 16-bit store),
; and v_mad_u32_u24 / v_mad_u16 / v_mad_legacy_u16 chains for the remaining
; prefixes. Do not edit the assertions by hand; regenerate them with
; utils/update_llc_test_checks.py.
194define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
195; GFX7-LABEL: udot4_acc16:
196; GFX7:       ; %bb.0: ; %entry
197; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
198; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
199; GFX7-NEXT:    s_mov_b32 s3, 0xf000
200; GFX7-NEXT:    s_mov_b32 s6, 0
201; GFX7-NEXT:    s_mov_b32 s7, s3
202; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
203; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
204; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
205; GFX7-NEXT:    v_mov_b32_e32 v1, 0
206; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
207; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
208; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
209; GFX7-NEXT:    s_mov_b32 s2, -1
210; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
211; GFX7-NEXT:    s_waitcnt vmcnt(2)
212; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
213; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
214; GFX7-NEXT:    s_waitcnt vmcnt(1)
215; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
216; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
217; GFX7-NEXT:    s_waitcnt vmcnt(0)
218; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
219; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
220; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
221; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
222; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
223; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
224; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
225; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
226; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
227; GFX7-NEXT:    s_endpgm
228;
229; GFX8-LABEL: udot4_acc16:
230; GFX8:       ; %bb.0: ; %entry
231; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
232; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
233; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
234; GFX8-NEXT:    v_mov_b32_e32 v5, 0xff
235; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
236; GFX8-NEXT:    v_mov_b32_e32 v1, s1
237; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
238; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
239; GFX8-NEXT:    flat_load_dword v3, v[0:1]
240; GFX8-NEXT:    v_mov_b32_e32 v1, s3
241; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
242; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
243; GFX8-NEXT:    flat_load_dword v2, v[0:1]
244; GFX8-NEXT:    v_mov_b32_e32 v0, s4
245; GFX8-NEXT:    v_mov_b32_e32 v1, s5
246; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
247; GFX8-NEXT:    s_waitcnt vmcnt(2)
248; GFX8-NEXT:    v_and_b32_e32 v6, 0xff, v3
249; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
250; GFX8-NEXT:    v_and_b32_e32 v8, 0xff, v8
251; GFX8-NEXT:    v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
252; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
253; GFX8-NEXT:    s_waitcnt vmcnt(1)
254; GFX8-NEXT:    v_and_b32_e32 v7, 0xff, v2
255; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
256; GFX8-NEXT:    v_and_b32_e32 v9, 0xff, v9
257; GFX8-NEXT:    s_waitcnt vmcnt(0)
258; GFX8-NEXT:    v_mad_u16 v4, v6, v7, v4
259; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
260; GFX8-NEXT:    v_mad_u16 v4, v8, v9, v4
261; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
262; GFX8-NEXT:    v_mad_u16 v4, v10, v5, v4
263; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
264; GFX8-NEXT:    flat_store_short v[0:1], v2
265; GFX8-NEXT:    s_endpgm
266;
267; GFX9-NODL-LABEL: udot4_acc16:
268; GFX9-NODL:       ; %bb.0: ; %entry
269; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
270; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
271; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
272; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
273; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
274; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
275; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
276; GFX9-NODL-NEXT:    global_load_ushort v3, v0, s[6:7]
277; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
278; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
279; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v1
280; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
281; GFX9-NODL-NEXT:    v_and_b32_e32 v5, 0xff, v2
282; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
283; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
284; GFX9-NODL-NEXT:    v_and_b32_e32 v6, 0xff, v6
285; GFX9-NODL-NEXT:    v_and_b32_e32 v7, 0xff, v7
286; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
287; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
288; GFX9-NODL-NEXT:    v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
289; GFX9-NODL-NEXT:    v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
290; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
291; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
292; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
293; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
294; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
295; GFX9-NODL-NEXT:    global_store_short v0, v1, s[6:7]
296; GFX9-NODL-NEXT:    s_endpgm
297;
298; GFX9-DL-LABEL: udot4_acc16:
299; GFX9-DL:       ; %bb.0: ; %entry
300; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
301; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
302; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
303; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
304; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
305; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
306; GFX9-DL-NEXT:    global_load_dword v3, v0, s[2:3]
307; GFX9-DL-NEXT:    global_load_ushort v4, v1, s[6:7]
308; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
309; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v2, v3, v4
310; GFX9-DL-NEXT:    global_store_short v1, v0, s[6:7]
311; GFX9-DL-NEXT:    s_endpgm
312;
313; GFX10-DL-LABEL: udot4_acc16:
314; GFX10-DL:       ; %bb.0: ; %entry
315; GFX10-DL-NEXT:    s_clause 0x1
316; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
317; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
318; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
319; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
320; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
321; GFX10-DL-NEXT:    s_clause 0x1
322; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
323; GFX10-DL-NEXT:    global_load_dword v3, v0, s[2:3]
324; GFX10-DL-NEXT:    global_load_ushort v4, v1, s[6:7]
325; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
326; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v2, v3, v4
327; GFX10-DL-NEXT:    global_store_short v1, v0, s[6:7]
328; GFX10-DL-NEXT:    s_endpgm
329;
330; GFX11-DL-LABEL: udot4_acc16:
331; GFX11-DL:       ; %bb.0: ; %entry
332; GFX11-DL-NEXT:    s_clause 0x1
333; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
334; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
335; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
336; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
337; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
338; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
339; GFX11-DL-NEXT:    s_clause 0x1
340; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[0:1]
341; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
342; GFX11-DL-NEXT:    global_load_u16 v3, v1, s[4:5]
343; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
344; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v2, v0, v3
345; GFX11-DL-NEXT:    global_store_b16 v1, v0, s[4:5]
346; GFX11-DL-NEXT:    s_endpgm
347                                       ptr addrspace(1) %src2,
348                                       ptr addrspace(1) nocapture %dst) {
349entry:
  ; Per-work-item operands: element %idx of each <4 x i8> source array.
350  %idx = call i32 @llvm.amdgcn.workitem.id.x()
351  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
352  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
353  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
354  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
355
  ; Lanes 0..3: zero-extend both bytes to i16 and multiply in i16.
356  %v1e0 = extractelement <4 x i8> %vec1, i64 0
357  %cv1e0 = zext i8 %v1e0 to i16
358  %v2e0 = extractelement <4 x i8> %vec2, i64 0
359  %cv2e0 = zext i8 %v2e0 to i16
360  %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0
361
362  %v1e1 = extractelement <4 x i8> %vec1, i64 1
363  %cv1e1 = zext i8 %v1e1 to i16
364  %v2e1 = extractelement <4 x i8> %vec2, i64 1
365  %cv2e1 = zext i8 %v2e1 to i16
366  %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1
367
368  %v1e2 = extractelement <4 x i8> %vec1, i64 2
369  %cv1e2 = zext i8 %v1e2 to i16
370  %v2e2 = extractelement <4 x i8> %vec2, i64 2
371  %cv2e2 = zext i8 %v2e2 to i16
372  %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2
373
374  %v1e3 = extractelement <4 x i8> %vec1, i64 3
375  %cv1e3 = zext i8 %v1e3 to i16
376  %v2e3 = extractelement <4 x i8> %vec2, i64 3
377  %cv2e3 = zext i8 %v2e3 to i16
378  %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3
379
  ; Accumulate: the running sum starts from the i16 already stored at %dst and
  ; the products are added in lane order 0, 1, 2, 3.
380  %acc = load i16, ptr addrspace(1) %dst, align 2
381  %mad1 = add i16 %mul1, %acc
382  %mad2 = add i16 %mad1, %mul2
383  %mad3 = add i16 %mad2, %mul3
384  %mad4 = add i16 %mad3, %mul4
385
386  store i16 %mad4, ptr addrspace(1) %dst, align 2
387  ret void
388}
389
; udot4_acc8 - 4 x i8 dot product accumulated into an i8.
; Unlike the i32/i16 variants there is no zext: the four lane products are
; computed directly as i8 multiplies (flagged nuw nsw) and added to the i8
; accumulator loaded from %dst, which is then stored back.
; The autogenerated assertions below show v_dot4_u32_u8 for the three "-DL"
; check prefixes (with a byte accumulator load and a byte store), and
; v_mad_u32_u24 / v_mad_u16 / v_mad_legacy_u16 chains for the remaining
; prefixes. Do not edit the assertions by hand; regenerate them with
; utils/update_llc_test_checks.py.
390define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
391; GFX7-LABEL: udot4_acc8:
392; GFX7:       ; %bb.0: ; %entry
393; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
394; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
395; GFX7-NEXT:    s_mov_b32 s3, 0xf000
396; GFX7-NEXT:    s_mov_b32 s6, 0
397; GFX7-NEXT:    s_mov_b32 s7, s3
398; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
399; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
400; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
401; GFX7-NEXT:    v_mov_b32_e32 v1, 0
402; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
403; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
404; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
405; GFX7-NEXT:    s_mov_b32 s2, -1
406; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
407; GFX7-NEXT:    s_waitcnt vmcnt(2)
408; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
409; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
410; GFX7-NEXT:    s_waitcnt vmcnt(1)
411; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
412; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
413; GFX7-NEXT:    s_waitcnt vmcnt(0)
414; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
415; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
416; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
417; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
418; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
419; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
420; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
421; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
422; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
423; GFX7-NEXT:    s_endpgm
424;
425; GFX8-LABEL: udot4_acc8:
426; GFX8:       ; %bb.0: ; %entry
427; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
428; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
429; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
430; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
431; GFX8-NEXT:    v_mov_b32_e32 v1, s1
432; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
433; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
434; GFX8-NEXT:    flat_load_dword v3, v[0:1]
435; GFX8-NEXT:    v_mov_b32_e32 v1, s3
436; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
437; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
438; GFX8-NEXT:    flat_load_dword v2, v[0:1]
439; GFX8-NEXT:    v_mov_b32_e32 v0, s4
440; GFX8-NEXT:    v_mov_b32_e32 v1, s5
441; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
442; GFX8-NEXT:    s_waitcnt vmcnt(2)
443; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
444; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
445; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
446; GFX8-NEXT:    s_waitcnt vmcnt(1)
447; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
448; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
449; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
450; GFX8-NEXT:    s_waitcnt vmcnt(0)
451; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
452; GFX8-NEXT:    v_mad_u16 v2, v7, v8, v2
453; GFX8-NEXT:    v_mad_u16 v2, v5, v6, v2
454; GFX8-NEXT:    v_mad_u16 v2, v9, v10, v2
455; GFX8-NEXT:    flat_store_byte v[0:1], v2
456; GFX8-NEXT:    s_endpgm
457;
458; GFX9-NODL-LABEL: udot4_acc8:
459; GFX9-NODL:       ; %bb.0: ; %entry
460; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
461; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
462; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
463; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
464; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
465; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
466; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
467; GFX9-NODL-NEXT:    global_load_ubyte v3, v0, s[6:7]
468; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
469; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
470; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
471; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
472; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
473; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
474; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
475; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
476; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
477; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v6, v7, v1
478; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
479; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
480; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v8, v9, v1
481; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[6:7]
482; GFX9-NODL-NEXT:    s_endpgm
483;
484; GFX9-DL-LABEL: udot4_acc8:
485; GFX9-DL:       ; %bb.0: ; %entry
486; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
487; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
488; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
489; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
490; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
491; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
492; GFX9-DL-NEXT:    global_load_dword v3, v0, s[2:3]
493; GFX9-DL-NEXT:    global_load_ubyte v4, v1, s[6:7]
494; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
495; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v2, v3, v4
496; GFX9-DL-NEXT:    global_store_byte v1, v0, s[6:7]
497; GFX9-DL-NEXT:    s_endpgm
498;
499; GFX10-DL-LABEL: udot4_acc8:
500; GFX10-DL:       ; %bb.0: ; %entry
501; GFX10-DL-NEXT:    s_clause 0x1
502; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
503; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
504; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
505; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
506; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
507; GFX10-DL-NEXT:    s_clause 0x1
508; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
509; GFX10-DL-NEXT:    global_load_dword v3, v0, s[2:3]
510; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[6:7]
511; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
512; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v2, v3, v4
513; GFX10-DL-NEXT:    global_store_byte v1, v0, s[6:7]
514; GFX10-DL-NEXT:    s_endpgm
515;
516; GFX11-DL-LABEL: udot4_acc8:
517; GFX11-DL:       ; %bb.0: ; %entry
518; GFX11-DL-NEXT:    s_clause 0x1
519; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
520; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
521; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
522; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
523; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
524; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
525; GFX11-DL-NEXT:    s_clause 0x1
526; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[0:1]
527; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
528; GFX11-DL-NEXT:    global_load_u8 v3, v1, s[4:5]
529; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
530; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v2, v0, v3
531; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[4:5]
532; GFX11-DL-NEXT:    s_endpgm
533                                      ptr addrspace(1) %src2,
534                                      ptr addrspace(1) nocapture %dst) {
535entry:
  ; Per-work-item operands: element %idx of each <4 x i8> source array.
536  %idx = call i32 @llvm.amdgcn.workitem.id.x()
537  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
538  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
539  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
540  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
541
  ; Lanes 0..3: multiply the extracted bytes directly in i8 (no extension).
542  %v1e0 = extractelement <4 x i8> %vec1, i64 0
543  %v2e0 = extractelement <4 x i8> %vec2, i64 0
544  %mul1 = mul nuw nsw i8 %v1e0, %v2e0
545
546  %v1e1 = extractelement <4 x i8> %vec1, i64 1
547  %v2e1 = extractelement <4 x i8> %vec2, i64 1
548  %mul2 = mul nuw nsw i8 %v1e1, %v2e1
549
550  %v1e2 = extractelement <4 x i8> %vec1, i64 2
551  %v2e2 = extractelement <4 x i8> %vec2, i64 2
552  %mul3 = mul nuw nsw i8 %v1e2, %v2e2
553
554  %v1e3 = extractelement <4 x i8> %vec1, i64 3
555  %v2e3 = extractelement <4 x i8> %vec2, i64 3
556  %mul4 = mul nuw nsw i8 %v1e3, %v2e3
557
  ; Accumulate into the i8 at %dst. NOTE(review): the load and store state
  ; align 2 on a one-byte access; left exactly as written in the test.
558  %acc = load i8, ptr addrspace(1) %dst, align 2
559  %mad1 = add i8 %mul1, %acc
560  %mad2 = add i8 %mad1, %mul2
561  %mad3 = add i8 %mad2, %mul3
562  %mad4 = add i8 %mad3, %mul4
563
564  store i8 %mad4, ptr addrspace(1) %dst, align 2
565  ret void
566}
567
; udot2_8 - two-lane i8 dot product accumulated into an i8.
; Only elements 0 and 1 of each loaded <4 x i8> are used: their i8 products
; are added to the i8 accumulator loaded from %dst and the sum is stored back.
; The autogenerated assertions below show that the three "-DL" check prefixes
; still use v_dot4_u32_u8, after applying v_perm_b32 with selector 0xc0c0100
; to each source register (presumably to zero the two unused upper bytes -
; TODO confirm against the ISA description of v_perm_b32). The remaining
; prefixes use two-step v_mad_u32_u24 / v_mad_u16 / v_mad_legacy_u16 chains.
; Do not edit the assertions by hand; regenerate them with
; utils/update_llc_test_checks.py.
568define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
569; GFX7-LABEL: udot2_8:
570; GFX7:       ; %bb.0: ; %entry
571; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
572; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
573; GFX7-NEXT:    s_mov_b32 s7, 0xf000
574; GFX7-NEXT:    s_mov_b32 s10, 0
575; GFX7-NEXT:    s_mov_b32 s11, s7
576; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
577; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
578; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
579; GFX7-NEXT:    v_mov_b32_e32 v1, 0
580; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
581; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
582; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
583; GFX7-NEXT:    s_mov_b32 s6, -1
584; GFX7-NEXT:    buffer_load_ubyte v1, off, s[4:7], 0
585; GFX7-NEXT:    s_waitcnt vmcnt(2)
586; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
587; GFX7-NEXT:    v_bfe_u32 v2, v2, 8, 8
588; GFX7-NEXT:    s_waitcnt vmcnt(1)
589; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
590; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
591; GFX7-NEXT:    s_waitcnt vmcnt(0)
592; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
593; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
594; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
595; GFX7-NEXT:    s_endpgm
596;
597; GFX8-LABEL: udot2_8:
598; GFX8:       ; %bb.0: ; %entry
599; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
600; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
601; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
602; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
603; GFX8-NEXT:    v_mov_b32_e32 v1, s1
604; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
605; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
606; GFX8-NEXT:    flat_load_dword v3, v[0:1]
607; GFX8-NEXT:    v_mov_b32_e32 v1, s3
608; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
609; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
610; GFX8-NEXT:    flat_load_dword v2, v[0:1]
611; GFX8-NEXT:    v_mov_b32_e32 v0, s4
612; GFX8-NEXT:    v_mov_b32_e32 v1, s5
613; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
614; GFX8-NEXT:    s_waitcnt vmcnt(2)
615; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
616; GFX8-NEXT:    s_waitcnt vmcnt(1)
617; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
618; GFX8-NEXT:    s_waitcnt vmcnt(0)
619; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
620; GFX8-NEXT:    v_mad_u16 v2, v5, v6, v2
621; GFX8-NEXT:    flat_store_byte v[0:1], v2
622; GFX8-NEXT:    s_endpgm
623;
624; GFX9-NODL-LABEL: udot2_8:
625; GFX9-NODL:       ; %bb.0: ; %entry
626; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
627; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
628; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
629; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, 0
630; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
631; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[0:1]
632; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[2:3]
633; GFX9-NODL-NEXT:    global_load_ubyte v4, v1, s[6:7]
634; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
635; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
636; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
637; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
638; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
639; GFX9-NODL-NEXT:    v_mad_legacy_u16 v2, v2, v3, v4
640; GFX9-NODL-NEXT:    v_mad_legacy_u16 v0, v0, v5, v2
641; GFX9-NODL-NEXT:    global_store_byte v1, v0, s[6:7]
642; GFX9-NODL-NEXT:    s_endpgm
643;
644; GFX9-DL-LABEL: udot2_8:
645; GFX9-DL:       ; %bb.0: ; %entry
646; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
647; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
648; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
649; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
650; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
651; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
652; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
653; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[6:7]
654; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc0c0100
655; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
656; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s0
657; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
658; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s0
659; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
660; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, v3
661; GFX9-DL-NEXT:    global_store_byte v0, v1, s[6:7]
662; GFX9-DL-NEXT:    s_endpgm
663;
664; GFX10-DL-LABEL: udot2_8:
665; GFX10-DL:       ; %bb.0: ; %entry
666; GFX10-DL-NEXT:    s_clause 0x1
667; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
668; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
669; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
670; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
671; GFX10-DL-NEXT:    s_clause 0x1
672; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
673; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
674; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
675; GFX10-DL-NEXT:    global_load_ubyte v3, v0, s[6:7]
676; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
677; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0100
678; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
679; GFX10-DL-NEXT:    v_perm_b32 v2, v2, v2, 0xc0c0100
680; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
681; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, v3
682; GFX10-DL-NEXT:    global_store_byte v0, v1, s[6:7]
683; GFX10-DL-NEXT:    s_endpgm
684;
685; GFX11-DL-LABEL: udot2_8:
686; GFX11-DL:       ; %bb.0: ; %entry
687; GFX11-DL-NEXT:    s_clause 0x1
688; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
689; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
690; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
691; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
692; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
693; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
694; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
695; GFX11-DL-NEXT:    s_clause 0x1
696; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
697; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
698; GFX11-DL-NEXT:    global_load_u8 v3, v2, s[4:5]
699; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
700; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0100
701; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
702; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc0c0100
703; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
704; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
705; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, v3
706; GFX11-DL-NEXT:    global_store_b8 v2, v0, s[4:5]
707; GFX11-DL-NEXT:    s_endpgm
708                                   ptr addrspace(1) %src2,
709                                   ptr addrspace(1) nocapture %dst) {
710entry:
  ; Per-work-item operands: a full <4 x i8> is loaded from each source even
  ; though only lanes 0 and 1 are consumed below.
711  %idx = call i32 @llvm.amdgcn.workitem.id.x()
712  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
713  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
714  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
715  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
716
717  %v1e0 = extractelement <4 x i8> %vec1, i64 0
718  %v2e0 = extractelement <4 x i8> %vec2, i64 0
719  %mul1 = mul nuw nsw i8 %v1e0, %v2e0
720
721  %v1e1 = extractelement <4 x i8> %vec1, i64 1
722  %v2e1 = extractelement <4 x i8> %vec2, i64 1
723  %mul2 = mul nuw nsw i8 %v1e1, %v2e1
724
  ; Accumulate into the i8 at %dst. NOTE(review): the load and store state
  ; align 2 on a one-byte access; left exactly as written in the test.
725  %acc = load i8, ptr addrspace(1) %dst, align 2
726  %mad1 = add i8 %mul1, %acc
727  %mad2 = add i8 %mad1, %mul2
728  store i8 %mad2, ptr addrspace(1) %dst, align 2
729  ret void
730}
731
; udot4_CommutationInsideMAD - unsigned 4 x i8 dot product accumulated into an
; i8 loaded from %dst, written with commuted operands. Every multiply takes the
; %vec2 element as its first operand, and from %mad2 onwards every add takes the
; product as its first operand and the running sum as its second.
; The check lines for the three -DL prefixes expect the whole chain to be
; selected as a single v_dot4_u32_u8; the check lines for GFX7, GFX8 and
; GFX9-NODL expect a chain of four mad instructions instead.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing them.
732define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
733; GFX7-LABEL: udot4_CommutationInsideMAD:
734; GFX7:       ; %bb.0: ; %entry
735; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
736; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
737; GFX7-NEXT:    s_mov_b32 s3, 0xf000
738; GFX7-NEXT:    s_mov_b32 s6, 0
739; GFX7-NEXT:    s_mov_b32 s7, s3
740; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
741; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
742; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
743; GFX7-NEXT:    v_mov_b32_e32 v1, 0
744; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
745; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
746; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
747; GFX7-NEXT:    s_mov_b32 s2, -1
748; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
749; GFX7-NEXT:    s_waitcnt vmcnt(2)
750; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
751; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
752; GFX7-NEXT:    s_waitcnt vmcnt(1)
753; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
754; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
755; GFX7-NEXT:    s_waitcnt vmcnt(0)
756; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v3, v1
757; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
758; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
759; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v4, v1
760; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
761; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
762; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v5, v1
763; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
764; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
765; GFX7-NEXT:    s_endpgm
766;
767; GFX8-LABEL: udot4_CommutationInsideMAD:
768; GFX8:       ; %bb.0: ; %entry
769; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
770; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
771; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
772; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
773; GFX8-NEXT:    v_mov_b32_e32 v1, s1
774; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
775; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
776; GFX8-NEXT:    flat_load_dword v3, v[0:1]
777; GFX8-NEXT:    v_mov_b32_e32 v1, s3
778; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
779; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
780; GFX8-NEXT:    flat_load_dword v2, v[0:1]
781; GFX8-NEXT:    v_mov_b32_e32 v0, s4
782; GFX8-NEXT:    v_mov_b32_e32 v1, s5
783; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
784; GFX8-NEXT:    s_waitcnt vmcnt(2)
785; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
786; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
787; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
788; GFX8-NEXT:    s_waitcnt vmcnt(1)
789; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
790; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
791; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
792; GFX8-NEXT:    s_waitcnt vmcnt(0)
793; GFX8-NEXT:    v_mad_u16 v2, v2, v3, v4
794; GFX8-NEXT:    v_mad_u16 v2, v8, v7, v2
795; GFX8-NEXT:    v_mad_u16 v2, v6, v5, v2
796; GFX8-NEXT:    v_mad_u16 v2, v10, v9, v2
797; GFX8-NEXT:    flat_store_byte v[0:1], v2
798; GFX8-NEXT:    s_endpgm
799;
800; GFX9-NODL-LABEL: udot4_CommutationInsideMAD:
801; GFX9-NODL:       ; %bb.0: ; %entry
802; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
803; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
804; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
805; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
806; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
807; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
808; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
809; GFX9-NODL-NEXT:    global_load_ubyte v3, v0, s[6:7]
810; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
811; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
812; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
813; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
814; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
815; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
816; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
817; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v2, v1, v3
818; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
819; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v7, v6, v1
820; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
821; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v5, v4, v1
822; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v9, v8, v1
823; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[6:7]
824; GFX9-NODL-NEXT:    s_endpgm
825;
826; GFX9-DL-LABEL: udot4_CommutationInsideMAD:
827; GFX9-DL:       ; %bb.0: ; %entry
828; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
829; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
830; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
831; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
832; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
833; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
834; GFX9-DL-NEXT:    global_load_dword v3, v0, s[2:3]
835; GFX9-DL-NEXT:    global_load_ubyte v4, v1, s[6:7]
836; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
837; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v3, v2, v4
838; GFX9-DL-NEXT:    global_store_byte v1, v0, s[6:7]
839; GFX9-DL-NEXT:    s_endpgm
840;
841; GFX10-DL-LABEL: udot4_CommutationInsideMAD:
842; GFX10-DL:       ; %bb.0: ; %entry
843; GFX10-DL-NEXT:    s_clause 0x1
844; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
845; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
846; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
847; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
848; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
849; GFX10-DL-NEXT:    s_clause 0x1
850; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
851; GFX10-DL-NEXT:    global_load_dword v3, v0, s[2:3]
852; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[6:7]
853; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
854; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v3, v2, v4
855; GFX10-DL-NEXT:    global_store_byte v1, v0, s[6:7]
856; GFX10-DL-NEXT:    s_endpgm
857;
858; GFX11-DL-LABEL: udot4_CommutationInsideMAD:
859; GFX11-DL:       ; %bb.0: ; %entry
860; GFX11-DL-NEXT:    s_clause 0x1
861; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
862; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
863; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
864; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
865; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
866; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
867; GFX11-DL-NEXT:    s_clause 0x1
868; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[0:1]
869; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
870; GFX11-DL-NEXT:    global_load_u8 v3, v1, s[4:5]
871; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
872; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v2, v3
873; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[4:5]
874; GFX11-DL-NEXT:    s_endpgm
875                                                      ptr addrspace(1) %src2,
876                                                      ptr addrspace(1) nocapture %dst) {
877entry:
  ; Each work item loads the <4 x i8> at index workitem.id.x from %src1 and %src2.
878  %idx = call i32 @llvm.amdgcn.workitem.id.x()
879  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
880  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
881  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
882  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
883
  ; Element-wise i8 products; the %vec2 element is the first mul operand in all four.
884  %v1e0 = extractelement <4 x i8> %vec1, i64 0
885  %v2e0 = extractelement <4 x i8> %vec2, i64 0
886  %mul1 = mul nuw nsw i8 %v2e0, %v1e0
887
888  %v1e1 = extractelement <4 x i8> %vec1, i64 1
889  %v2e1 = extractelement <4 x i8> %vec2, i64 1
890  %mul2 = mul nuw nsw i8 %v2e1, %v1e1
891
892  %v1e2 = extractelement <4 x i8> %vec1, i64 2
893  %v2e2 = extractelement <4 x i8> %vec2, i64 2
894  %mul3 = mul nuw nsw i8 %v2e2, %v1e2
895
896  %v1e3 = extractelement <4 x i8> %vec1, i64 3
897  %v2e3 = extractelement <4 x i8> %vec2, i64 3
898  %mul4 = mul nuw nsw i8 %v2e3, %v1e3
899
  ; Accumulate in element order 0..3 onto the i8 read from %dst. %mad1 has the
  ; accumulator first; %mad2..%mad4 have the product first and the running sum second.
900  %acc = load i8, ptr addrspace(1) %dst, align 2
901  %mad1 = add i8 %acc, %mul1
902  %mad2 = add i8 %mul2, %mad1
903  %mad3 = add i8 %mul3, %mad2
904  %mad4 = add i8 %mul4, %mad3
905
  ; The i8 result is written back to the same %dst location the accumulator came from.
906  store i8 %mad4, ptr addrspace(1) %dst, align 2
907  ret void
908}
909
; udot4_CommutationAccrossMADs - unsigned 4 x i8 dot product accumulated into an
; i8 loaded from %dst, where the products are NOT added in element order: the
; element-1 product (%mul2) is added to the accumulator first, then the
; element-0 product (%mul1), then elements 2 and 3.
; The check lines for the three -DL prefixes still expect a single
; v_dot4_u32_u8; the check lines for GFX7, GFX8 and GFX9-NODL expect four mad
; instructions whose first mad consumes the byte-1 fields.
; NOTE(review): "Accross" in the function name is a misspelling of "Across";
; renaming it would require regenerating every -LABEL check line below.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing them.
910define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
911; GFX7-LABEL: udot4_CommutationAccrossMADs:
912; GFX7:       ; %bb.0: ; %entry
913; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
914; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
915; GFX7-NEXT:    s_mov_b32 s3, 0xf000
916; GFX7-NEXT:    s_mov_b32 s6, 0
917; GFX7-NEXT:    s_mov_b32 s7, s3
918; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
919; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
920; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
921; GFX7-NEXT:    v_mov_b32_e32 v1, 0
922; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
923; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
924; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
925; GFX7-NEXT:    s_mov_b32 s2, -1
926; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
927; GFX7-NEXT:    s_waitcnt vmcnt(2)
928; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
929; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
930; GFX7-NEXT:    s_waitcnt vmcnt(1)
931; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
932; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
933; GFX7-NEXT:    s_waitcnt vmcnt(0)
934; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v4, v1
935; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
936; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
937; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v3, v1
938; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
939; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
940; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v5, v1
941; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
942; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
943; GFX7-NEXT:    s_endpgm
944;
945; GFX8-LABEL: udot4_CommutationAccrossMADs:
946; GFX8:       ; %bb.0: ; %entry
947; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
948; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
949; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
950; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
951; GFX8-NEXT:    v_mov_b32_e32 v1, s1
952; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
953; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
954; GFX8-NEXT:    flat_load_dword v3, v[0:1]
955; GFX8-NEXT:    v_mov_b32_e32 v1, s3
956; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
957; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
958; GFX8-NEXT:    flat_load_dword v2, v[0:1]
959; GFX8-NEXT:    v_mov_b32_e32 v0, s4
960; GFX8-NEXT:    v_mov_b32_e32 v1, s5
961; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
962; GFX8-NEXT:    s_waitcnt vmcnt(2)
963; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
964; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
965; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
966; GFX8-NEXT:    s_waitcnt vmcnt(1)
967; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
968; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
969; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
970; GFX8-NEXT:    s_waitcnt vmcnt(0)
971; GFX8-NEXT:    v_mad_u16 v4, v8, v7, v4
972; GFX8-NEXT:    v_mad_u16 v2, v2, v3, v4
973; GFX8-NEXT:    v_mad_u16 v2, v6, v5, v2
974; GFX8-NEXT:    v_mad_u16 v2, v10, v9, v2
975; GFX8-NEXT:    flat_store_byte v[0:1], v2
976; GFX8-NEXT:    s_endpgm
977;
978; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs:
979; GFX9-NODL:       ; %bb.0: ; %entry
980; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
981; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
982; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
983; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
984; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
985; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
986; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
987; GFX9-NODL-NEXT:    global_load_ubyte v3, v0, s[6:7]
988; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
989; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
990; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
991; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
992; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
993; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
994; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v7, v6, v3
995; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
996; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
997; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v2, v1, v3
998; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
999; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v5, v4, v1
1000; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v9, v8, v1
1001; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[6:7]
1002; GFX9-NODL-NEXT:    s_endpgm
1003;
1004; GFX9-DL-LABEL: udot4_CommutationAccrossMADs:
1005; GFX9-DL:       ; %bb.0: ; %entry
1006; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1007; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1008; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1009; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
1010; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1011; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
1012; GFX9-DL-NEXT:    global_load_dword v3, v0, s[2:3]
1013; GFX9-DL-NEXT:    global_load_ubyte v4, v1, s[6:7]
1014; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1015; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v3, v2, v4
1016; GFX9-DL-NEXT:    global_store_byte v1, v0, s[6:7]
1017; GFX9-DL-NEXT:    s_endpgm
1018;
1019; GFX10-DL-LABEL: udot4_CommutationAccrossMADs:
1020; GFX10-DL:       ; %bb.0: ; %entry
1021; GFX10-DL-NEXT:    s_clause 0x1
1022; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1023; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1024; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1025; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
1026; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1027; GFX10-DL-NEXT:    s_clause 0x1
1028; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
1029; GFX10-DL-NEXT:    global_load_dword v3, v0, s[2:3]
1030; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[6:7]
1031; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1032; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v3, v2, v4
1033; GFX10-DL-NEXT:    global_store_byte v1, v0, s[6:7]
1034; GFX10-DL-NEXT:    s_endpgm
1035;
1036; GFX11-DL-LABEL: udot4_CommutationAccrossMADs:
1037; GFX11-DL:       ; %bb.0: ; %entry
1038; GFX11-DL-NEXT:    s_clause 0x1
1039; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1040; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1041; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1042; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1043; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1044; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
1045; GFX11-DL-NEXT:    s_clause 0x1
1046; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[0:1]
1047; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
1048; GFX11-DL-NEXT:    global_load_u8 v3, v1, s[4:5]
1049; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
1050; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v2, v3
1051; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[4:5]
1052; GFX11-DL-NEXT:    s_endpgm
1053                                                        ptr addrspace(1) %src2,
1054                                                        ptr addrspace(1) nocapture %dst) {
1055entry:
  ; Each work item loads the <4 x i8> at index workitem.id.x from %src1 and %src2.
1056  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1057  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
1058  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
1059  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
1060  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
1061
  ; Element-wise i8 products; the %vec2 element is the first mul operand in all four.
1062  %v1e0 = extractelement <4 x i8> %vec1, i64 0
1063  %v2e0 = extractelement <4 x i8> %vec2, i64 0
1064  %mul1 = mul nuw nsw i8 %v2e0, %v1e0
1065
1066  %v1e1 = extractelement <4 x i8> %vec1, i64 1
1067  %v2e1 = extractelement <4 x i8> %vec2, i64 1
1068  %mul2 = mul nuw nsw i8 %v2e1, %v1e1
1069
1070  %v1e2 = extractelement <4 x i8> %vec1, i64 2
1071  %v2e2 = extractelement <4 x i8> %vec2, i64 2
1072  %mul3 = mul nuw nsw i8 %v2e2, %v1e2
1073
1074  %v1e3 = extractelement <4 x i8> %vec1, i64 3
1075  %v2e3 = extractelement <4 x i8> %vec2, i64 3
1076  %mul4 = mul nuw nsw i8 %v2e3, %v1e3
1077
  ; Accumulation order is element 1, 0, 2, 3: %mul2 is added to %acc before %mul1.
1078  %acc = load i8, ptr addrspace(1) %dst, align 2
1079  %mad1 = add i8 %acc, %mul2
1080  %mad2 = add i8 %mad1, %mul1
1081  %mad3 = add i8 %mad2, %mul3
1082  %mad4 = add i8 %mad3, %mul4
1083
  ; The i8 result is written back to the same %dst location the accumulator came from.
1084  store i8 %mad4, ptr addrspace(1) %dst, align 2
1085  ret void
1086}
1087
; udot4_multiuse_mul1 - unsigned 4 x i8 dot product with i32 accumulation
; (elements are zero-extended to i32 before multiplying) in which the element-0
; product %mul1 has two uses: it is added to the accumulator in %add and added
; again in %add2, so the stored value contains %mul1 twice.
; The check lines for the three -DL prefixes expect a v_mad_u32_u24 (byte-0
; product plus the loaded accumulator) whose result feeds v_dot4_u32_u8 as its
; accumulator operand. The check lines for GFX7 and GFX8 expect five
; v_mad_u32_u24; the check lines for GFX9-NODL expect separate multiplies
; combined with v_add3_u32.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing them.
1088define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
1089; GFX7-LABEL: udot4_multiuse_mul1:
1090; GFX7:       ; %bb.0: ; %entry
1091; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
1092; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
1093; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1094; GFX7-NEXT:    s_mov_b32 s6, 0
1095; GFX7-NEXT:    s_mov_b32 s7, s3
1096; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1097; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
1098; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1099; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1100; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1101; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
1102; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1103; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1104; GFX7-NEXT:    s_mov_b32 s2, -1
1105; GFX7-NEXT:    s_waitcnt vmcnt(1)
1106; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
1107; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
1108; GFX7-NEXT:    s_waitcnt vmcnt(0)
1109; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
1110; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
1111; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1112; GFX7-NEXT:    v_mad_u32_u24 v8, v1, v5, s4
1113; GFX7-NEXT:    v_mad_u32_u24 v3, v3, v6, v8
1114; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
1115; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
1116; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, v3
1117; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1118; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1119; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
1120; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
1121; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1122; GFX7-NEXT:    s_endpgm
1123;
1124; GFX8-LABEL: udot4_multiuse_mul1:
1125; GFX8:       ; %bb.0: ; %entry
1126; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1127; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1128; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1129; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1130; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1131; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1132; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1133; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1134; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1135; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1136; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1137; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1138; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
1139; GFX8-NEXT:    s_waitcnt vmcnt(1)
1140; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
1141; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
1142; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 8
1143; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
1144; GFX8-NEXT:    s_waitcnt vmcnt(0)
1145; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
1146; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
1147; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1148; GFX8-NEXT:    v_mad_u32_u24 v8, v1, v2, s0
1149; GFX8-NEXT:    v_mad_u32_u24 v4, v4, v5, v8
1150; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 8
1151; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, v4
1152; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1153; GFX8-NEXT:    v_mad_u32_u24 v1, v6, v7, v1
1154; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
1155; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1156; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1157; GFX8-NEXT:    flat_store_dword v[0:1], v2
1158; GFX8-NEXT:    s_endpgm
1159;
1160; GFX9-NODL-LABEL: udot4_multiuse_mul1:
1161; GFX9-NODL:       ; %bb.0: ; %entry
1162; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1163; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1164; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1165; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1166; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
1167; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
1168; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
1169; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1170; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
1171; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
1172; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1173; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v2
1174; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1175; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1176; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1177; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v2, v3, v4
1178; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1179; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, v3, v4, s0
1180; GFX9-NODL-NEXT:    v_add3_u32 v2, v5, v3, v2
1181; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
1182; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
1183; GFX9-NODL-NEXT:    s_endpgm
1184;
1185; GFX9-DL-LABEL: udot4_multiuse_mul1:
1186; GFX9-DL:       ; %bb.0: ; %entry
1187; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1188; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1189; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1190; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1191; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1192; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1193; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1194; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1195; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1196; GFX9-DL-NEXT:    v_and_b32_e32 v3, 0xff, v1
1197; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1198; GFX9-DL-NEXT:    v_and_b32_e32 v4, 0xff, v2
1199; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1200; GFX9-DL-NEXT:    v_mad_u32_u24 v3, v3, v4, s0
1201; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, v3
1202; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
1203; GFX9-DL-NEXT:    s_endpgm
1204;
1205; GFX10-DL-LABEL: udot4_multiuse_mul1:
1206; GFX10-DL:       ; %bb.0: ; %entry
1207; GFX10-DL-NEXT:    s_clause 0x1
1208; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1209; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1210; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1211; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1212; GFX10-DL-NEXT:    s_clause 0x1
1213; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1214; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1215; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
1216; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1217; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1218; GFX10-DL-NEXT:    v_and_b32_e32 v0, 0xff, v1
1219; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1220; GFX10-DL-NEXT:    v_and_b32_e32 v3, 0xff, v2
1221; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1222; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, v3, s0
1223; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
1224; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v2, v0
1225; GFX10-DL-NEXT:    global_store_dword v3, v0, s[6:7]
1226; GFX10-DL-NEXT:    s_endpgm
1227;
1228; GFX11-DL-LABEL: udot4_multiuse_mul1:
1229; GFX11-DL:       ; %bb.0: ; %entry
1230; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1231; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1232; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1233; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1234; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1235; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
1236; GFX11-DL-NEXT:    s_clause 0x1
1237; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
1238; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
1239; GFX11-DL-NEXT:    s_load_b32 s0, s[4:5], 0x0
1240; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
1241; GFX11-DL-NEXT:    v_and_b32_e32 v2, 0xff, v1
1242; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
1243; GFX11-DL-NEXT:    v_and_b32_e32 v3, 0xff, v0
1244; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
1245; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1246; GFX11-DL-NEXT:    v_mad_u32_u24 v2, v2, v3, s0
1247; GFX11-DL-NEXT:    v_mov_b32_e32 v3, 0
1248; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, v2
1249; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[4:5]
1250; GFX11-DL-NEXT:    s_endpgm
1251                                               ptr addrspace(1) %src2,
1252                                               ptr addrspace(1) nocapture %dst) {
1253entry:
  ; Each work item loads the <4 x i8> at index workitem.id.x from %src1 and %src2.
1254  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1255  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
1256  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
1257  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
1258  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
1259
  ; Each element pair is zero-extended from i8 to i32 and multiplied as i32.
1260  %v1e0 = extractelement <4 x i8> %vec1, i64 0
1261  %cv1e0 = zext i8 %v1e0 to i32
1262  %v2e0 = extractelement <4 x i8> %vec2, i64 0
1263  %cv2e0 = zext i8 %v2e0 to i32
1264  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
1265
1266  %v1e1 = extractelement <4 x i8> %vec1, i64 1
1267  %cv1e1 = zext i8 %v1e1 to i32
1268  %v2e1 = extractelement <4 x i8> %vec2, i64 1
1269  %cv2e1 = zext i8 %v2e1 to i32
1270  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
1271
1272  %v1e2 = extractelement <4 x i8> %vec1, i64 2
1273  %cv1e2 = zext i8 %v1e2 to i32
1274  %v2e2 = extractelement <4 x i8> %vec2, i64 2
1275  %cv2e2 = zext i8 %v2e2 to i32
1276  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
1277
1278  %v1e3 = extractelement <4 x i8> %vec1, i64 3
1279  %cv1e3 = zext i8 %v1e3 to i32
1280  %v2e3 = extractelement <4 x i8> %vec2, i64 3
1281  %cv2e3 = zext i8 %v2e3 to i32
1282  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
1283
  ; %mul1 is used twice: once in %add (with the accumulator) and again in %add2.
  ; The stored value is therefore %acc + 2 * %mul1 + %mul2 + %mul3 + %mul4.
1284  %acc = load i32, ptr addrspace(1) %dst, align 4
1285  %add = add i32 %mul1, %acc
1286  %add1 = add i32 %mul2, %add
1287  %add2 = add i32 %add1, %mul1
1288  %add3 = add i32 %add2, %mul3
1289  %add4 = add i32 %add3, %mul4
1290
  ; The i32 result is written back to the same %dst location the accumulator came from.
1291  store i32 %add4, ptr addrspace(1) %dst, align 4
1292  ret void
1293}
1294
1295define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
1296; GFX7-LABEL: udot4_multiuse_add1:
1297; GFX7:       ; %bb.0: ; %entry
1298; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
1299; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
1300; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1301; GFX7-NEXT:    s_mov_b32 s6, 0
1302; GFX7-NEXT:    s_mov_b32 s7, s3
1303; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1304; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
1305; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1306; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1307; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1308; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
1309; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1310; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1311; GFX7-NEXT:    s_mov_b32 s2, -1
1312; GFX7-NEXT:    s_waitcnt vmcnt(1)
1313; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
1314; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
1315; GFX7-NEXT:    s_waitcnt vmcnt(0)
1316; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
1317; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
1318; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1319; GFX7-NEXT:    v_mad_u32_u24 v3, v3, v6, s4
1320; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
1321; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
1322; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, v3
1323; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1324; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1325; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
1326; GFX7-NEXT:    v_add_i32_e32 v6, vcc, s4, v3
1327; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
1328; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
1329; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1330; GFX7-NEXT:    s_endpgm
1331;
1332; GFX8-LABEL: udot4_multiuse_add1:
1333; GFX8:       ; %bb.0: ; %entry
1334; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1335; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1336; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1337; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1338; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1339; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1340; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1341; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1342; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1343; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1344; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1345; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1346; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
1347; GFX8-NEXT:    s_waitcnt vmcnt(1)
1348; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
1349; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
1350; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 8
1351; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
1352; GFX8-NEXT:    s_waitcnt vmcnt(0)
1353; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
1354; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
1355; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1356; GFX8-NEXT:    v_mad_u32_u24 v4, v4, v5, s0
1357; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 8
1358; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, v4
1359; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1360; GFX8-NEXT:    v_mad_u32_u24 v1, v6, v7, v1
1361; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v4
1362; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v0, v1
1363; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v5
1364; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1365; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1366; GFX8-NEXT:    flat_store_dword v[0:1], v2
1367; GFX8-NEXT:    s_endpgm
1368;
1369; GFX9-NODL-LABEL: udot4_multiuse_add1:
1370; GFX9-NODL:       ; %bb.0: ; %entry
1371; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1372; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1373; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1374; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1375; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
1376; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
1377; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
1378; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1379; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
1380; GFX9-NODL-NEXT:    v_bfe_u32 v4, v1, 8, 8
1381; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1382; GFX9-NODL-NEXT:    v_bfe_u32 v5, v2, 8, 8
1383; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1384; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1385; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1386; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1387; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v4, v5, s0
1388; GFX9-NODL-NEXT:    v_add_u32_e32 v4, s0, v2
1389; GFX9-NODL-NEXT:    v_add3_u32 v2, v2, v3, v6
1390; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v1, v4
1391; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
1392; GFX9-NODL-NEXT:    s_endpgm
1393;
1394; GFX9-DL-LABEL: udot4_multiuse_add1:
1395; GFX9-DL:       ; %bb.0: ; %entry
1396; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1397; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1398; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1399; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1400; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1401; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1402; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1403; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1404; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1405; GFX9-DL-NEXT:    s_add_i32 s1, s0, s0
1406; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1407; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1408; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s0
1409; GFX9-DL-NEXT:    v_add3_u32 v1, s1, v3, v1
1410; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
1411; GFX9-DL-NEXT:    s_endpgm
1412;
1413; GFX10-DL-LABEL: udot4_multiuse_add1:
1414; GFX10-DL:       ; %bb.0: ; %entry
1415; GFX10-DL-NEXT:    s_clause 0x1
1416; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1417; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1418; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1419; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1420; GFX10-DL-NEXT:    s_clause 0x1
1421; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1422; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1423; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
1424; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1425; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1426; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1427; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1428; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s0
1429; GFX10-DL-NEXT:    s_add_i32 s0, s0, s0
1430; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1431; GFX10-DL-NEXT:    v_add3_u32 v0, s0, v0, v1
1432; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
1433; GFX10-DL-NEXT:    s_endpgm
1434;
1435; GFX11-DL-LABEL: udot4_multiuse_add1:
1436; GFX11-DL:       ; %bb.0: ; %entry
1437; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1438; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1439; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1440; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1441; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1442; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
1443; GFX11-DL-NEXT:    s_clause 0x1
1444; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
1445; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
1446; GFX11-DL-NEXT:    s_load_b32 s0, s[4:5], 0x0
1447; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
1448; GFX11-DL-NEXT:    v_bfe_u32 v2, v1, 8, 8
1449; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
1450; GFX11-DL-NEXT:    v_bfe_u32 v3, v0, 8, 8
1451; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
1452; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s0
1453; GFX11-DL-NEXT:    v_mov_b32_e32 v1, 0
1454; GFX11-DL-NEXT:    s_add_i32 s0, s0, s0
1455; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v2, v2, v3
1456; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1457; GFX11-DL-NEXT:    v_add3_u32 v0, s0, v2, v0
1458; GFX11-DL-NEXT:    global_store_b32 v1, v0, s[4:5]
1459; GFX11-DL-NEXT:    s_endpgm
1460                                               ptr addrspace(1) %src2,
1461                                               ptr addrspace(1) nocapture %dst) {
1462entry:
1463  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1464  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
1465  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
1466  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
1467  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
1468
1469  %v1e0 = extractelement <4 x i8> %vec1, i64 0
1470  %cv1e0 = zext i8 %v1e0 to i32
1471  %v2e0 = extractelement <4 x i8> %vec2, i64 0
1472  %cv2e0 = zext i8 %v2e0 to i32
1473  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
1474
1475  %v1e1 = extractelement <4 x i8> %vec1, i64 1
1476  %cv1e1 = zext i8 %v1e1 to i32
1477  %v2e1 = extractelement <4 x i8> %vec2, i64 1
1478  %cv2e1 = zext i8 %v2e1 to i32
1479  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
1480
1481  %v1e2 = extractelement <4 x i8> %vec1, i64 2
1482  %cv1e2 = zext i8 %v1e2 to i32
1483  %v2e2 = extractelement <4 x i8> %vec2, i64 2
1484  %cv2e2 = zext i8 %v2e2 to i32
1485  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
1486
1487  %v1e3 = extractelement <4 x i8> %vec1, i64 3
1488  %cv1e3 = zext i8 %v1e3 to i32
1489  %v2e3 = extractelement <4 x i8> %vec2, i64 3
1490  %cv2e3 = zext i8 %v2e3 to i32
1491  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
1492
1493  %acc = load i32, ptr addrspace(1) %dst, align 4
1494  %add1 = add i32 %mul2, %acc
1495  %add = add i32 %add1, %acc
1496  %add2 = add i32 %add1, %mul1
1497  %add3 = add i32 %add2, %mul3
1498  %add4 = add i32 %add3, %mul4
1499  %res = add i32 %add4, %add
1500  store i32 %res, ptr addrspace(1) %dst, align 4
1501  ret void
1502}
1503
; Mixed-extension test: 4-element i8 multiply-accumulate into an i16
; accumulator. Element 0 of both input vectors is sign-extended, while
; elements 1-3 are zero-extended, so the sum is not a uniform unsigned dot4.
; In the checks below, the GFX9-DL, GFX10-DL and GFX11-DL outputs still use
; v_dot4_u32_u8 (fed by v_perm_b32 with selector 0xc0c0302) next to two
; separate 16-bit mads; the GFX7, GFX8 and GFX9-NODL outputs use four mads.
1504define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
1505; GFX7-LABEL: notdot4_mixedtypes:
1506; GFX7:       ; %bb.0: ; %entry
1507; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
1508; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
1509; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1510; GFX7-NEXT:    s_mov_b32 s6, 0
1511; GFX7-NEXT:    s_mov_b32 s7, s3
1512; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1513; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
1514; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1515; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1516; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1517; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
1518; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1519; GFX7-NEXT:    s_mov_b32 s2, -1
1520; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
1521; GFX7-NEXT:    s_waitcnt vmcnt(2)
1522; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
1523; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
1524; GFX7-NEXT:    s_waitcnt vmcnt(1)
1525; GFX7-NEXT:    v_bfe_i32 v6, v0, 0, 8
1526; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
1527; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
1528; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
1529; GFX7-NEXT:    s_waitcnt vmcnt(0)
1530; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
1531; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
1532; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
1533; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
1534; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1535; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1536; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
1537; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
1538; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
1539; GFX7-NEXT:    s_endpgm
1540;
1541; GFX8-LABEL: notdot4_mixedtypes:
1542; GFX8:       ; %bb.0: ; %entry
1543; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1544; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1545; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1546; GFX8-NEXT:    v_mov_b32_e32 v5, 0xff
1547; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1548; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1549; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1550; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1551; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1552; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1553; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1554; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1555; GFX8-NEXT:    flat_load_dword v2, v[0:1]
1556; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1557; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1558; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
1559; GFX8-NEXT:    s_waitcnt vmcnt(2)
1560; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
1561; GFX8-NEXT:    v_and_b32_e32 v8, 0xff, v8
1562; GFX8-NEXT:    v_bfe_i32 v6, v3, 0, 8
1563; GFX8-NEXT:    v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1564; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
1565; GFX8-NEXT:    s_waitcnt vmcnt(1)
1566; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
1567; GFX8-NEXT:    v_and_b32_e32 v9, 0xff, v9
1568; GFX8-NEXT:    v_bfe_i32 v7, v2, 0, 8
1569; GFX8-NEXT:    s_waitcnt vmcnt(0)
1570; GFX8-NEXT:    v_mad_u16 v4, v8, v9, v4
1571; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1572; GFX8-NEXT:    v_mad_u16 v4, v6, v7, v4
1573; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1574; GFX8-NEXT:    v_mad_u16 v4, v10, v5, v4
1575; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
1576; GFX8-NEXT:    flat_store_short v[0:1], v2
1577; GFX8-NEXT:    s_endpgm
1578;
1579; GFX9-NODL-LABEL: notdot4_mixedtypes:
1580; GFX9-NODL:       ; %bb.0: ; %entry
1581; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1582; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1583; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1584; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1585; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
1586; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
1587; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1588; GFX9-NODL-NEXT:    global_load_ushort v3, v0, s[6:7]
1589; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
1590; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
1591; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
1592; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
1593; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
1594; GFX9-NODL-NEXT:    v_and_b32_e32 v6, 0xff, v6
1595; GFX9-NODL-NEXT:    v_and_b32_e32 v7, 0xff, v7
1596; GFX9-NODL-NEXT:    v_bfe_i32 v4, v1, 0, 8
1597; GFX9-NODL-NEXT:    v_bfe_i32 v5, v2, 0, 8
1598; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1599; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
1600; GFX9-NODL-NEXT:    v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1601; GFX9-NODL-NEXT:    v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1602; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
1603; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
1604; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1605; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
1606; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1607; GFX9-NODL-NEXT:    global_store_short v0, v1, s[6:7]
1608; GFX9-NODL-NEXT:    s_endpgm
1609;
1610; GFX9-DL-LABEL: notdot4_mixedtypes:
1611; GFX9-DL:       ; %bb.0: ; %entry
1612; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1613; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1614; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1615; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1616; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1617; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1618; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1619; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[6:7]
1620; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc0c0302
1621; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
1622; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
1623; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1624; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
1625; GFX9-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
1626; GFX9-DL-NEXT:    v_and_b32_e32 v7, 0xff, v7
1627; GFX9-DL-NEXT:    v_bfe_i32 v4, v1, 0, 8
1628; GFX9-DL-NEXT:    v_bfe_i32 v5, v2, 0, 8
1629; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1630; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
1631; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s0
1632; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
1633; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s0
1634; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, v3
1635; GFX9-DL-NEXT:    global_store_short v0, v1, s[6:7]
1636; GFX9-DL-NEXT:    s_endpgm
1637;
1638; GFX10-DL-LABEL: notdot4_mixedtypes:
1639; GFX10-DL:       ; %bb.0: ; %entry
1640; GFX10-DL-NEXT:    s_clause 0x1
1641; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1642; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1643; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1644; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1645; GFX10-DL-NEXT:    s_clause 0x1
1646; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1647; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1648; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1649; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[6:7]
1650; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
1651; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
1652; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1653; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
1654; GFX10-DL-NEXT:    v_bfe_i32 v6, v1, 0, 8
1655; GFX10-DL-NEXT:    v_bfe_i32 v7, v2, 0, 8
1656; GFX10-DL-NEXT:    v_perm_b32 v2, v2, v2, 0xc0c0302
1657; GFX10-DL-NEXT:    v_and_b32_e32 v4, 0xff, v4
1658; GFX10-DL-NEXT:    v_and_b32_e32 v5, 0xff, v5
1659; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0302
1660; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1661; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
1662; GFX10-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
1663; GFX10-DL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
1664; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, v3
1665; GFX10-DL-NEXT:    global_store_short v0, v1, s[6:7]
1666; GFX10-DL-NEXT:    s_endpgm
1667;
1668; GFX11-DL-LABEL: notdot4_mixedtypes:
1669; GFX11-DL:       ; %bb.0: ; %entry
1670; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1671; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1672; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1673; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1674; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1675; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
1676; GFX11-DL-NEXT:    s_clause 0x1
1677; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
1678; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
1679; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
1680; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
1681; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
1682; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
1683; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
1684; GFX11-DL-NEXT:    v_bfe_i32 v6, v1, 0, 8
1685; GFX11-DL-NEXT:    v_bfe_i32 v7, v0, 0, 8
1686; GFX11-DL-NEXT:    v_and_b32_e32 v4, 0xff, v4
1687; GFX11-DL-NEXT:    v_and_b32_e32 v5, 0xff, v5
1688; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[4:5]
1689; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc0c0302
1690; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0302
1691; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
1692; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
1693; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1694; GFX11-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
1695; GFX11-DL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
1696; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1697; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, v3
1698; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[4:5]
1699; GFX11-DL-NEXT:    s_endpgm
1700                                              ptr addrspace(1) %src2,
1701                                              ptr addrspace(1) nocapture %dst) {
1702entry:
  ; Each work-item loads one <4 x i8> from %src1 and one from %src2, both
  ; indexed by its workitem id in x.
1703  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1704  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
1705  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
1706  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
1707  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
1708
  ; Element 0: both operands are sign-extended to i16 (the odd one out).
1709  %v1e0 = extractelement <4 x i8> %vec1, i64 0
1710  %cv1e0 = sext i8 %v1e0 to i16
1711  %v2e0 = extractelement <4 x i8> %vec2, i64 0
1712  %cv2e0 = sext i8 %v2e0 to i16
1713  %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0
1714
  ; Element 1: both operands are zero-extended to i16.
1715  %v1e1 = extractelement <4 x i8> %vec1, i64 1
1716  %cv1e1 = zext i8 %v1e1 to i16
1717  %v2e1 = extractelement <4 x i8> %vec2, i64 1
1718  %cv2e1 = zext i8 %v2e1 to i16
1719  %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1
1720
  ; Element 2: both operands are zero-extended to i16.
1721  %v1e2 = extractelement <4 x i8> %vec1, i64 2
1722  %cv1e2 = zext i8 %v1e2 to i16
1723  %v2e2 = extractelement <4 x i8> %vec2, i64 2
1724  %cv2e2 = zext i8 %v2e2 to i16
1725  %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2
1726
  ; Element 3: both operands are zero-extended to i16.
1727  %v1e3 = extractelement <4 x i8> %vec1, i64 3
1728  %cv1e3 = zext i8 %v1e3 to i16
1729  %v2e3 = extractelement <4 x i8> %vec2, i64 3
1730  %cv2e3 = zext i8 %v2e3 to i16
1731  %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3
1732
  ; Read the i16 accumulator from %dst and add the products to it in the
  ; order mul2, mul1, mul3, mul4.
1733  %acc = load i16, ptr addrspace(1) %dst, align 2
1734  %add1 = add i16 %mul2, %acc
1735  %add2 = add i16 %add1, %mul1
1736  %add3 = add i16 %add2, %mul3
1737  %add4 = add i16 %add3, %mul4
1738
  ; Write the final sum back to the same %dst location.
1739  store i16 %add4, ptr addrspace(1) %dst, align 2
1740  ret void
1741}
1742
1743
; Negative test: 4-element i8 multiply-accumulate into an i16 accumulator
; where every product pairs one sign-extended operand with one zero-extended
; operand (%vec1 is sext for elements 0 and 2, %vec2 is sext for elements 1
; and 3). None of the check blocks below contains a dot4 instruction; each
; target's output is a chain of four mad instructions.
1744define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
1745; GFX7-LABEL: notdot4_mixedtypes2:
1746; GFX7:       ; %bb.0: ; %entry
1747; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
1748; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
1749; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1750; GFX7-NEXT:    s_mov_b32 s6, 0
1751; GFX7-NEXT:    s_mov_b32 s7, s3
1752; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1753; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
1754; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1755; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1756; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1757; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
1758; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1759; GFX7-NEXT:    s_mov_b32 s2, -1
1760; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
1761; GFX7-NEXT:    s_waitcnt vmcnt(2)
1762; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
1763; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
1764; GFX7-NEXT:    s_waitcnt vmcnt(1)
1765; GFX7-NEXT:    v_bfe_i32 v7, v0, 8, 8
1766; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v7
1767; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 8
1768; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
1769; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
1770; GFX7-NEXT:    s_waitcnt vmcnt(0)
1771; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
1772; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v5
1773; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
1774; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
1775; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
1776; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1777; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1778; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
1779; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
1780; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
1781; GFX7-NEXT:    s_endpgm
1782;
1783; GFX8-LABEL: notdot4_mixedtypes2:
1784; GFX8:       ; %bb.0: ; %entry
1785; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1786; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1787; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1788; GFX8-NEXT:    v_mov_b32_e32 v5, 0xff
1789; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1790; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1791; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1792; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1793; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1794; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1795; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1796; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1797; GFX8-NEXT:    flat_load_dword v2, v[0:1]
1798; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1799; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1800; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
1801; GFX8-NEXT:    s_waitcnt vmcnt(2)
1802; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
1803; GFX8-NEXT:    v_and_b32_e32 v9, 0xff, v9
1804; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
1805; GFX8-NEXT:    v_bfe_i32 v7, v3, 0, 8
1806; GFX8-NEXT:    v_bfe_i32 v6, v6, 0, 8
1807; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
1808; GFX8-NEXT:    s_waitcnt vmcnt(1)
1809; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v2
1810; GFX8-NEXT:    v_bfe_i32 v10, v10, 0, 8
1811; GFX8-NEXT:    v_and_b32_e32 v8, 0xff, v2
1812; GFX8-NEXT:    s_waitcnt vmcnt(0)
1813; GFX8-NEXT:    v_mad_u16 v4, v9, v10, v4
1814; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1815; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1816; GFX8-NEXT:    v_mad_u16 v4, v7, v8, v4
1817; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 8
1818; GFX8-NEXT:    v_mad_u16 v4, v6, v5, v4
1819; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
1820; GFX8-NEXT:    flat_store_short v[0:1], v2
1821; GFX8-NEXT:    s_endpgm
1822;
1823; GFX9-NODL-LABEL: notdot4_mixedtypes2:
1824; GFX9-NODL:       ; %bb.0: ; %entry
1825; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1826; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1827; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1828; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1829; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
1830; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
1831; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1832; GFX9-NODL-NEXT:    global_load_ushort v3, v0, s[6:7]
1833; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
1834; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
1835; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
1836; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
1837; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
1838; GFX9-NODL-NEXT:    v_and_b32_e32 v7, 0xff, v7
1839; GFX9-NODL-NEXT:    v_bfe_i32 v8, v8, 0, 8
1840; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
1841; GFX9-NODL-NEXT:    v_bfe_i32 v5, v1, 0, 8
1842; GFX9-NODL-NEXT:    v_and_b32_e32 v6, 0xff, v2
1843; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1844; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v7, v8, v3
1845; GFX9-NODL-NEXT:    v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1846; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1847; GFX9-NODL-NEXT:    v_bfe_i32 v4, v4, 0, 8
1848; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v5, v6, v3
1849; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
1850; GFX9-NODL-NEXT:    v_bfe_i32 v2, v2, 0, 8
1851; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v4, v9, v3
1852; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1853; GFX9-NODL-NEXT:    global_store_short v0, v1, s[6:7]
1854; GFX9-NODL-NEXT:    s_endpgm
1855;
1856; GFX9-DL-LABEL: notdot4_mixedtypes2:
1857; GFX9-DL:       ; %bb.0: ; %entry
1858; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1859; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1860; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1861; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1862; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1863; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1864; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1865; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[6:7]
1866; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
1867; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
1868; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
1869; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1870; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
1871; GFX9-DL-NEXT:    v_and_b32_e32 v7, 0xff, v7
1872; GFX9-DL-NEXT:    v_bfe_i32 v8, v8, 0, 8
1873; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
1874; GFX9-DL-NEXT:    v_bfe_i32 v5, v1, 0, 8
1875; GFX9-DL-NEXT:    v_and_b32_e32 v6, 0xff, v2
1876; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1877; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v7, v8, v3
1878; GFX9-DL-NEXT:    v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1879; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1880; GFX9-DL-NEXT:    v_bfe_i32 v4, v4, 0, 8
1881; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v5, v6, v3
1882; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
1883; GFX9-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
1884; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v9, v3
1885; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1886; GFX9-DL-NEXT:    global_store_short v0, v1, s[6:7]
1887; GFX9-DL-NEXT:    s_endpgm
1888;
1889; GFX10-DL-LABEL: notdot4_mixedtypes2:
1890; GFX10-DL:       ; %bb.0: ; %entry
1891; GFX10-DL-NEXT:    s_clause 0x1
1892; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1893; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1894; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1895; GFX10-DL-NEXT:    v_mov_b32_e32 v8, 0xff
1896; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1897; GFX10-DL-NEXT:    s_clause 0x1
1898; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1899; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1900; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1901; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[6:7]
1902; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
1903; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
1904; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1905; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
1906; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
1907; GFX10-DL-NEXT:    v_bfe_i32 v7, v1, 0, 8
1908; GFX10-DL-NEXT:    v_and_b32_e32 v9, 0xff, v2
1909; GFX10-DL-NEXT:    v_and_b32_e32 v4, 0xff, v4
1910; GFX10-DL-NEXT:    v_bfe_i32 v5, v5, 0, 8
1911; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
1912; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1913; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
1914; GFX10-DL-NEXT:    v_bfe_i32 v4, v6, 0, 8
1915; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1916; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1917; GFX10-DL-NEXT:    v_mad_u16 v3, v7, v9, v3
1918; GFX10-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
1919; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
1920; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
1921; GFX10-DL-NEXT:    global_store_short v0, v1, s[6:7]
1922; GFX10-DL-NEXT:    s_endpgm
1923;
1924; GFX11-DL-LABEL: notdot4_mixedtypes2:
1925; GFX11-DL:       ; %bb.0: ; %entry
1926; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1927; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1928; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1929; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1930; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1931; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
1932; GFX11-DL-NEXT:    s_clause 0x1
1933; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
1934; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
1935; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
1936; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
1937; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
1938; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
1939; GFX11-DL-NEXT:    v_and_b32_e32 v9, 0xff, v0
1940; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[4:5]
1941; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
1942; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
1943; GFX11-DL-NEXT:    v_and_b32_e32 v4, 0xff, v4
1944; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
1945; GFX11-DL-NEXT:    v_bfe_i32 v8, v1, 0, 8
1946; GFX11-DL-NEXT:    v_bfe_i32 v5, v5, 0, 8
1947; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1948; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
1949; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
1950; GFX11-DL-NEXT:    v_bfe_i32 v0, v0, 0, 8
1951; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
1952; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
1953; GFX11-DL-NEXT:    v_bfe_i32 v4, v6, 0, 8
1954; GFX11-DL-NEXT:    v_and_b32_e32 v5, 0xff, v7
1955; GFX11-DL-NEXT:    v_mad_u16 v3, v8, v9, v3
1956; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1957; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
1958; GFX11-DL-NEXT:    v_mad_u16 v0, v1, v0, v3
1959; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[4:5]
1960; GFX11-DL-NEXT:    s_endpgm
1961                                              ptr addrspace(1) %src2,
1962                                              ptr addrspace(1) nocapture %dst) {
1963entry:
  ; Each work-item loads one <4 x i8> from %src1 and one from %src2, both
  ; indexed by its workitem id in x.
1964  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1965  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
1966  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
1967  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
1968  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
1969
  ; Element 0: %vec1 operand sign-extended, %vec2 operand zero-extended.
1970  %v1e0 = extractelement <4 x i8> %vec1, i64 0
1971  %cv1e0 = sext i8 %v1e0 to i16
1972  %v2e0 = extractelement <4 x i8> %vec2, i64 0
1973  %cv2e0 = zext i8 %v2e0 to i16
1974  %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0
1975
  ; Element 1: %vec1 operand zero-extended, %vec2 operand sign-extended.
1976  %v1e1 = extractelement <4 x i8> %vec1, i64 1
1977  %cv1e1 = zext i8 %v1e1 to i16
1978  %v2e1 = extractelement <4 x i8> %vec2, i64 1
1979  %cv2e1 = sext i8 %v2e1 to i16
1980  %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1
1981
  ; Element 2: %vec1 operand sign-extended, %vec2 operand zero-extended.
1982  %v1e2 = extractelement <4 x i8> %vec1, i64 2
1983  %cv1e2 = sext i8 %v1e2 to i16
1984  %v2e2 = extractelement <4 x i8> %vec2, i64 2
1985  %cv2e2 = zext i8 %v2e2 to i16
1986  %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2
1987
  ; Element 3: %vec1 operand zero-extended, %vec2 operand sign-extended.
1988  %v1e3 = extractelement <4 x i8> %vec1, i64 3
1989  %cv1e3 = zext i8 %v1e3 to i16
1990  %v2e3 = extractelement <4 x i8> %vec2, i64 3
1991  %cv2e3 = sext i8 %v2e3 to i16
1992  %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3
1993
  ; Read the i16 accumulator from %dst and add the products to it in the
  ; order mul2, mul1, mul3, mul4.
1994  %acc = load i16, ptr addrspace(1) %dst, align 2
1995  %add1 = add i16 %mul2, %acc
1996  %add2 = add i16 %add1, %mul1
1997  %add3 = add i16 %add2, %mul3
1998  %add4 = add i16 %add3, %mul4
1999
  ; Write the final sum back to the same %dst location.
2000  store i16 %add4, ptr addrspace(1) %dst, align 2
2001  ret void
2002}
2003
2004; TODO: cleanup s_lshr_b32
2005define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
2006; GFX7-LABEL: udot4_acc32_vecMul:
2007; GFX7:       ; %bb.0: ; %entry
2008; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
2009; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
2010; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2011; GFX7-NEXT:    s_mov_b32 s6, 0
2012; GFX7-NEXT:    s_mov_b32 s7, s3
2013; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2014; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
2015; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2016; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2017; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2018; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
2019; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2020; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
2021; GFX7-NEXT:    s_mov_b32 s2, -1
2022; GFX7-NEXT:    s_waitcnt vmcnt(1)
2023; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v2
2024; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
2025; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
2026; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v2
2027; GFX7-NEXT:    s_waitcnt vmcnt(0)
2028; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
2029; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
2030; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
2031; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
2032; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2033; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, s4
2034; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v6, v0
2035; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v7, v0
2036; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v5, v0
2037; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2038; GFX7-NEXT:    s_endpgm
2039;
2040; GFX8-LABEL: udot4_acc32_vecMul:
2041; GFX8:       ; %bb.0: ; %entry
2042; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2043; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
2044; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2045; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2046; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2047; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2048; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2049; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2050; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2051; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2052; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2053; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2054; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
2055; GFX8-NEXT:    s_waitcnt vmcnt(1)
2056; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v3
2057; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 8
2058; GFX8-NEXT:    v_lshrrev_b16_e32 v5, 8, v3
2059; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v3
2060; GFX8-NEXT:    s_waitcnt vmcnt(0)
2061; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
2062; GFX8-NEXT:    v_bfe_u32 v6, v0, 16, 8
2063; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 8, v0
2064; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
2065; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2066; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v0, s0
2067; GFX8-NEXT:    v_mad_u32_u24 v0, v5, v7, v0
2068; GFX8-NEXT:    v_mad_u32_u24 v0, v4, v6, v0
2069; GFX8-NEXT:    v_mad_u32_u24 v2, v1, v2, v0
2070; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2071; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2072; GFX8-NEXT:    flat_store_dword v[0:1], v2
2073; GFX8-NEXT:    s_endpgm
2074;
2075; GFX9-NODL-LABEL: udot4_acc32_vecMul:
2076; GFX9-NODL:       ; %bb.0: ; %entry
2077; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2078; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2079; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2080; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2081; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
2082; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
2083; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
2084; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2085; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2086; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2087; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2088; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
2089; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
2090; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2091; GFX9-NODL-NEXT:    v_add3_u32 v2, v3, s0, v4
2092; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
2093; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
2094; GFX9-NODL-NEXT:    s_endpgm
2095;
2096; GFX9-DL-LABEL: udot4_acc32_vecMul:
2097; GFX9-DL:       ; %bb.0: ; %entry
2098; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2099; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2100; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2101; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2102; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2103; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2104; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
2105; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2106; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2107; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s0
2108; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
2109; GFX9-DL-NEXT:    s_endpgm
2110;
2111; GFX10-DL-LABEL: udot4_acc32_vecMul:
2112; GFX10-DL:       ; %bb.0: ; %entry
2113; GFX10-DL-NEXT:    s_clause 0x1
2114; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2115; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2116; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2117; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2118; GFX10-DL-NEXT:    s_clause 0x1
2119; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2120; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2121; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
2122; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
2123; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2124; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2125; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s0
2126; GFX10-DL-NEXT:    global_store_dword v0, v1, s[6:7]
2127; GFX10-DL-NEXT:    s_endpgm
2128;
2129; GFX11-DL-LABEL: udot4_acc32_vecMul:
2130; GFX11-DL:       ; %bb.0: ; %entry
2131; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2132; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2133; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2134; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
2135; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2136; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2137; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
2138; GFX11-DL-NEXT:    s_clause 0x1
2139; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
2140; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
2141; GFX11-DL-NEXT:    s_load_b32 s0, s[4:5], 0x0
2142; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2143; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s0
2144; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[4:5]
2145; GFX11-DL-NEXT:    s_endpgm
2146                                              ptr addrspace(1) %src2,
2147                                              ptr addrspace(1) nocapture %dst) {
2148entry:
2149  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2150  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
2151  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
2152  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
2153  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
2154
2155  %cvec1 = zext <4 x i8> %vec1 to <4 x i32>
2156  %cvec2 = zext <4 x i8> %vec2 to <4 x i32>
2157
2158  %mul = mul <4 x i32> %cvec1, %cvec2
2159  %mul0 = extractelement <4 x i32> %mul, i64 0
2160  %mul1 = extractelement <4 x i32> %mul, i64 1
2161  %mul2 = extractelement <4 x i32> %mul, i64 2
2162  %mul3 = extractelement <4 x i32> %mul, i64 3
2163
2164  %acc = load i32, ptr addrspace(1) %dst, align 4
2165  %add1 = add i32 %mul0, %acc
2166  %add2 = add i32 %add1, %mul1
2167  %add3 = add i32 %add2, %mul2
2168  %add4 = add i32 %add3, %mul3
2169
2170  store i32 %add4, ptr addrspace(1) %dst, align 4
2171  ret void
2172}
2173
2174; TODO: This pattern should be recognized (presumably as a udot4, cf. udot4_acc32_vecMul): a vector mul of zext'd <4 x i8> operands summed into an i16 accumulator. None of the checked targets currently emits v_dot4_u32_u8 for it.
2175define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
2176; GFX7-LABEL: udot4_acc16_vecMul:
2177; GFX7:       ; %bb.0: ; %entry
2178; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
2179; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
2180; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2181; GFX7-NEXT:    s_mov_b32 s6, 0
2182; GFX7-NEXT:    s_mov_b32 s7, s3
2183; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2184; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
2185; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2186; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2187; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2188; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
2189; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2190; GFX7-NEXT:    s_mov_b32 s2, -1
2191; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
2192; GFX7-NEXT:    s_waitcnt vmcnt(2)
2193; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
2194; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v2
2195; GFX7-NEXT:    s_waitcnt vmcnt(1)
2196; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
2197; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v0
2198; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
2199; GFX7-NEXT:    v_alignbit_b32 v2, v3, v2, 16
2200; GFX7-NEXT:    v_bfe_u32 v3, v0, 8, 8
2201; GFX7-NEXT:    v_alignbit_b32 v0, v6, v0, 16
2202; GFX7-NEXT:    s_waitcnt vmcnt(0)
2203; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v7, v1
2204; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
2205; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v2
2206; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
2207; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
2208; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v3, v1
2209; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2210; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v5, v0
2211; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
2212; GFX7-NEXT:    s_endpgm
2213;
2214; GFX8-LABEL: udot4_acc16_vecMul:
2215; GFX8:       ; %bb.0: ; %entry
2216; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2217; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
2218; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2219; GFX8-NEXT:    v_mov_b32_e32 v5, 0xff
2220; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2221; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2222; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2223; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2224; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2225; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2226; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2227; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2228; GFX8-NEXT:    flat_load_dword v2, v[0:1]
2229; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2230; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2231; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
2232; GFX8-NEXT:    s_waitcnt vmcnt(2)
2233; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
2234; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 8, v3
2235; GFX8-NEXT:    v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2236; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v3
2237; GFX8-NEXT:    s_waitcnt vmcnt(1)
2238; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
2239; GFX8-NEXT:    v_lshrrev_b16_e32 v9, 8, v2
2240; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2241; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v2
2242; GFX8-NEXT:    s_waitcnt vmcnt(0)
2243; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
2244; GFX8-NEXT:    v_mad_u16 v2, v7, v9, v2
2245; GFX8-NEXT:    v_mad_u16 v2, v10, v5, v2
2246; GFX8-NEXT:    v_mad_u16 v2, v6, v8, v2
2247; GFX8-NEXT:    flat_store_short v[0:1], v2
2248; GFX8-NEXT:    s_endpgm
2249;
2250; GFX9-NODL-LABEL: udot4_acc16_vecMul:
2251; GFX9-NODL:       ; %bb.0: ; %entry
2252; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2253; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2254; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2255; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2256; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
2257; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
2258; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2259; GFX9-NODL-NEXT:    global_load_ushort v3, v0, s[6:7]
2260; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
2261; GFX9-NODL-NEXT:    s_mov_b32 s1, 0x5040100
2262; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
2263; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v4, 8, v1
2264; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
2265; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
2266; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v6, 8, v2
2267; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
2268; GFX9-NODL-NEXT:    v_and_b32_e32 v8, 0xff, v1
2269; GFX9-NODL-NEXT:    v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2270; GFX9-NODL-NEXT:    v_and_b32_e32 v9, 0xff, v2
2271; GFX9-NODL-NEXT:    v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2272; GFX9-NODL-NEXT:    v_perm_b32 v2, v7, v2, s1
2273; GFX9-NODL-NEXT:    v_perm_b32 v1, v5, v1, s1
2274; GFX9-NODL-NEXT:    v_perm_b32 v5, v6, v9, s1
2275; GFX9-NODL-NEXT:    v_perm_b32 v4, v4, v8, s1
2276; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2277; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v2, v4, v5
2278; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2279; GFX9-NODL-NEXT:    v_add_u16_e32 v3, v2, v3
2280; GFX9-NODL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2281; GFX9-NODL-NEXT:    v_add_u16_e32 v2, v2, v1
2282; GFX9-NODL-NEXT:    v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2283; GFX9-NODL-NEXT:    global_store_short v0, v1, s[6:7]
2284; GFX9-NODL-NEXT:    s_endpgm
2285;
2286; GFX9-DL-LABEL: udot4_acc16_vecMul:
2287; GFX9-DL:       ; %bb.0: ; %entry
2288; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2289; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2290; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2291; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2292; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2293; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2294; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2295; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[6:7]
2296; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
2297; GFX9-DL-NEXT:    s_mov_b32 s1, 0x5040100
2298; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
2299; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v4, 8, v1
2300; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
2301; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2302; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v6, 8, v2
2303; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
2304; GFX9-DL-NEXT:    v_and_b32_e32 v8, 0xff, v1
2305; GFX9-DL-NEXT:    v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2306; GFX9-DL-NEXT:    v_and_b32_e32 v9, 0xff, v2
2307; GFX9-DL-NEXT:    v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2308; GFX9-DL-NEXT:    v_perm_b32 v2, v7, v2, s1
2309; GFX9-DL-NEXT:    v_perm_b32 v1, v5, v1, s1
2310; GFX9-DL-NEXT:    v_perm_b32 v5, v6, v9, s1
2311; GFX9-DL-NEXT:    v_perm_b32 v4, v4, v8, s1
2312; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2313; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v4, v5
2314; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2315; GFX9-DL-NEXT:    v_add_u16_e32 v3, v2, v3
2316; GFX9-DL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2317; GFX9-DL-NEXT:    v_add_u16_e32 v2, v2, v1
2318; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2319; GFX9-DL-NEXT:    global_store_short v0, v1, s[6:7]
2320; GFX9-DL-NEXT:    s_endpgm
2321;
2322; GFX10-DL-LABEL: udot4_acc16_vecMul:
2323; GFX10-DL:       ; %bb.0: ; %entry
2324; GFX10-DL-NEXT:    s_clause 0x1
2325; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2326; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2327; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2328; GFX10-DL-NEXT:    v_mov_b32_e32 v8, 0xff
2329; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2330; GFX10-DL-NEXT:    s_clause 0x1
2331; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2332; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2333; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2334; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[6:7]
2335; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
2336; GFX10-DL-NEXT:    v_lshrrev_b16 v4, 8, v1
2337; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2338; GFX10-DL-NEXT:    v_lshrrev_b16 v5, 8, v2
2339; GFX10-DL-NEXT:    v_and_b32_e32 v6, 0xff, v2
2340; GFX10-DL-NEXT:    v_and_b32_e32 v7, 0xff, v1
2341; GFX10-DL-NEXT:    v_perm_b32 v5, v5, v6, 0x5040100
2342; GFX10-DL-NEXT:    v_perm_b32 v4, v4, v7, 0x5040100
2343; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
2344; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
2345; GFX10-DL-NEXT:    v_and_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2346; GFX10-DL-NEXT:    v_and_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2347; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
2348; GFX10-DL-NEXT:    v_perm_b32 v2, v7, v2, 0x5040100
2349; GFX10-DL-NEXT:    v_perm_b32 v1, v6, v1, 0x5040100
2350; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
2351; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2352; GFX10-DL-NEXT:    v_add_nc_u16 v3, v4, v3
2353; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2354; GFX10-DL-NEXT:    v_add_nc_u16 v2, v3, v5
2355; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2356; GFX10-DL-NEXT:    v_add_nc_u16 v1, v2, v1
2357; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
2358; GFX10-DL-NEXT:    global_store_short v0, v1, s[6:7]
2359; GFX10-DL-NEXT:    s_endpgm
2360;
2361; GFX11-DL-LABEL: udot4_acc16_vecMul:
2362; GFX11-DL:       ; %bb.0: ; %entry
2363; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2364; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2365; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2366; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2367; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2368; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
2369; GFX11-DL-NEXT:    s_clause 0x1
2370; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
2371; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
2372; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
2373; GFX11-DL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v7, 0xff, v1
2374; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[4:5]
2375; GFX11-DL-NEXT:    v_lshrrev_b16 v4, 8, v1
2376; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
2377; GFX11-DL-NEXT:    v_lshrrev_b16 v5, 8, v0
2378; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v0
2379; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
2380; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
2381; GFX11-DL-NEXT:    v_perm_b32 v4, v4, v7, 0x5040100
2382; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
2383; GFX11-DL-NEXT:    v_perm_b32 v5, v5, v6, 0x5040100
2384; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
2385; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v9
2386; GFX11-DL-NEXT:    v_and_b32_e32 v7, 0xff, v8
2387; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
2388; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
2389; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v6, 0x5040100
2390; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2391; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v7, 0x5040100
2392; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
2393; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2394; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
2395; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
2396; GFX11-DL-NEXT:    v_add_nc_u16 v3, v4, v3
2397; GFX11-DL-NEXT:    v_add_nc_u16 v1, v3, v5
2398; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2399; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
2400; GFX11-DL-NEXT:    v_add_nc_u16 v0, v1, v0
2401; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2402; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v3
2403; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[4:5]
2404; GFX11-DL-NEXT:    s_endpgm
2405                                              ptr addrspace(1) %src2,
2406                                              ptr addrspace(1) nocapture %dst) {
2407entry:
2408  %idx = call i32 @llvm.amdgcn.workitem.id.x() ; workitem id indexes both source arrays
2409  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
2410  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
2411  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
2412  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
2413
2414  %cvec1 = zext <4 x i8> %vec1 to <4 x i16> ; unsigned widening of every byte lane to i16
2415  %cvec2 = zext <4 x i8> %vec2 to <4 x i16>
2416
2417  %mul = mul <4 x i16> %cvec1, %cvec2 ; one vector multiply; the lanes are extracted and summed below
2418  %mul0 = extractelement <4 x i16> %mul, i64 0
2419  %mul1 = extractelement <4 x i16> %mul, i64 1
2420  %mul2 = extractelement <4 x i16> %mul, i64 2
2421  %mul3 = extractelement <4 x i16> %mul, i64 3
2422
2423  %acc = load i16, ptr addrspace(1) %dst, align 4 ; i16 accumulator is read from %dst and the sum is stored back to it
2424  %add1 = add i16 %mul0, %acc
2425  %add2 = add i16 %add1, %mul1
2426  %add3 = add i16 %add2, %mul2
2427  %add4 = add i16 %add3, %mul3
2428
2429  store i16 %add4, ptr addrspace(1) %dst, align 4
2430  ret void
2431}
2432
2433; TODO: Support this pattern: a <4 x i8> mul with no widening, summed into an i8 accumulator. None of the checked targets emits a v_dot4 instruction for it; each lane is multiplied and added separately.
2434define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
2435; GFX7-LABEL: udot4_acc8_vecMul:
2436; GFX7:       ; %bb.0: ; %entry
2437; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
2438; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
2439; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2440; GFX7-NEXT:    s_mov_b32 s6, 0
2441; GFX7-NEXT:    s_mov_b32 s7, s3
2442; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2443; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
2444; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2445; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2446; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2447; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
2448; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2449; GFX7-NEXT:    s_mov_b32 s2, -1
2450; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
2451; GFX7-NEXT:    s_waitcnt vmcnt(2)
2452; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v2
2453; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 8
2454; GFX7-NEXT:    s_waitcnt vmcnt(1)
2455; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v0
2456; GFX7-NEXT:    v_bfe_u32 v8, v0, 8, 8
2457; GFX7-NEXT:    s_waitcnt vmcnt(0)
2458; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
2459; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
2460; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
2461; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
2462; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
2463; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
2464; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2465; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v6, v0
2466; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2467; GFX7-NEXT:    s_endpgm
2468;
2469; GFX8-LABEL: udot4_acc8_vecMul:
2470; GFX8:       ; %bb.0: ; %entry
2471; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2472; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
2473; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2474; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2475; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2476; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2477; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2478; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2479; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2480; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2481; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2482; GFX8-NEXT:    flat_load_dword v2, v[0:1]
2483; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2484; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2485; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
2486; GFX8-NEXT:    s_waitcnt vmcnt(2)
2487; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
2488; GFX8-NEXT:    s_waitcnt vmcnt(1)
2489; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
2490; GFX8-NEXT:    v_mul_lo_u16_sdwa v7, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
2491; GFX8-NEXT:    v_mul_lo_u16_e32 v9, v5, v6
2492; GFX8-NEXT:    v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2493; GFX8-NEXT:    v_mul_lo_u16_sdwa v8, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2494; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
2495; GFX8-NEXT:    v_or_b32_e32 v8, v8, v9
2496; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v8
2497; GFX8-NEXT:    s_waitcnt vmcnt(0)
2498; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
2499; GFX8-NEXT:    v_add_u16_e32 v2, v2, v8
2500; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
2501; GFX8-NEXT:    v_mad_u16 v2, v5, v6, v2
2502; GFX8-NEXT:    v_add_u16_e32 v2, v2, v7
2503; GFX8-NEXT:    flat_store_byte v[0:1], v2
2504; GFX8-NEXT:    s_endpgm
2505;
2506; GFX9-NODL-LABEL: udot4_acc8_vecMul:
2507; GFX9-NODL:       ; %bb.0: ; %entry
2508; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2509; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2510; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2511; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2512; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
2513; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
2514; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2515; GFX9-NODL-NEXT:    global_load_ubyte v3, v0, s[6:7]
2516; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
2517; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
2518; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
2519; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
2520; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
2521; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v8, v4, v5
2522; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2523; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v9, 8, v6
2524; GFX9-NODL-NEXT:    v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2525; GFX9-NODL-NEXT:    v_or_b32_e32 v6, v7, v6
2526; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
2527; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2528; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
2529; GFX9-NODL-NEXT:    v_add_u16_e32 v1, v1, v6
2530; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
2531; GFX9-NODL-NEXT:    v_add_u16_e32 v1, v1, v9
2532; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[6:7]
2533; GFX9-NODL-NEXT:    s_endpgm
2534;
2535; GFX9-DL-LABEL: udot4_acc8_vecMul:
2536; GFX9-DL:       ; %bb.0: ; %entry
2537; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2538; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2539; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2540; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2541; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2542; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2543; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2544; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[6:7]
2545; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
2546; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
2547; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2548; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
2549; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
2550; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v8, v4, v5
2551; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2552; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v6
2553; GFX9-DL-NEXT:    v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2554; GFX9-DL-NEXT:    v_or_b32_e32 v6, v7, v6
2555; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
2556; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2557; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
2558; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v6
2559; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
2560; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v9
2561; GFX9-DL-NEXT:    global_store_byte v0, v1, s[6:7]
2562; GFX9-DL-NEXT:    s_endpgm
2563;
2564; GFX10-DL-LABEL: udot4_acc8_vecMul:
2565; GFX10-DL:       ; %bb.0: ; %entry
2566; GFX10-DL-NEXT:    s_clause 0x1
2567; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2568; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2569; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2570; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2571; GFX10-DL-NEXT:    s_clause 0x1
2572; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2573; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2574; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2575; GFX10-DL-NEXT:    global_load_ubyte v3, v0, s[6:7]
2576; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
2577; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
2578; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2579; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
2580; GFX10-DL-NEXT:    v_lshrrev_b16 v6, 8, v1
2581; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
2582; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
2583; GFX10-DL-NEXT:    v_lshrrev_b16 v9, 8, v2
2584; GFX10-DL-NEXT:    v_mul_lo_u16 v4, v4, v5
2585; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2586; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
2587; GFX10-DL-NEXT:    v_mul_lo_u16 v5, v7, v8
2588; GFX10-DL-NEXT:    v_mul_lo_u16 v6, v6, v9
2589; GFX10-DL-NEXT:    v_lshlrev_b16 v4, 8, v4
2590; GFX10-DL-NEXT:    v_lshlrev_b16 v6, 8, v6
2591; GFX10-DL-NEXT:    v_or_b32_sdwa v5, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2592; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v4
2593; GFX10-DL-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2594; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
2595; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v5
2596; GFX10-DL-NEXT:    v_mad_u16 v1, v7, v8, v1
2597; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v2
2598; GFX10-DL-NEXT:    global_store_byte v0, v1, s[6:7]
2599; GFX10-DL-NEXT:    s_endpgm
2600;
2601; GFX11-DL-LABEL: udot4_acc8_vecMul:
2602; GFX11-DL:       ; %bb.0: ; %entry
2603; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2604; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2605; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2606; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
2607; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2608; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2609; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
2610; GFX11-DL-NEXT:    s_clause 0x1
2611; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
2612; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
2613; GFX11-DL-NEXT:    global_load_u8 v3, v2, s[4:5]
2614; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
2615; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
2616; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
2617; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
2618; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
2619; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
2620; GFX11-DL-NEXT:    v_lshrrev_b16 v8, 8, v1
2621; GFX11-DL-NEXT:    v_lshrrev_b16 v9, 8, v0
2622; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
2623; GFX11-DL-NEXT:    v_mad_u16 v0, v1, v0, v3
2624; GFX11-DL-NEXT:    v_mul_lo_u16 v5, v5, v6
2625; GFX11-DL-NEXT:    v_mul_lo_u16 v6, v4, v7
2626; GFX11-DL-NEXT:    v_mul_lo_u16 v8, v8, v9
2627; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2628; GFX11-DL-NEXT:    v_lshlrev_b16 v5, 8, v5
2629; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
2630; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2631; GFX11-DL-NEXT:    v_lshlrev_b16 v8, 8, v8
2632; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
2633; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2634; GFX11-DL-NEXT:    v_or_b32_e32 v6, v6, v5
2635; GFX11-DL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
2636; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2637; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
2638; GFX11-DL-NEXT:    v_or_b32_e32 v6, v8, v6
2639; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2640; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
2641; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v6
2642; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2643; GFX11-DL-NEXT:    v_mad_u16 v0, v4, v7, v0
2644; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v1
2645; GFX11-DL-NEXT:    global_store_b8 v2, v0, s[4:5]
2646; GFX11-DL-NEXT:    s_endpgm
2647                                             ptr addrspace(1) %src2,
2648                                             ptr addrspace(1) nocapture %dst) {
2649entry:
2650  %idx = call i32 @llvm.amdgcn.workitem.id.x() ; workitem id indexes both source arrays
2651  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
2652  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
2653  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
2654  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
2655
2656  %mul = mul <4 x i8> %vec1, %vec2 ; multiply stays in i8: no zext/sext before it, unlike the acc16 variant
2657  %mul0 = extractelement <4 x i8> %mul, i64 0
2658  %mul1 = extractelement <4 x i8> %mul, i64 1
2659  %mul2 = extractelement <4 x i8> %mul, i64 2
2660  %mul3 = extractelement <4 x i8> %mul, i64 3
2661
2662  %acc = load i8, ptr addrspace(1) %dst, align 4 ; i8 accumulator is read from %dst and the sum is stored back to it
2663  %add1 = add i8 %mul0, %acc
2664  %add2 = add i8 %add1, %mul1
2665  %add3 = add i8 %add2, %mul2
2666  %add4 = add i8 %add3, %mul3
2667
2668  store i8 %add4, ptr addrspace(1) %dst, align 4
2669  ret void
2670}
2672define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
2673; GFX7-LABEL: idot4_acc32_2ele:
2674; GFX7:       ; %bb.0: ; %entry
2675; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2676; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
2677; GFX7-NEXT:    s_mov_b32 s7, 0xf000
2678; GFX7-NEXT:    s_mov_b32 s10, 0
2679; GFX7-NEXT:    s_mov_b32 s11, s7
2680; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2681; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
2682; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2683; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2684; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2685; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
2686; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2687; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
2688; GFX7-NEXT:    s_mov_b32 s6, -1
2689; GFX7-NEXT:    s_waitcnt vmcnt(1)
2690; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
2691; GFX7-NEXT:    v_bfe_u32 v2, v2, 8, 8
2692; GFX7-NEXT:    s_waitcnt vmcnt(0)
2693; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v0
2694; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
2695; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2696; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, s0
2697; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2698; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2699; GFX7-NEXT:    s_endpgm
2700;
2701; GFX8-LABEL: idot4_acc32_2ele:
2702; GFX8:       ; %bb.0: ; %entry
2703; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2704; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
2705; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2706; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2707; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2708; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2709; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2710; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2711; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2712; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2713; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2714; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2715; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
2716; GFX8-NEXT:    s_waitcnt vmcnt(1)
2717; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
2718; GFX8-NEXT:    v_bfe_u32 v3, v3, 8, 8
2719; GFX8-NEXT:    s_waitcnt vmcnt(0)
2720; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
2721; GFX8-NEXT:    v_bfe_u32 v0, v0, 8, 8
2722; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2723; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s0
2724; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
2725; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2726; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2727; GFX8-NEXT:    flat_store_dword v[0:1], v2
2728; GFX8-NEXT:    s_endpgm
2729;
2730; GFX9-NODL-LABEL: idot4_acc32_2ele:
2731; GFX9-NODL:       ; %bb.0: ; %entry
2732; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2733; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2734; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2735; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2736; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
2737; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
2738; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
2739; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2740; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2741; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2742; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2743; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2744; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, s0, v1
2745; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
2746; GFX9-NODL-NEXT:    s_endpgm
2747;
2748; GFX9-DL-LABEL: idot4_acc32_2ele:
2749; GFX9-DL:       ; %bb.0: ; %entry
2750; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2751; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2752; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2753; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2754; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
2755; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
2756; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
2757; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0100
2758; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2759; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2760; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
2761; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2762; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
2763; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2764; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s0
2765; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
2766; GFX9-DL-NEXT:    s_endpgm
2767;
2768; GFX10-DL-LABEL: idot4_acc32_2ele:
2769; GFX10-DL:       ; %bb.0: ; %entry
2770; GFX10-DL-NEXT:    s_clause 0x1
2771; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2772; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2773; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2774; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2775; GFX10-DL-NEXT:    s_clause 0x1
2776; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
2777; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
2778; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
2779; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
2780; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2781; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc0c0100
2782; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2783; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc0c0100
2784; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
2785; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2786; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s0
2787; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
2788; GFX10-DL-NEXT:    s_endpgm
2789;
2790; GFX11-DL-LABEL: idot4_acc32_2ele:
2791; GFX11-DL:       ; %bb.0: ; %entry
2792; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2793; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2794; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2795; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
2796; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2797; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2798; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
2799; GFX11-DL-NEXT:    s_clause 0x1
2800; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3]
2801; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
2802; GFX11-DL-NEXT:    s_load_b32 s0, s[4:5], 0x0
2803; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
2804; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0100
2805; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
2806; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc0c0100
2807; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
2808; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2809; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s0
2810; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[4:5]
2811; GFX11-DL-NEXT:    s_endpgm
2812                                       ptr addrspace(1) %src2,
2813                                       ptr addrspace(1) nocapture %dst) {
2814entry:
2815  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2816  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
2817  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
2818  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
2819  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
2820
2821  %v1e0 = extractelement <4 x i8> %vec1, i64 0
2822  %cv1e0 = zext i8 %v1e0 to i32
2823  %v2e0 = extractelement <4 x i8> %vec2, i64 0
2824  %cv2e0 = zext i8 %v2e0 to i32
2825  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
2826
2827  %v1e1 = extractelement <4 x i8> %vec1, i64 1
2828  %cv1e1 = zext i8 %v1e1 to i32
2829  %v2e1 = extractelement <4 x i8> %vec2, i64 1
2830  %cv2e1 = zext i8 %v2e1 to i32
2831  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
2832
2833  %acc = load i32, ptr addrspace(1) %dst, align 4
2834  %add1 = add i32 %mul1, %acc
2835  %add2 = add i32 %add1, %mul2
2836  store i32 %add2, ptr addrspace(1) %dst, align 4
2837  ret void
2838}
2839
; Unsigned dot product over elements 0, 1 and 2 of two <4 x i8> vectors
; (element 3 is never extracted), accumulated onto the i32 loaded from %dst
; and stored back to %dst.
; The assertions below expect v_mad_u32_u24 / v_mul_u32_u24_sdwa sequences for
; the gfx700, gfx803 and gfx900 runs, and two v_perm_b32 (selector 0xc020100)
; feeding a single v_dot4_u32_u8 for the gfx906, gfx1011/gfx1012 and gfx1100
; runs.
; NOTE(review): the selector presumably keeps bytes 0-2 and zeroes byte 3 so
; the unused element contributes nothing -- confirm against the ISA manual.
2840define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
2841; GFX7-LABEL: idot4_acc32_3ele:
2842; GFX7:       ; %bb.0: ; %entry
2843; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
2844; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
2845; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2846; GFX7-NEXT:    s_mov_b32 s6, 0
2847; GFX7-NEXT:    s_mov_b32 s7, s3
2848; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2849; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
2850; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2851; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2852; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2853; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
2854; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2855; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
2856; GFX7-NEXT:    s_mov_b32 s2, -1
2857; GFX7-NEXT:    s_waitcnt vmcnt(1)
2858; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
2859; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
2860; GFX7-NEXT:    s_waitcnt vmcnt(0)
2861; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
2862; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
2863; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2864; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v4, s4
2865; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
2866; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
2867; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v5, v1
2868; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2869; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2870; GFX7-NEXT:    s_endpgm
2871;
2872; GFX8-LABEL: idot4_acc32_3ele:
2873; GFX8:       ; %bb.0: ; %entry
2874; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2875; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
2876; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2877; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2878; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2879; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2880; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2881; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2882; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2883; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2884; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2885; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2886; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
2887; GFX8-NEXT:    s_waitcnt vmcnt(1)
2888; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
2889; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
2890; GFX8-NEXT:    v_bfe_u32 v3, v3, 16, 8
2891; GFX8-NEXT:    s_waitcnt vmcnt(0)
2892; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
2893; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
2894; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2895; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s0
2896; GFX8-NEXT:    v_bfe_u32 v0, v0, 16, 8
2897; GFX8-NEXT:    v_mad_u32_u24 v1, v4, v5, v1
2898; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
2899; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2900; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2901; GFX8-NEXT:    flat_store_dword v[0:1], v2
2902; GFX8-NEXT:    s_endpgm
2903;
2904; GFX9-NODL-LABEL: idot4_acc32_3ele:
2905; GFX9-NODL:       ; %bb.0: ; %entry
2906; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2907; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2908; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2909; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2910; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
2911; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
2912; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
2913; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2914; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
2915; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
2916; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2917; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v2
2918; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2919; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
2920; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2921; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, v4, s0
2922; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
2923; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
2924; GFX9-NODL-NEXT:    s_endpgm
2925;
2926; GFX9-DL-LABEL: idot4_acc32_3ele:
2927; GFX9-DL:       ; %bb.0: ; %entry
2928; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2929; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2930; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2931; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2932; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
2933; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
2934; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
2935; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc020100
2936; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2937; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2938; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
2939; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2940; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
2941; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2942; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s0
2943; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
2944; GFX9-DL-NEXT:    s_endpgm
2945;
2946; GFX10-DL-LABEL: idot4_acc32_3ele:
2947; GFX10-DL:       ; %bb.0: ; %entry
2948; GFX10-DL-NEXT:    s_clause 0x1
2949; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2950; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2951; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2952; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2953; GFX10-DL-NEXT:    s_clause 0x1
2954; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
2955; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
2956; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
2957; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
2958; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2959; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc020100
2960; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2961; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc020100
2962; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
2963; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2964; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s0
2965; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
2966; GFX10-DL-NEXT:    s_endpgm
2967;
2968; GFX11-DL-LABEL: idot4_acc32_3ele:
2969; GFX11-DL:       ; %bb.0: ; %entry
2970; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2971; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2972; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2973; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
2974; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2975; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2976; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
2977; GFX11-DL-NEXT:    s_clause 0x1
2978; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3]
2979; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
2980; GFX11-DL-NEXT:    s_load_b32 s0, s[4:5], 0x0
2981; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
2982; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc020100
2983; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
2984; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020100
2985; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
2986; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2987; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s0
2988; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[4:5]
2989; GFX11-DL-NEXT:    s_endpgm
2990                                       ptr addrspace(1) %src2,
2991                                       ptr addrspace(1) nocapture %dst) {
2992entry:
; Each work-item loads one <4 x i8> from %src1 and one from %src2, indexed by
; its workitem id in x.
2993  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2994  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
2995  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
2996  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
2997  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
2998
; Product of the zero-extended element 0 of each vector.
2999  %v1e0 = extractelement <4 x i8> %vec1, i64 0
3000  %cv1e0 = zext i8 %v1e0 to i32
3001  %v2e0 = extractelement <4 x i8> %vec2, i64 0
3002  %cv2e0 = zext i8 %v2e0 to i32
3003  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
3004
; Product of the zero-extended element 1 of each vector.
3005  %v1e1 = extractelement <4 x i8> %vec1, i64 1
3006  %cv1e1 = zext i8 %v1e1 to i32
3007  %v2e1 = extractelement <4 x i8> %vec2, i64 1
3008  %cv2e1 = zext i8 %v2e1 to i32
3009  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
3010
; Product of the zero-extended element 2 of each vector.
3011  %v1e2 = extractelement <4 x i8> %vec1, i64 2
3012  %cv1e2 = zext i8 %v1e2 to i32
3013  %v2e2 = extractelement <4 x i8> %vec2, i64 2
3014  %cv2e2 = zext i8 %v2e2 to i32
3015  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
3016
; Add the three products, in order, onto the accumulator read from %dst and
; write the sum back to the same address.
3017  %acc = load i32, ptr addrspace(1) %dst, align 4
3018  %add1 = add i32 %mul1, %acc
3019  %add2 = add i32 %add1, %mul2
3020  %add3 = add i32 %add2, %mul3
3021  store i32 %add3, ptr addrspace(1) %dst, align 4
3022  ret void
3023}
3024
; Same shape as a three-element unsigned dot product with an i32 accumulator,
; but the elements are taken in the order 3, 0, 2 (element 1 is never
; extracted). The result is accumulated onto the i32 loaded from %dst and
; stored back to %dst.
; The assertions below expect v_mad_u32_u24 / v_mul_u32_u24_sdwa sequences for
; the gfx700, gfx803 and gfx900 runs, and two v_perm_b32 (selector 0xc020003)
; feeding a single v_dot4_u32_u8 for the gfx906, gfx1011/gfx1012 and gfx1100
; runs.
; NOTE(review): the selector presumably reorders the bytes to 3, 0, 2 and
; zeroes the remaining lane -- confirm against the ISA manual.
3025define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
3026; GFX7-LABEL: idot4_acc32_3ele_permuted:
3027; GFX7:       ; %bb.0: ; %entry
3028; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
3029; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
3030; GFX7-NEXT:    s_mov_b32 s3, 0xf000
3031; GFX7-NEXT:    s_mov_b32 s6, 0
3032; GFX7-NEXT:    s_mov_b32 s7, s3
3033; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3034; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3035; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3036; GFX7-NEXT:    v_mov_b32_e32 v1, 0
3037; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3038; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
3039; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
3040; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
3041; GFX7-NEXT:    s_mov_b32 s2, -1
3042; GFX7-NEXT:    s_waitcnt vmcnt(1)
3043; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v2
3044; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
3045; GFX7-NEXT:    s_waitcnt vmcnt(0)
3046; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
3047; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
3048; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3049; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v4, s4
3050; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
3051; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
3052; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v5, v1
3053; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
3054; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3055; GFX7-NEXT:    s_endpgm
3056;
3057; GFX8-LABEL: idot4_acc32_3ele_permuted:
3058; GFX8:       ; %bb.0: ; %entry
3059; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3060; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
3061; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3062; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3063; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3064; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3065; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3066; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3067; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3068; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3069; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3070; GFX8-NEXT:    flat_load_dword v0, v[0:1]
3071; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
3072; GFX8-NEXT:    s_waitcnt vmcnt(1)
3073; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v3
3074; GFX8-NEXT:    v_and_b32_e32 v4, 0xff, v3
3075; GFX8-NEXT:    v_bfe_u32 v3, v3, 16, 8
3076; GFX8-NEXT:    s_waitcnt vmcnt(0)
3077; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
3078; GFX8-NEXT:    v_and_b32_e32 v5, 0xff, v0
3079; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3080; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s0
3081; GFX8-NEXT:    v_bfe_u32 v0, v0, 16, 8
3082; GFX8-NEXT:    v_mad_u32_u24 v1, v4, v5, v1
3083; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
3084; GFX8-NEXT:    v_mov_b32_e32 v0, s4
3085; GFX8-NEXT:    v_mov_b32_e32 v1, s5
3086; GFX8-NEXT:    flat_store_dword v[0:1], v2
3087; GFX8-NEXT:    s_endpgm
3088;
3089; GFX9-NODL-LABEL: idot4_acc32_3ele_permuted:
3090; GFX9-NODL:       ; %bb.0: ; %entry
3091; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3092; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
3093; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3094; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
3095; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
3096; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
3097; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
3098; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
3099; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
3100; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
3101; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
3102; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
3103; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
3104; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
3105; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
3106; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, v4, s0
3107; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
3108; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
3109; GFX9-NODL-NEXT:    s_endpgm
3110;
3111; GFX9-DL-LABEL: idot4_acc32_3ele_permuted:
3112; GFX9-DL:       ; %bb.0: ; %entry
3113; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3114; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
3115; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3116; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
3117; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
3118; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
3119; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
3120; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc020003
3121; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
3122; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
3123; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
3124; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
3125; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
3126; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
3127; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s0
3128; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
3129; GFX9-DL-NEXT:    s_endpgm
3130;
3131; GFX10-DL-LABEL: idot4_acc32_3ele_permuted:
3132; GFX10-DL:       ; %bb.0: ; %entry
3133; GFX10-DL-NEXT:    s_clause 0x1
3134; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3135; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
3136; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3137; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3138; GFX10-DL-NEXT:    s_clause 0x1
3139; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
3140; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
3141; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
3142; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
3143; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
3144; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc020003
3145; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
3146; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc020003
3147; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
3148; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3149; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s0
3150; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
3151; GFX10-DL-NEXT:    s_endpgm
3152;
3153; GFX11-DL-LABEL: idot4_acc32_3ele_permuted:
3154; GFX11-DL:       ; %bb.0: ; %entry
3155; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3156; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3157; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
3158; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
3159; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
3160; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3161; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
3162; GFX11-DL-NEXT:    s_clause 0x1
3163; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3]
3164; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
3165; GFX11-DL-NEXT:    s_load_b32 s0, s[4:5], 0x0
3166; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
3167; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc020003
3168; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
3169; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020003
3170; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
3171; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3172; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s0
3173; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[4:5]
3174; GFX11-DL-NEXT:    s_endpgm
3175                                       ptr addrspace(1) %src2,
3176                                       ptr addrspace(1) nocapture %dst) {
3177entry:
; Each work-item loads one <4 x i8> from %src1 and one from %src2, indexed by
; its workitem id in x.
3178  %idx = call i32 @llvm.amdgcn.workitem.id.x()
3179  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
3180  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
3181  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
3182  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
3183
; First product uses element 3 of each vector; the "e0" suffix in the value
; names is positional and does not match the extracted index.
3184  %v1e0 = extractelement <4 x i8> %vec1, i64 3
3185  %cv1e0 = zext i8 %v1e0 to i32
3186  %v2e0 = extractelement <4 x i8> %vec2, i64 3
3187  %cv2e0 = zext i8 %v2e0 to i32
3188  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
3189
; Second product uses element 0 of each vector.
3190  %v1e1 = extractelement <4 x i8> %vec1, i64 0
3191  %cv1e1 = zext i8 %v1e1 to i32
3192  %v2e1 = extractelement <4 x i8> %vec2, i64 0
3193  %cv2e1 = zext i8 %v2e1 to i32
3194  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
3195
; Third product uses element 2 of each vector.
3196  %v1e2 = extractelement <4 x i8> %vec1, i64 2
3197  %cv1e2 = zext i8 %v1e2 to i32
3198  %v2e2 = extractelement <4 x i8> %vec2, i64 2
3199  %cv2e2 = zext i8 %v2e2 to i32
3200  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
3201
; Add the three products, in order, onto the accumulator read from %dst and
; write the sum back to the same address.
3202  %acc = load i32, ptr addrspace(1) %dst, align 4
3203  %add1 = add i32 %mul1, %acc
3204  %add2 = add i32 %add1, %mul2
3205  %add3 = add i32 %add2, %mul3
3206  store i32 %add3, ptr addrspace(1) %dst, align 4
3207  ret void
3208}
3209
3210
; Full four-element unsigned dot product of two <4 x i8> vectors with no
; accumulator input: %dst is only stored to, never loaded, and the add chain
; starts from the first two products.
; The assertions below expect v_mad_u32_u24 / v_mul_u32_u24 (plain or sdwa)
; sequences for the gfx700, gfx803 and gfx900 runs. For the gfx906,
; gfx1011/gfx1012 and gfx1100 runs they expect a single v_dot4_u32_u8 whose
; last operand is the literal 0, with the loaded dwords used directly (no
; v_perm_b32).
3211define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
3212; GFX7-LABEL: idot4_acc32_opt:
3213; GFX7:       ; %bb.0: ; %entry
3214; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
3215; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
3216; GFX7-NEXT:    s_mov_b32 s3, 0xf000
3217; GFX7-NEXT:    s_mov_b32 s6, 0
3218; GFX7-NEXT:    s_mov_b32 s7, s3
3219; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3220; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3221; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3222; GFX7-NEXT:    v_mov_b32_e32 v1, 0
3223; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3224; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
3225; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
3226; GFX7-NEXT:    s_mov_b32 s2, -1
3227; GFX7-NEXT:    s_waitcnt vmcnt(1)
3228; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
3229; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
3230; GFX7-NEXT:    s_waitcnt vmcnt(0)
3231; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
3232; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
3233; GFX7-NEXT:    v_mul_u32_u24_e32 v3, v3, v6
3234; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
3235; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
3236; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, v3
3237; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
3238; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
3239; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
3240; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
3241; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3242; GFX7-NEXT:    s_endpgm
3243;
3244; GFX8-LABEL: idot4_acc32_opt:
3245; GFX8:       ; %bb.0: ; %entry
3246; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3247; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
3248; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3249; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3250; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3251; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3252; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3253; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3254; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3255; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3256; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3257; GFX8-NEXT:    flat_load_dword v2, v[0:1]
3258; GFX8-NEXT:    v_mov_b32_e32 v0, s4
3259; GFX8-NEXT:    v_mov_b32_e32 v1, s5
3260; GFX8-NEXT:    s_waitcnt vmcnt(1)
3261; GFX8-NEXT:    v_and_b32_e32 v4, 0xff, v3
3262; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 8
3263; GFX8-NEXT:    s_waitcnt vmcnt(0)
3264; GFX8-NEXT:    v_and_b32_e32 v5, 0xff, v2
3265; GFX8-NEXT:    v_mul_u32_u24_sdwa v6, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
3266; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 8
3267; GFX8-NEXT:    v_mad_u32_u24 v4, v4, v5, v6
3268; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
3269; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
3270; GFX8-NEXT:    v_mad_u32_u24 v4, v7, v8, v4
3271; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v2, v4
3272; GFX8-NEXT:    flat_store_dword v[0:1], v2
3273; GFX8-NEXT:    s_endpgm
3274;
3275; GFX9-NODL-LABEL: idot4_acc32_opt:
3276; GFX9-NODL:       ; %bb.0: ; %entry
3277; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3278; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
3279; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3280; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
3281; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
3282; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
3283; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
3284; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
3285; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
3286; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
3287; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v2
3288; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
3289; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
3290; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
3291; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, v4, v5
3292; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
3293; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
3294; GFX9-NODL-NEXT:    s_endpgm
3295;
3296; GFX9-DL-LABEL: idot4_acc32_opt:
3297; GFX9-DL:       ; %bb.0: ; %entry
3298; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3299; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
3300; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3301; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
3302; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
3303; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
3304; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
3305; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
3306; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, 0
3307; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
3308; GFX9-DL-NEXT:    s_endpgm
3309;
3310; GFX10-DL-LABEL: idot4_acc32_opt:
3311; GFX10-DL:       ; %bb.0: ; %entry
3312; GFX10-DL-NEXT:    s_clause 0x1
3313; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3314; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
3315; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3316; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3317; GFX10-DL-NEXT:    s_clause 0x1
3318; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
3319; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
3320; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
3321; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
3322; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, 0
3323; GFX10-DL-NEXT:    global_store_dword v0, v1, s[6:7]
3324; GFX10-DL-NEXT:    s_endpgm
3325;
3326; GFX11-DL-LABEL: idot4_acc32_opt:
3327; GFX11-DL:       ; %bb.0: ; %entry
3328; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3329; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3330; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
3331; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
3332; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
3333; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3334; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
3335; GFX11-DL-NEXT:    s_clause 0x1
3336; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
3337; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
3338; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
3339; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, 0
3340; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[4:5]
3341; GFX11-DL-NEXT:    s_endpgm
3342                                       ptr addrspace(1) %src2,
3343                                       ptr addrspace(1) nocapture %dst) {
3344entry:
; Each work-item loads one <4 x i8> from %src1 and one from %src2, indexed by
; its workitem id in x.
3345  %idx = call i32 @llvm.amdgcn.workitem.id.x()
3346  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
3347  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
3348  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
3349  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
3350
; Product of the zero-extended element 0 of each vector.
3351  %v1e0 = extractelement <4 x i8> %vec1, i64 0
3352  %cv1e0 = zext i8 %v1e0 to i32
3353  %v2e0 = extractelement <4 x i8> %vec2, i64 0
3354  %cv2e0 = zext i8 %v2e0 to i32
3355  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
3356
; Product of the zero-extended element 1 of each vector.
3357  %v1e1 = extractelement <4 x i8> %vec1, i64 1
3358  %cv1e1 = zext i8 %v1e1 to i32
3359  %v2e1 = extractelement <4 x i8> %vec2, i64 1
3360  %cv2e1 = zext i8 %v2e1 to i32
3361  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
3362
; Product of the zero-extended element 2 of each vector.
3363  %v1e2 = extractelement <4 x i8> %vec1, i64 2
3364  %cv1e2 = zext i8 %v1e2 to i32
3365  %v2e2 = extractelement <4 x i8> %vec2, i64 2
3366  %cv2e2 = zext i8 %v2e2 to i32
3367  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
3368
; Product of the zero-extended element 3 of each vector.
3369  %v1e3 = extractelement <4 x i8> %vec1, i64 3
3370  %cv1e3 = zext i8 %v1e3 to i32
3371  %v2e3 = extractelement <4 x i8> %vec2, i64 3
3372  %cv2e3 = zext i8 %v2e3 to i32
3373  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
3374
; Sum the four products with no prior accumulator value and store the result
; to %dst.
3375  %add2 = add i32 %mul1, %mul2
3376  %add3 = add i32 %add2, %mul3
3377  %add4 = add i32 %add3, %mul4
3378  store i32 %add4, ptr addrspace(1) %dst, align 4
3379  ret void
3380}
3381
3382define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
3383; GFX7-LABEL: udot4_acc32_3src:
3384; GFX7:       ; %bb.0: ; %entry
3385; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
3386; GFX7-NEXT:    s_mov_b32 s11, 0xf000
3387; GFX7-NEXT:    s_mov_b32 s14, 0
3388; GFX7-NEXT:    s_mov_b32 s15, s11
3389; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3390; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3391; GFX7-NEXT:    s_mov_b64 s[12:13], s[0:1]
3392; GFX7-NEXT:    v_mov_b32_e32 v1, 0
3393; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
3394; GFX7-NEXT:    s_mov_b64 s[12:13], s[2:3]
3395; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
3396; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
3397; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
3398; GFX7-NEXT:    s_load_dword s0, s[6:7], 0x0
3399; GFX7-NEXT:    s_mov_b32 s10, -1
3400; GFX7-NEXT:    s_mov_b32 s8, s6
3401; GFX7-NEXT:    s_mov_b32 s9, s7
3402; GFX7-NEXT:    s_waitcnt vmcnt(2)
3403; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
3404; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
3405; GFX7-NEXT:    s_waitcnt vmcnt(1)
3406; GFX7-NEXT:    v_bfe_u32 v3, v3, 8, 8
3407; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3408; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v1, s0
3409; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
3410; GFX7-NEXT:    s_waitcnt vmcnt(0)
3411; GFX7-NEXT:    v_bfe_u32 v6, v0, 16, 8
3412; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v3, v1
3413; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
3414; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
3415; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v6, v1
3416; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
3417; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0
3418; GFX7-NEXT:    s_endpgm
3419;
3420; GFX8-LABEL: udot4_acc32_3src:
3421; GFX8:       ; %bb.0: ; %entry
3422; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
3423; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3424; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3425; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3426; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3427; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3428; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3429; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3430; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3431; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3432; GFX8-NEXT:    flat_load_dword v4, v[0:1]
3433; GFX8-NEXT:    v_mov_b32_e32 v1, s5
3434; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
3435; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3436; GFX8-NEXT:    flat_load_dword v0, v[0:1]
3437; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
3438; GFX8-NEXT:    s_waitcnt vmcnt(2)
3439; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
3440; GFX8-NEXT:    v_bfe_u32 v2, v3, 8, 8
3441; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3442; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v1, s0
3443; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 8
3444; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
3445; GFX8-NEXT:    s_waitcnt vmcnt(1)
3446; GFX8-NEXT:    v_bfe_u32 v4, v4, 8, 8
3447; GFX8-NEXT:    v_mad_u32_u24 v1, v2, v4, v1
3448; GFX8-NEXT:    s_waitcnt vmcnt(0)
3449; GFX8-NEXT:    v_bfe_u32 v6, v0, 16, 8
3450; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
3451; GFX8-NEXT:    v_mad_u32_u24 v1, v5, v6, v1
3452; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
3453; GFX8-NEXT:    v_mov_b32_e32 v0, s6
3454; GFX8-NEXT:    v_mov_b32_e32 v1, s7
3455; GFX8-NEXT:    flat_store_dword v[0:1], v2
3456; GFX8-NEXT:    s_endpgm
3457;
3458; GFX9-NODL-LABEL: udot4_acc32_3src:
3459; GFX9-NODL:       ; %bb.0: ; %entry
3460; GFX9-NODL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
3461; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3462; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
3463; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[8:9]
3464; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[10:11]
3465; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[12:13]
3466; GFX9-NODL-NEXT:    s_load_dword s0, s[14:15], 0x0
3467; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
3468; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
3469; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
3470; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
3471; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
3472; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
3473; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
3474; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
3475; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
3476; GFX9-NODL-NEXT:    v_add3_u32 v2, v4, s0, v2
3477; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
3478; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[14:15]
3479; GFX9-NODL-NEXT:    s_endpgm
3480;
3481; GFX9-DL-LABEL: udot4_acc32_3src:
3482; GFX9-DL:       ; %bb.0: ; %entry
3483; GFX9-DL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
3484; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3485; GFX9-DL-NEXT:    s_mov_b32 s0, 0x706010c
3486; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc0c0c00
3487; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
3488; GFX9-DL-NEXT:    global_load_dword v1, v0, s[10:11]
3489; GFX9-DL-NEXT:    global_load_dword v2, v0, s[12:13]
3490; GFX9-DL-NEXT:    global_load_dword v3, v0, s[8:9]
3491; GFX9-DL-NEXT:    s_load_dword s1, s[14:15], 0x0
3492; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
3493; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
3494; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s0
3495; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
3496; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
3497; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
3498; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
3499; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v3, v1, s1
3500; GFX9-DL-NEXT:    global_store_dword v0, v1, s[14:15]
3501; GFX9-DL-NEXT:    s_endpgm
3502;
3503; GFX10-DL-LABEL: udot4_acc32_3src:
3504; GFX10-DL:       ; %bb.0: ; %entry
3505; GFX10-DL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
3506; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3507; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3508; GFX10-DL-NEXT:    s_clause 0x2
3509; GFX10-DL-NEXT:    global_load_dword v1, v0, s[10:11]
3510; GFX10-DL-NEXT:    global_load_dword v2, v0, s[12:13]
3511; GFX10-DL-NEXT:    global_load_dword v3, v0, s[8:9]
3512; GFX10-DL-NEXT:    s_load_dword s0, s[14:15], 0x0
3513; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
3514; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0x706010c
3515; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
3516; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc0c0c00
3517; GFX10-DL-NEXT:    v_or_b32_e32 v0, v0, v1
3518; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
3519; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3520; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v3, v0, s0
3521; GFX10-DL-NEXT:    global_store_dword v1, v0, s[14:15]
3522; GFX10-DL-NEXT:    s_endpgm
3523;
3524; GFX11-DL-LABEL: udot4_acc32_3src:
3525; GFX11-DL:       ; %bb.0: ; %entry
3526; GFX11-DL-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
3527; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3528; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3529; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3530; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
3531; GFX11-DL-NEXT:    s_clause 0x2
3532; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3]
3533; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
3534; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
3535; GFX11-DL-NEXT:    s_load_b32 s0, s[6:7], 0x0
3536; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
3537; GFX11-DL-NEXT:    v_perm_b32 v1, v2, v1, 0x706010c
3538; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
3539; GFX11-DL-NEXT:    v_perm_b32 v2, v0, v0, 0xc0c0c00
3540; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
3541; GFX11-DL-NEXT:    v_or_b32_e32 v1, v1, v2
3542; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
3543; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
3544; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s0
3545; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
3546; GFX11-DL-NEXT:    s_endpgm
3547                                       ptr addrspace(1) %src2,
3548                                       ptr addrspace(1) %src3,
3549                                       ptr addrspace(1) nocapture %dst) {
3550entry:
3551  %idx = call i32 @llvm.amdgcn.workitem.id.x()
3552  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
3553  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
3554  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
3555  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
3556  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
3557  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
3558
3559  %v1e0 = extractelement <4 x i8> %vec1, i64 0
3560  %cv1e0 = zext i8 %v1e0 to i32
3561  %mul1 = mul nuw nsw i32 %cv1e0, %cv1e0
3562
3563  %v1e1 = extractelement <4 x i8> %vec1, i64 1
3564  %cv1e1 = zext i8 %v1e1 to i32
3565  %v2e1 = extractelement <4 x i8> %vec2, i64 1
3566  %cv2e1 = zext i8 %v2e1 to i32
3567  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
3568
3569  %v1e2 = extractelement <4 x i8> %vec1, i64 2
3570  %cv1e2 = zext i8 %v1e2 to i32
3571  %v3e2 = extractelement <4 x i8> %vec3, i64 2
3572  %cv3e2 = zext i8 %v3e2 to i32
3573  %mul3 = mul nuw nsw i32 %cv1e2, %cv3e2
3574
3575  %v1e3 = extractelement <4 x i8> %vec1, i64 3
3576  %cv1e3 = zext i8 %v1e3 to i32
3577  %v3e3 = extractelement <4 x i8> %vec3, i64 3
3578  %cv3e3 = zext i8 %v3e3 to i32
3579  %mul4 = mul nuw nsw i32 %cv1e3, %cv3e3
3580
3581  %acc = load i32, ptr addrspace(1) %dst, align 4
3582  %mad1 = add i32 %mul1, %acc
3583  %mad2 = add i32 %mad1, %mul2
3584  %mad3 = add i32 %mad2, %mul3
3585  %mad4 = add i32 %mad3, %mul4
3586
3587  store i32 %mad4, ptr addrspace(1) %dst, align 4
3588  ret void
3589}
3590
; Test: three-term unsigned byte dot product with a 32-bit accumulator whose
; multiplicands come from three different source vectors.
;   result = vec1[0]*vec1[0] + vec1[1]*vec2[1] + vec1[2]*vec3[2] + *dst
; Element 3 of every vector is left out of the sum. The assertions below show
; the *-DL targets assembling the operands with v_perm_b32/v_or_b32 and issuing
; a single v_dot4_u32_u8, while GFX7, GFX8 and GFX9-NODL expand the sum into
; v_mad_u32_u24 / v_mul_u32_u24_sdwa / v_add3_u32 sequences.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them instead of editing by hand.
3591define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
3592; GFX7-LABEL: udot4_acc32_3src_3ele:
3593; GFX7:       ; %bb.0: ; %entry
3594; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
3595; GFX7-NEXT:    s_mov_b32 s11, 0xf000
3596; GFX7-NEXT:    s_mov_b32 s14, 0
3597; GFX7-NEXT:    s_mov_b32 s15, s11
3598; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3599; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3600; GFX7-NEXT:    s_mov_b64 s[12:13], s[0:1]
3601; GFX7-NEXT:    v_mov_b32_e32 v1, 0
3602; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
3603; GFX7-NEXT:    s_mov_b64 s[12:13], s[2:3]
3604; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
3605; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
3606; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
3607; GFX7-NEXT:    s_load_dword s0, s[6:7], 0x0
3608; GFX7-NEXT:    s_mov_b32 s10, -1
3609; GFX7-NEXT:    s_mov_b32 s8, s6
3610; GFX7-NEXT:    s_mov_b32 s9, s7
3611; GFX7-NEXT:    s_waitcnt vmcnt(2)
3612; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
3613; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
3614; GFX7-NEXT:    s_waitcnt vmcnt(1)
3615; GFX7-NEXT:    v_bfe_u32 v3, v3, 8, 8
3616; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3617; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v1, s0
3618; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
3619; GFX7-NEXT:    s_waitcnt vmcnt(0)
3620; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
3621; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v3, v1
3622; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
3623; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0
3624; GFX7-NEXT:    s_endpgm
3625;
3626; GFX8-LABEL: udot4_acc32_3src_3ele:
3627; GFX8:       ; %bb.0: ; %entry
3628; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
3629; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3630; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3631; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3632; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3633; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3634; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3635; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3636; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3637; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3638; GFX8-NEXT:    flat_load_dword v4, v[0:1]
3639; GFX8-NEXT:    v_mov_b32_e32 v1, s5
3640; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
3641; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3642; GFX8-NEXT:    flat_load_dword v0, v[0:1]
3643; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
3644; GFX8-NEXT:    s_waitcnt vmcnt(2)
3645; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
3646; GFX8-NEXT:    v_bfe_u32 v2, v3, 8, 8
3647; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3648; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v1, s0
3649; GFX8-NEXT:    v_bfe_u32 v3, v3, 16, 8
3650; GFX8-NEXT:    s_waitcnt vmcnt(1)
3651; GFX8-NEXT:    v_bfe_u32 v4, v4, 8, 8
3652; GFX8-NEXT:    v_mad_u32_u24 v1, v2, v4, v1
3653; GFX8-NEXT:    s_waitcnt vmcnt(0)
3654; GFX8-NEXT:    v_bfe_u32 v0, v0, 16, 8
3655; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
3656; GFX8-NEXT:    v_mov_b32_e32 v0, s6
3657; GFX8-NEXT:    v_mov_b32_e32 v1, s7
3658; GFX8-NEXT:    flat_store_dword v[0:1], v2
3659; GFX8-NEXT:    s_endpgm
3660;
3661; GFX9-NODL-LABEL: udot4_acc32_3src_3ele:
3662; GFX9-NODL:       ; %bb.0: ; %entry
3663; GFX9-NODL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
3664; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3665; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
3666; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[8:9]
3667; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[10:11]
3668; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[12:13]
3669; GFX9-NODL-NEXT:    s_load_dword s0, s[14:15], 0x0
3670; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
3671; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
3672; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v1
3673; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
3674; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
3675; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
3676; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
3677; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
3678; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, v4, v4, s0
3679; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, v2, v1
3680; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[14:15]
3681; GFX9-NODL-NEXT:    s_endpgm
3682;
3683; GFX9-DL-LABEL: udot4_acc32_3src_3ele:
3684; GFX9-DL:       ; %bb.0: ; %entry
3685; GFX9-DL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
3686; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3687; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc06010c
3688; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0c00
3689; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc020100
3690; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
3691; GFX9-DL-NEXT:    global_load_dword v1, v0, s[10:11]
3692; GFX9-DL-NEXT:    global_load_dword v2, v0, s[12:13]
3693; GFX9-DL-NEXT:    global_load_dword v3, v0, s[8:9]
3694; GFX9-DL-NEXT:    s_load_dword s3, s[14:15], 0x0
3695; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
3696; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
3697; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s0
3698; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
3699; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s1
3700; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
3701; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
3702; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
3703; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s3
3704; GFX9-DL-NEXT:    global_store_dword v0, v1, s[14:15]
3705; GFX9-DL-NEXT:    s_endpgm
3706;
3707; GFX10-DL-LABEL: udot4_acc32_3src_3ele:
3708; GFX10-DL:       ; %bb.0: ; %entry
3709; GFX10-DL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
3710; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3711; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3712; GFX10-DL-NEXT:    s_clause 0x2
3713; GFX10-DL-NEXT:    global_load_dword v1, v0, s[10:11]
3714; GFX10-DL-NEXT:    global_load_dword v2, v0, s[12:13]
3715; GFX10-DL-NEXT:    global_load_dword v3, v0, s[8:9]
3716; GFX10-DL-NEXT:    s_load_dword s0, s[14:15], 0x0
3717; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
3718; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0xc06010c
3719; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
3720; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc0c0c00
3721; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
3722; GFX10-DL-NEXT:    v_or_b32_e32 v0, v0, v1
3723; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc020100
3724; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3725; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s0
3726; GFX10-DL-NEXT:    global_store_dword v2, v0, s[14:15]
3727; GFX10-DL-NEXT:    s_endpgm
3728;
3729; GFX11-DL-LABEL: udot4_acc32_3src_3ele:
3730; GFX11-DL:       ; %bb.0: ; %entry
3731; GFX11-DL-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
3732; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3733; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3734; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3735; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
3736; GFX11-DL-NEXT:    s_clause 0x2
3737; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3]
3738; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
3739; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
3740; GFX11-DL-NEXT:    s_load_b32 s0, s[6:7], 0x0
3741; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
3742; GFX11-DL-NEXT:    v_perm_b32 v1, v2, v1, 0xc06010c
3743; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
3744; GFX11-DL-NEXT:    v_perm_b32 v2, v0, v0, 0xc0c0c00
3745; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020100
3746; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
3747; GFX11-DL-NEXT:    v_or_b32_e32 v1, v1, v2
3748; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
3749; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
3750; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s0
3751; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
3752; GFX11-DL-NEXT:    s_endpgm
3753                                       ptr addrspace(1) %src2,
3754                                       ptr addrspace(1) %src3,
3755                                       ptr addrspace(1) nocapture %dst) {
3756entry:
  ; Each work item loads one <4 x i8> from each of the three sources, indexed
  ; by its workitem id.
3757  %idx = call i32 @llvm.amdgcn.workitem.id.x()
3758  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
3759  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
3760  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
3761  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
3762  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
3763  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
3764
  ; Product 1: vec1[0] squared; both multiplicands are the same zero-extended byte.
3765  %v1e0 = extractelement <4 x i8> %vec1, i64 0
3766  %cv1e0 = zext i8 %v1e0 to i32
3767  %mul1 = mul nuw nsw i32 %cv1e0, %cv1e0
3768
  ; Product 2: vec1[1] * vec2[1].
3769  %v1e1 = extractelement <4 x i8> %vec1, i64 1
3770  %cv1e1 = zext i8 %v1e1 to i32
3771  %v2e1 = extractelement <4 x i8> %vec2, i64 1
3772  %cv2e1 = zext i8 %v2e1 to i32
3773  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
3774
  ; Product 3: vec1[2] * vec3[2].
3775  %v1e2 = extractelement <4 x i8> %vec1, i64 2
3776  %cv1e2 = zext i8 %v1e2 to i32
3777  %v3e2 = extractelement <4 x i8> %vec3, i64 2
3778  %cv3e2 = zext i8 %v3e2 to i32
3779  %mul3 = mul nuw nsw i32 %cv1e2, %cv3e2
3780
3781
  ; Accumulate the three products onto the i32 read from %dst, then write the
  ; sum back to the same address.
3782  %acc = load i32, ptr addrspace(1) %dst, align 4
3783  %mad1 = add i32 %mul1, %acc
3784  %mad2 = add i32 %mad1, %mul2
3785  %mad3 = add i32 %mad2, %mul3
3786
3787  store i32 %mad3, ptr addrspace(1) %dst, align 4
3788  ret void
3789}
3790
; Test: three-term unsigned sum of products in which one multiplicand is not a
; byte taken from a vector but the zero-extended i16 kernel argument
; %badsource.
;   result = vec1[0]*zext(%badsource) + vec1[1]*vec2[1] + vec1[2]*vec2[2] + *dst
; The assertions below show the *-DL targets computing the first product with
; a separate v_mad_u32_u24 and passing its result as the accumulator operand of
; v_dot4_u32_u8, which covers the two remaining byte products. GFX7, GFX8 and
; GFX9-NODL contain no dot instruction.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them instead of editing by hand.
3791define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1,
3792; GFX7-LABEL: udot4_bad_source:
3793; GFX7:       ; %bb.0: ; %entry
3794; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3795; GFX7-NEXT:    s_load_dword s12, s[4:5], 0xf
3796; GFX7-NEXT:    s_mov_b32 s7, 0xf000
3797; GFX7-NEXT:    s_mov_b32 s10, 0
3798; GFX7-NEXT:    s_mov_b32 s11, s7
3799; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3800; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
3801; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3802; GFX7-NEXT:    v_mov_b32_e32 v1, 0
3803; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
3804; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
3805; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
3806; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x11
3807; GFX7-NEXT:    s_and_b32 s1, s12, 0xffff
3808; GFX7-NEXT:    s_mov_b32 s6, -1
3809; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3810; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
3811; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3812; GFX7-NEXT:    v_mov_b32_e32 v1, s0
3813; GFX7-NEXT:    s_waitcnt vmcnt(1)
3814; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
3815; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
3816; GFX7-NEXT:    s_waitcnt vmcnt(0)
3817; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
3818; GFX7-NEXT:    v_mad_u32_u24 v1, v3, s1, v1
3819; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
3820; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
3821; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v5, v1
3822; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
3823; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3824; GFX7-NEXT:    s_endpgm
3825;
3826; GFX8-LABEL: udot4_bad_source:
3827; GFX8:       ; %bb.0: ; %entry
3828; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3829; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x3c
3830; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3831; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3832; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3833; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3834; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3835; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3836; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3837; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3838; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3839; GFX8-NEXT:    flat_load_dword v0, v[0:1]
3840; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
3841; GFX8-NEXT:    s_and_b32 s3, s6, 0xffff
3842; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3843; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
3844; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3845; GFX8-NEXT:    v_mov_b32_e32 v1, s2
3846; GFX8-NEXT:    s_waitcnt vmcnt(1)
3847; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v3
3848; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
3849; GFX8-NEXT:    v_mad_u32_u24 v1, v2, s3, v1
3850; GFX8-NEXT:    v_bfe_u32 v3, v3, 16, 8
3851; GFX8-NEXT:    s_waitcnt vmcnt(0)
3852; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
3853; GFX8-NEXT:    v_bfe_u32 v0, v0, 16, 8
3854; GFX8-NEXT:    v_mad_u32_u24 v1, v4, v5, v1
3855; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
3856; GFX8-NEXT:    v_mov_b32_e32 v0, s0
3857; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3858; GFX8-NEXT:    flat_store_dword v[0:1], v2
3859; GFX8-NEXT:    s_endpgm
3860;
3861; GFX9-NODL-LABEL: udot4_bad_source:
3862; GFX9-NODL:       ; %bb.0: ; %entry
3863; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3864; GFX9-NODL-NEXT:    s_load_dword s6, s[4:5], 0x3c
3865; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3866; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
3867; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
3868; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
3869; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
3870; GFX9-NODL-NEXT:    s_and_b32 s3, s6, 0xffff
3871; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
3872; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
3873; GFX9-NODL-NEXT:    s_load_dword s2, s[0:1], 0x0
3874; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
3875; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
3876; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
3877; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
3878; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
3879; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
3880; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
3881; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, s3, v2
3882; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v4, v1
3883; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
3884; GFX9-NODL-NEXT:    s_endpgm
3885;
3886; GFX9-DL-LABEL: udot4_bad_source:
3887; GFX9-DL:       ; %bb.0: ; %entry
3888; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3889; GFX9-DL-NEXT:    s_load_dword s6, s[4:5], 0x3c
3890; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3891; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
3892; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
3893; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
3894; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
3895; GFX9-DL-NEXT:    s_mov_b32 s3, 0xc0c0201
3896; GFX9-DL-NEXT:    s_and_b32 s4, s6, 0xffff
3897; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
3898; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
3899; GFX9-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
3900; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
3901; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
3902; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
3903; GFX9-DL-NEXT:    v_and_b32_e32 v4, 0xff, v1
3904; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
3905; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s3
3906; GFX9-DL-NEXT:    v_mad_u32_u24 v3, v4, s4, v3
3907; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s3
3908; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, v3
3909; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
3910; GFX9-DL-NEXT:    s_endpgm
3911;
3912; GFX10-DL-LABEL: udot4_bad_source:
3913; GFX10-DL:       ; %bb.0: ; %entry
3914; GFX10-DL-NEXT:    s_clause 0x1
3915; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3916; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x3c
3917; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3918; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
3919; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3920; GFX10-DL-NEXT:    s_clause 0x1
3921; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
3922; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
3923; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
3924; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
3925; GFX10-DL-NEXT:    s_and_b32 s3, s6, 0xffff
3926; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3927; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
3928; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
3929; GFX10-DL-NEXT:    v_and_b32_e32 v0, 0xff, v1
3930; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
3931; GFX10-DL-NEXT:    v_perm_b32 v2, v2, v2, 0xc0c0201
3932; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0201
3933; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3934; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, s3, s2
3935; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v2, v0
3936; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
3937; GFX10-DL-NEXT:    s_endpgm
3938;
3939; GFX11-DL-LABEL: udot4_bad_source:
3940; GFX11-DL:       ; %bb.0: ; %entry
3941; GFX11-DL-NEXT:    s_clause 0x1
3942; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3943; GFX11-DL-NEXT:    s_load_b32 s6, s[4:5], 0x3c
3944; GFX11-DL-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
3945; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3946; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3947; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
3948; GFX11-DL-NEXT:    s_clause 0x1
3949; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
3950; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
3951; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
3952; GFX11-DL-NEXT:    s_and_b32 s3, s6, 0xffff
3953; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
3954; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
3955; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
3956; GFX11-DL-NEXT:    v_and_b32_e32 v2, 0xff, v1
3957; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
3958; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc0c0201
3959; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0201
3960; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
3961; GFX11-DL-NEXT:    v_mad_u32_u24 v2, v2, s3, s2
3962; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3963; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, v2
3964; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[0:1]
3965; GFX11-DL-NEXT:    s_endpgm
3966                                       ptr addrspace(1) %src2,
3967                                       ptr addrspace(1) %src3,
3968                                       i16 %badsource,
3969                                       ptr addrspace(1) nocapture %dst) {
3970entry:
  ; Each work item loads one <4 x i8> per source pointer, indexed by its
  ; workitem id.
  ; NOTE(review): %vec3 (and %gep3) is never used below, and none of the
  ; assertions load from %src3 -- presumably carried over from the 3-source
  ; tests; confirm before removing, since that requires regenerating the checks.
3971  %idx = call i32 @llvm.amdgcn.workitem.id.x()
3972  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
3973  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
3974  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
3975  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
3976  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
3977  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
3978
  ; Product 1: vec1[0] * zext(%badsource). The second multiplicand is a 16-bit
  ; scalar argument, not a vector byte.
  ; NOTE(review): %v2e0 is extracted but has no users -- looks intentional for
  ; the "bad source" setup, but confirm.
3979  %v1e0 = extractelement <4 x i8> %vec1, i64 0
3980  %cv1e0 = zext i8 %v1e0 to i32
3981  %v2e0 = extractelement <4 x i8> %vec2, i64 0
3982  %other = zext i16 %badsource to i32
3983  %mul1 = mul nuw nsw i32 %cv1e0, %other
3984
  ; Product 2: vec1[1] * vec2[1].
3985  %v1e1 = extractelement <4 x i8> %vec1, i64 1
3986  %cv1e1 = zext i8 %v1e1 to i32
3987  %v2e1 = extractelement <4 x i8> %vec2, i64 1
3988  %cv2e1 = zext i8 %v2e1 to i32
3989  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
3990
  ; Product 3: vec1[2] * vec2[2].
3991  %v2e2 = extractelement <4 x i8> %vec2, i64 2
3992  %cv2e2 = zext i8 %v2e2 to i32
3993  %v1e2 = extractelement <4 x i8> %vec1, i64 2
3994  %cv1e2 = zext i8 %v1e2 to i32
3995  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
3996
3997
  ; Accumulate the three products onto the i32 read from %dst, then write the
  ; sum back to the same address.
3998  %acc = load i32, ptr addrspace(1) %dst, align 4
3999  %mad1 = add i32 %mul1, %acc
4000  %mad2 = add i32 %mad1, %mul2
4001  %mad3 = add i32 %mad2, %mul3
4002
4003  store i32 %mad3, ptr addrspace(1) %dst, align 4
4004  ret void
4005}
4006
4007
4008define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1,
4009; GFX7-LABEL: udot4_commutative:
4010; GFX7:       ; %bb.0: ; %entry
4011; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
4012; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xf
4013; GFX7-NEXT:    s_mov_b32 s3, 0xf000
4014; GFX7-NEXT:    s_mov_b32 s6, 0
4015; GFX7-NEXT:    s_mov_b32 s7, s3
4016; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4017; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
4018; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4019; GFX7-NEXT:    v_mov_b32_e32 v1, 0
4020; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
4021; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
4022; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
4023; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
4024; GFX7-NEXT:    s_mov_b32 s2, -1
4025; GFX7-NEXT:    s_waitcnt vmcnt(1)
4026; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
4027; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
4028; GFX7-NEXT:    s_waitcnt vmcnt(0)
4029; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
4030; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
4031; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4032; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v4, s4
4033; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
4034; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
4035; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v5, v1
4036; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
4037; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4038; GFX7-NEXT:    s_endpgm
4039;
4040; GFX8-LABEL: udot4_commutative:
4041; GFX8:       ; %bb.0: ; %entry
4042; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4043; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x3c
4044; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
4045; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4046; GFX8-NEXT:    v_mov_b32_e32 v1, s1
4047; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
4048; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4049; GFX8-NEXT:    flat_load_dword v3, v[0:1]
4050; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4051; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
4052; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4053; GFX8-NEXT:    flat_load_dword v0, v[0:1]
4054; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
4055; GFX8-NEXT:    s_waitcnt vmcnt(1)
4056; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
4057; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
4058; GFX8-NEXT:    v_bfe_u32 v3, v3, 16, 8
4059; GFX8-NEXT:    s_waitcnt vmcnt(0)
4060; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
4061; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
4062; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4063; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s0
4064; GFX8-NEXT:    v_bfe_u32 v0, v0, 16, 8
4065; GFX8-NEXT:    v_mad_u32_u24 v1, v4, v5, v1
4066; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
4067; GFX8-NEXT:    v_mov_b32_e32 v0, s4
4068; GFX8-NEXT:    v_mov_b32_e32 v1, s5
4069; GFX8-NEXT:    flat_store_dword v[0:1], v2
4070; GFX8-NEXT:    s_endpgm
4071;
4072; GFX9-NODL-LABEL: udot4_commutative:
4073; GFX9-NODL:       ; %bb.0: ; %entry
4074; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4075; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
4076; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4077; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
4078; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
4079; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
4080; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
4081; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
4082; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
4083; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
4084; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
4085; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v2
4086; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
4087; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
4088; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
4089; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, v4, s0
4090; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
4091; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
4092; GFX9-NODL-NEXT:    s_endpgm
4093;
4094; GFX9-DL-LABEL: udot4_commutative:
4095; GFX9-DL:       ; %bb.0: ; %entry
4096; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4097; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
4098; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4099; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
4100; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
4101; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
4102; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
4103; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc020100
4104; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
4105; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
4106; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
4107; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
4108; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
4109; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
4110; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s0
4111; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
4112; GFX9-DL-NEXT:    s_endpgm
4113;
4114; GFX10-DL-LABEL: udot4_commutative:
4115; GFX10-DL:       ; %bb.0: ; %entry
4116; GFX10-DL-NEXT:    s_clause 0x1
4117; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4118; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
4119; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4120; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
4121; GFX10-DL-NEXT:    s_clause 0x1
4122; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
4123; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
4124; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
4125; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
4126; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
4127; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc020100
4128; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
4129; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc020100
4130; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
4131; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
4132; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s0
4133; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
4134; GFX10-DL-NEXT:    s_endpgm
4135;
4136; GFX11-DL-LABEL: udot4_commutative:
4137; GFX11-DL:       ; %bb.0: ; %entry
4138; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
4139; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
4140; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x3c
4141; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
4142; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
4143; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4144; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
4145; GFX11-DL-NEXT:    s_clause 0x1
4146; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3]
4147; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
4148; GFX11-DL-NEXT:    s_load_b32 s0, s[4:5], 0x0
4149; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
4150; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc020100
4151; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
4152; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020100
4153; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
4154; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4155; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s0
4156; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[4:5]
4157; GFX11-DL-NEXT:    s_endpgm
4158                                       ptr addrspace(1) %src2,
4159                                       ptr addrspace(1) %src3,
4160                                       ptr addrspace(1) nocapture %dst) {
4161entry:
4162  %idx = call i32 @llvm.amdgcn.workitem.id.x()
4163  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
4164  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
4165  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
4166  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
4167  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
4168  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
4169
4170  %v1e0 = extractelement <4 x i8> %vec1, i64 0
4171  %cv1e0 = zext i8 %v1e0 to i32
4172  %v2e0 = extractelement <4 x i8> %vec2, i64 0
4173  %cv2e0 = zext i8 %v2e0 to i32
4174  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
4175
4176  %v1e1 = extractelement <4 x i8> %vec1, i64 1
4177  %cv1e1 = zext i8 %v1e1 to i32
4178  %v2e1 = extractelement <4 x i8> %vec2, i64 1
4179  %cv2e1 = zext i8 %v2e1 to i32
4180  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
4181
4182  %v2e2 = extractelement <4 x i8> %vec2, i64 2
4183  %cv2e2 = zext i8 %v2e2 to i32
4184  %v1e2 = extractelement <4 x i8> %vec1, i64 2
4185  %cv1e2 = zext i8 %v1e2 to i32
4186  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
4187
4188
4189  %acc = load i32, ptr addrspace(1) %dst, align 4
4190  %mad1 = add i32 %mul1, %acc
4191  %mad2 = add i32 %mad1, %mul2
4192  %mad3 = add i32 %mad2, %mul3
4193
4194  store i32 %mad3, ptr addrspace(1) %dst, align 4
4195  ret void
4196}
4197
; Three-product unsigned i8 multiply-accumulate whose operands come from
; three different <4 x i8> sources, accumulated onto the i32 stored at %dst:
;   dst = dst + vec2[1]*vec2[1] + vec1[1]*vec2[1] + vec2[2]*vec3[2]
; %vec2 supplies one factor of every product. For the dot-instruction targets
; (gfx906, gfx1011, gfx1012, gfx1100) the expected code gathers the bytes with
; v_perm_b32 / v_or_b32 and issues a single v_dot4_u32_u8. For gfx700, gfx803
; and gfx900 the expected code uses v_mad_u32_u24 (plus v_mul_u32_u24_sdwa and
; v_add3_u32 on gfx900) and no dot instruction.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
4198define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
4199; GFX7-LABEL: udot4_acc32_3src_3ele_src0:
4200; GFX7:       ; %bb.0: ; %entry
4201; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
4202; GFX7-NEXT:    s_mov_b32 s11, 0xf000
4203; GFX7-NEXT:    s_mov_b32 s14, 0
4204; GFX7-NEXT:    s_mov_b32 s15, s11
4205; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4206; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4207; GFX7-NEXT:    s_mov_b64 s[12:13], s[0:1]
4208; GFX7-NEXT:    v_mov_b32_e32 v1, 0
4209; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
4210; GFX7-NEXT:    s_mov_b64 s[12:13], s[2:3]
4211; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
4212; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
4213; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
4214; GFX7-NEXT:    s_load_dword s0, s[6:7], 0x0
4215; GFX7-NEXT:    s_mov_b32 s10, -1
4216; GFX7-NEXT:    s_mov_b32 s8, s6
4217; GFX7-NEXT:    s_mov_b32 s9, s7
4218; GFX7-NEXT:    s_waitcnt vmcnt(2)
4219; GFX7-NEXT:    v_bfe_u32 v1, v2, 8, 8
4220; GFX7-NEXT:    s_waitcnt vmcnt(1)
4221; GFX7-NEXT:    v_bfe_u32 v2, v3, 8, 8
4222; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4223; GFX7-NEXT:    v_mad_u32_u24 v4, v2, v2, s0
4224; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 8
4225; GFX7-NEXT:    s_waitcnt vmcnt(0)
4226; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
4227; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v2, v4
4228; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v0, v1
4229; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0
4230; GFX7-NEXT:    s_endpgm
4231;
4232; GFX8-LABEL: udot4_acc32_3src_3ele_src0:
4233; GFX8:       ; %bb.0: ; %entry
4234; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
4235; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
4236; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4237; GFX8-NEXT:    v_mov_b32_e32 v1, s1
4238; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
4239; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4240; GFX8-NEXT:    flat_load_dword v3, v[0:1]
4241; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4242; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
4243; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4244; GFX8-NEXT:    flat_load_dword v4, v[0:1]
4245; GFX8-NEXT:    v_mov_b32_e32 v1, s5
4246; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
4247; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4248; GFX8-NEXT:    flat_load_dword v0, v[0:1]
4249; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
4250; GFX8-NEXT:    s_waitcnt vmcnt(2)
4251; GFX8-NEXT:    v_bfe_u32 v2, v3, 8, 8
4252; GFX8-NEXT:    s_waitcnt vmcnt(1)
4253; GFX8-NEXT:    v_bfe_u32 v1, v4, 8, 8
4254; GFX8-NEXT:    v_bfe_u32 v3, v4, 16, 8
4255; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4256; GFX8-NEXT:    v_mad_u32_u24 v4, v1, v1, s0
4257; GFX8-NEXT:    v_mad_u32_u24 v1, v2, v1, v4
4258; GFX8-NEXT:    s_waitcnt vmcnt(0)
4259; GFX8-NEXT:    v_bfe_u32 v0, v0, 16, 8
4260; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
4261; GFX8-NEXT:    v_mov_b32_e32 v0, s6
4262; GFX8-NEXT:    v_mov_b32_e32 v1, s7
4263; GFX8-NEXT:    flat_store_dword v[0:1], v2
4264; GFX8-NEXT:    s_endpgm
4265;
4266; GFX9-NODL-LABEL: udot4_acc32_3src_3ele_src0:
4267; GFX9-NODL:       ; %bb.0: ; %entry
4268; GFX9-NODL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
4269; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4270; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
4271; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[10:11]
4272; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[12:13]
4273; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[8:9]
4274; GFX9-NODL-NEXT:    s_load_dword s0, s[14:15], 0x0
4275; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
4276; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
4277; GFX9-NODL-NEXT:    v_bfe_u32 v4, v1, 8, 8
4278; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
4279; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
4280; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
4281; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
4282; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
4283; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, v4, v4, s0
4284; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, v2, v1
4285; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[14:15]
4286; GFX9-NODL-NEXT:    s_endpgm
4287;
4288; GFX9-DL-LABEL: udot4_acc32_3src_3ele_src0:
4289; GFX9-DL:       ; %bb.0: ; %entry
4290; GFX9-DL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
4291; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4292; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc06010c
4293; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0c01
4294; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc020101
4295; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
4296; GFX9-DL-NEXT:    global_load_dword v1, v0, s[12:13]
4297; GFX9-DL-NEXT:    global_load_dword v2, v0, s[8:9]
4298; GFX9-DL-NEXT:    global_load_dword v3, v0, s[10:11]
4299; GFX9-DL-NEXT:    s_load_dword s3, s[14:15], 0x0
4300; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
4301; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
4302; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v2, s0
4303; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
4304; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s1
4305; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
4306; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
4307; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
4308; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, s3
4309; GFX9-DL-NEXT:    global_store_dword v0, v1, s[14:15]
4310; GFX9-DL-NEXT:    s_endpgm
4311;
4312; GFX10-DL-LABEL: udot4_acc32_3src_3ele_src0:
4313; GFX10-DL:       ; %bb.0: ; %entry
4314; GFX10-DL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
4315; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4316; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
4317; GFX10-DL-NEXT:    s_clause 0x2
4318; GFX10-DL-NEXT:    global_load_dword v1, v0, s[12:13]
4319; GFX10-DL-NEXT:    global_load_dword v2, v0, s[8:9]
4320; GFX10-DL-NEXT:    global_load_dword v3, v0, s[10:11]
4321; GFX10-DL-NEXT:    s_load_dword s0, s[14:15], 0x0
4322; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
4323; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v2, 0xc06010c
4324; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
4325; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc0c0c01
4326; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
4327; GFX10-DL-NEXT:    v_or_b32_e32 v0, v0, v1
4328; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc020101
4329; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
4330; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s0
4331; GFX10-DL-NEXT:    global_store_dword v2, v0, s[14:15]
4332; GFX10-DL-NEXT:    s_endpgm
4333;
4334; GFX11-DL-LABEL: udot4_acc32_3src_3ele_src0:
4335; GFX11-DL:       ; %bb.0: ; %entry
4336; GFX11-DL-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
4337; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
4338; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4339; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4340; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
4341; GFX11-DL-NEXT:    s_clause 0x2
4342; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
4343; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[0:1]
4344; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
4345; GFX11-DL-NEXT:    s_load_b32 s0, s[6:7], 0x0
4346; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
4347; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v2, 0xc06010c
4348; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
4349; GFX11-DL-NEXT:    v_perm_b32 v2, v0, v0, 0xc0c0c01
4350; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020101
4351; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
4352; GFX11-DL-NEXT:    v_or_b32_e32 v1, v1, v2
4353; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
4354; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
4355; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, s0
4356; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
4357; GFX11-DL-NEXT:    s_endpgm
4358                                       ptr addrspace(1) %src2,
4359                                       ptr addrspace(1) %src3,
4360                                       ptr addrspace(1) nocapture %dst) {
4361entry:
; Each work-item loads the <4 x i8> at index workitem.id.x from each source.
4362  %idx = call i32 @llvm.amdgcn.workitem.id.x()
4363  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
4364  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
4365  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
4366  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
4367  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
4368  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
4369
; Product 1: element 1 of %vec2 multiplied by itself. Despite the "e0" in the
; value names, the extract index used here is 1.
4370  %v2e0 = extractelement <4 x i8> %vec2, i64 1
4371  %cv2e0 = zext i8 %v2e0 to i32
4372  %mul1 = mul nuw nsw i32 %cv2e0, %cv2e0
4373
; Product 2: element 1 of %vec1 times element 1 of %vec2.
4374  %v1e1 = extractelement <4 x i8> %vec1, i64 1
4375  %cv1e1 = zext i8 %v1e1 to i32
4376  %v2e1 = extractelement <4 x i8> %vec2, i64 1
4377  %cv2e1 = zext i8 %v2e1 to i32
4378  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
4379
; Product 3: element 2 of %vec2 times element 2 of %vec3.
4380  %v3e2 = extractelement <4 x i8> %vec3, i64 2
4381  %cv3e2 = zext i8 %v3e2 to i32
4382  %v2e2 = extractelement <4 x i8> %vec2, i64 2
4383  %cv2e2 = zext i8 %v2e2 to i32
4384  %mul3 = mul nuw nsw i32 %cv2e2, %cv3e2
4385
4386
; Add the three products onto the i32 currently stored at %dst and write the
; sum back to the same address.
4387  %acc = load i32, ptr addrspace(1) %dst, align 4
4388  %mad1 = add i32 %mul1, %acc
4389  %mad2 = add i32 %mad1, %mul2
4390  %mad3 = add i32 %mad2, %mul3
4391
4392  store i32 %mad3, ptr addrspace(1) %dst, align 4
4393  ret void
4394}
4395
; Four-product unsigned i8 multiply-accumulate in which every product uses
; elements 0 and 1 of the same vector, one product per source:
;   dst = dst + vec1[0]*vec1[1] + vec2[0]*vec2[1]
;             + vec3[0]*vec3[1] + vec4[0]*vec4[1]
; For the dot-instruction targets (gfx906, gfx1011, gfx1012, gfx1100) the
; expected code packs the element-0 bytes of the four loads into one dword and
; the element-1 bytes into another (v_perm_b32 + v_or_b32), then issues a
; single v_dot4_u32_u8. For gfx700 and gfx803 the expected code is a chain of
; v_mad_u32_u24; for gfx900 it is v_mul_u32_u24_sdwa followed by v_add3_u32.
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
4396define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
4397; GFX7-LABEL: udot4_4src:
4398; GFX7:       ; %bb.0: ; %entry
4399; GFX7-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x9
4400; GFX7-NEXT:    s_mov_b32 s3, 0xf000
4401; GFX7-NEXT:    s_mov_b32 s18, 0
4402; GFX7-NEXT:    s_mov_b32 s19, s3
4403; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4404; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4405; GFX7-NEXT:    s_mov_b64 s[16:17], s[8:9]
4406; GFX7-NEXT:    v_mov_b32_e32 v1, 0
4407; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[16:19], 0 addr64
4408; GFX7-NEXT:    s_mov_b64 s[16:17], s[10:11]
4409; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[16:19], 0 addr64
4410; GFX7-NEXT:    s_mov_b64 s[16:17], s[12:13]
4411; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64
4412; GFX7-NEXT:    s_mov_b64 s[16:17], s[14:15]
4413; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[16:19], 0 addr64
4414; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x11
4415; GFX7-NEXT:    s_mov_b32 s2, -1
4416; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4417; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
4418; GFX7-NEXT:    s_waitcnt vmcnt(3)
4419; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
4420; GFX7-NEXT:    v_bfe_u32 v2, v2, 8, 8
4421; GFX7-NEXT:    s_waitcnt vmcnt(2)
4422; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v3
4423; GFX7-NEXT:    v_bfe_u32 v3, v3, 8, 8
4424; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4425; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v2, s4
4426; GFX7-NEXT:    s_waitcnt vmcnt(1)
4427; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v4
4428; GFX7-NEXT:    v_bfe_u32 v4, v4, 8, 8
4429; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v3, v1
4430; GFX7-NEXT:    s_waitcnt vmcnt(0)
4431; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v0
4432; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
4433; GFX7-NEXT:    v_mad_u32_u24 v1, v2, v4, v1
4434; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v0, v1
4435; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4436; GFX7-NEXT:    s_endpgm
4437;
4438; GFX8-LABEL: udot4_4src:
4439; GFX8:       ; %bb.0: ; %entry
4440; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
4441; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
4442; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
4443; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4444; GFX8-NEXT:    v_mov_b32_e32 v1, s9
4445; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s8, v2
4446; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4447; GFX8-NEXT:    flat_load_dword v3, v[0:1]
4448; GFX8-NEXT:    v_mov_b32_e32 v1, s11
4449; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s10, v2
4450; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4451; GFX8-NEXT:    flat_load_dword v4, v[0:1]
4452; GFX8-NEXT:    v_mov_b32_e32 v1, s13
4453; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s12, v2
4454; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4455; GFX8-NEXT:    flat_load_dword v5, v[0:1]
4456; GFX8-NEXT:    v_mov_b32_e32 v1, s15
4457; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s14, v2
4458; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4459; GFX8-NEXT:    flat_load_dword v0, v[0:1]
4460; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
4461; GFX8-NEXT:    s_waitcnt vmcnt(3)
4462; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
4463; GFX8-NEXT:    v_bfe_u32 v2, v3, 8, 8
4464; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4465; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s2
4466; GFX8-NEXT:    s_waitcnt vmcnt(2)
4467; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v4
4468; GFX8-NEXT:    v_bfe_u32 v4, v4, 8, 8
4469; GFX8-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
4470; GFX8-NEXT:    s_waitcnt vmcnt(1)
4471; GFX8-NEXT:    v_and_b32_e32 v6, 0xff, v5
4472; GFX8-NEXT:    v_bfe_u32 v5, v5, 8, 8
4473; GFX8-NEXT:    v_mad_u32_u24 v1, v6, v5, v1
4474; GFX8-NEXT:    s_waitcnt vmcnt(0)
4475; GFX8-NEXT:    v_and_b32_e32 v7, 0xff, v0
4476; GFX8-NEXT:    v_bfe_u32 v0, v0, 8, 8
4477; GFX8-NEXT:    v_mad_u32_u24 v2, v7, v0, v1
4478; GFX8-NEXT:    v_mov_b32_e32 v0, s0
4479; GFX8-NEXT:    v_mov_b32_e32 v1, s1
4480; GFX8-NEXT:    flat_store_dword v[0:1], v2
4481; GFX8-NEXT:    s_endpgm
4482;
4483; GFX9-NODL-LABEL: udot4_4src:
4484; GFX9-NODL:       ; %bb.0: ; %entry
4485; GFX9-NODL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
4486; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4487; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
4488; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
4489; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[8:9]
4490; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[10:11]
4491; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[12:13]
4492; GFX9-NODL-NEXT:    global_load_dword v4, v0, s[14:15]
4493; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
4494; GFX9-NODL-NEXT:    s_load_dword s2, s[0:1], 0x0
4495; GFX9-NODL-NEXT:    s_waitcnt vmcnt(3)
4496; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
4497; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
4498; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
4499; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
4500; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
4501; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
4502; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
4503; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
4504; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s2, v2
4505; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, v4
4506; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
4507; GFX9-NODL-NEXT:    s_endpgm
4508;
4509; GFX9-DL-LABEL: udot4_4src:
4510; GFX9-DL:       ; %bb.0: ; %entry
4511; GFX9-DL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
4512; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4513; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
4514; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc0c0501
4515; GFX9-DL-NEXT:    s_mov_b32 s3, 0x5010c0c
4516; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
4517; GFX9-DL-NEXT:    global_load_dword v1, v0, s[8:9]
4518; GFX9-DL-NEXT:    global_load_dword v2, v0, s[10:11]
4519; GFX9-DL-NEXT:    global_load_dword v3, v0, s[12:13]
4520; GFX9-DL-NEXT:    global_load_dword v4, v0, s[14:15]
4521; GFX9-DL-NEXT:    s_mov_b32 s4, 0xc0c0400
4522; GFX9-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
4523; GFX9-DL-NEXT:    s_mov_b32 s5, 0x4000c0c
4524; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
4525; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
4526; GFX9-DL-NEXT:    v_perm_b32 v5, v2, v1, s2
4527; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s4
4528; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
4529; GFX9-DL-NEXT:    v_perm_b32 v6, v4, v3, s3
4530; GFX9-DL-NEXT:    v_perm_b32 v2, v4, v3, s5
4531; GFX9-DL-NEXT:    v_or_b32_e32 v3, v6, v5
4532; GFX9-DL-NEXT:    v_or_b32_e32 v1, v2, v1
4533; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
4534; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v3, s6
4535; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
4536; GFX9-DL-NEXT:    s_endpgm
4537;
4538; GFX10-DL-LABEL: udot4_4src:
4539; GFX10-DL:       ; %bb.0: ; %entry
4540; GFX10-DL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
4541; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4542; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
4543; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
4544; GFX10-DL-NEXT:    s_clause 0x3
4545; GFX10-DL-NEXT:    global_load_dword v1, v0, s[8:9]
4546; GFX10-DL-NEXT:    global_load_dword v2, v0, s[10:11]
4547; GFX10-DL-NEXT:    global_load_dword v3, v0, s[12:13]
4548; GFX10-DL-NEXT:    global_load_dword v4, v0, s[14:15]
4549; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
4550; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
4551; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0xc0c0501
4552; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v1, 0xc0c0400
4553; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
4554; GFX10-DL-NEXT:    v_perm_b32 v5, v4, v3, 0x5010c0c
4555; GFX10-DL-NEXT:    v_perm_b32 v2, v4, v3, 0x4000c0c
4556; GFX10-DL-NEXT:    v_or_b32_e32 v0, v5, v0
4557; GFX10-DL-NEXT:    v_or_b32_e32 v1, v2, v1
4558; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
4559; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
4560; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
4561; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
4562; GFX10-DL-NEXT:    s_endpgm
4563;
4564; GFX11-DL-LABEL: udot4_4src:
4565; GFX11-DL:       ; %bb.0: ; %entry
4566; GFX11-DL-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
4567; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
4568; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
4569; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4570; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
4571; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
4572; GFX11-DL-NEXT:    s_clause 0x3
4573; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[8:9]
4574; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[10:11]
4575; GFX11-DL-NEXT:    global_load_b32 v3, v0, s[12:13]
4576; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[14:15]
4577; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
4578; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
4579; GFX11-DL-NEXT:    v_perm_b32 v4, v2, v1, 0xc0c0501
4580; GFX11-DL-NEXT:    v_perm_b32 v1, v2, v1, 0xc0c0400
4581; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
4582; GFX11-DL-NEXT:    v_perm_b32 v5, v0, v3, 0x5010c0c
4583; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v3, 0x4000c0c
4584; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
4585; GFX11-DL-NEXT:    v_or_b32_e32 v2, v5, v4
4586; GFX11-DL-NEXT:    v_or_b32_e32 v0, v0, v1
4587; GFX11-DL-NEXT:    v_mov_b32_e32 v1, 0
4588; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
4589; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
4590; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v2, s2
4591; GFX11-DL-NEXT:    global_store_b32 v1, v0, s[0:1]
4592; GFX11-DL-NEXT:    s_endpgm
4593                                       ptr addrspace(1) %src2,
4594                                       ptr addrspace(1) %src3,
4595                                       ptr addrspace(1) %src4,
4596                                       ptr addrspace(1) nocapture %dst) {
4597entry:
4598  %idx = call i32 @llvm.amdgcn.workitem.id.x()
4599
; Each work-item loads the <4 x i8> at index workitem.id.x from each of the
; four sources.
4600  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
4601  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
4602  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
4603  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
4604  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
4605  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
4606  %gep4 = getelementptr <4 x i8>, ptr addrspace(1) %src4, i32 %idx
4607  %vec4 = load <4 x i8>, ptr addrspace(1) %gep4
4608
4609
; Product k (k = 1..4): zero-extended element 0 of %vec<k> times zero-extended
; element 1 of the same vector.
4610  %v1e0 = extractelement <4 x i8> %vec1, i64 0
4611  %cv1e0 = zext i8 %v1e0 to i32
4612  %v1e1 = extractelement <4 x i8> %vec1, i64 1
4613  %cv1e1 = zext i8 %v1e1 to i32
4614  %mul1 = mul nuw nsw i32 %cv1e0, %cv1e1
4615
4616  %v2e0 = extractelement <4 x i8> %vec2, i64 0
4617  %cv2e0 = zext i8 %v2e0 to i32
4618  %v2e1 = extractelement <4 x i8> %vec2, i64 1
4619  %cv2e1 = zext i8 %v2e1 to i32
4620  %mul2 = mul nuw nsw i32 %cv2e0, %cv2e1
4621
4622  %v3e0 = extractelement <4 x i8> %vec3, i64 0
4623  %cv3e0 = zext i8 %v3e0 to i32
4624  %v3e1 = extractelement <4 x i8> %vec3, i64 1
4625  %cv3e1 = zext i8 %v3e1 to i32
4626  %mul3 = mul nuw nsw i32 %cv3e0, %cv3e1
4627
4628  %v4e0 = extractelement <4 x i8> %vec4, i64 0
4629  %cv4e0 = zext i8 %v4e0 to i32
4630  %v4e1 = extractelement <4 x i8> %vec4, i64 1
4631  %cv4e1 = zext i8 %v4e1 to i32
4632  %mul4 = mul nuw nsw i32 %cv4e0, %cv4e1
4633
4634
; Add the four products onto the i32 currently stored at %dst and write the
; sum back to the same address.
4635  %acc = load i32, ptr addrspace(1) %dst, align 4
4636  %mad1 = add i32 %mul1, %acc
4637  %mad2 = add i32 %mad1, %mul2
4638  %mad3 = add i32 %mad2, %mul3
4639  %mad4 = add i32 %mad3, %mul4
4640
4641  store i32 %mad4, ptr addrspace(1) %dst, align 4
4642  ret void
4643}
4644
4645define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
4646; GFX7-LABEL: udot4_acc32_multi:
4647; GFX7:       ; %bb.0: ; %entry
4648; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
4649; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
4650; GFX7-NEXT:    s_mov_b32 s3, 0xf000
4651; GFX7-NEXT:    s_mov_b32 s6, 0
4652; GFX7-NEXT:    s_mov_b32 s7, s3
4653; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4654; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
4655; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
4656; GFX7-NEXT:    v_mov_b32_e32 v1, 0
4657; GFX7-NEXT:    s_mov_b64 s[8:9], s[10:11]
4658; GFX7-NEXT:    s_mov_b64 s[10:11], s[6:7]
4659; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
4660; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
4661; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
4662; GFX7-NEXT:    s_mov_b32 s2, -1
4663; GFX7-NEXT:    s_waitcnt vmcnt(1)
4664; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
4665; GFX7-NEXT:    s_waitcnt vmcnt(0)
4666; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
4667; GFX7-NEXT:    v_bfe_u32 v7, v2, 16, 8
4668; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
4669; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4670; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v4, s4
4671; GFX7-NEXT:    v_and_b32_e32 v9, 0xff, v3
4672; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v8, v1
4673; GFX7-NEXT:    v_bfe_u32 v11, v3, 16, 8
4674; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v4, v1
4675; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 8
4676; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
4677; GFX7-NEXT:    v_mad_u32_u24 v1, v11, v8, v1
4678; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
4679; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
4680; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v6, v1
4681; GFX7-NEXT:    v_bfe_u32 v10, v3, 8, 8
4682; GFX7-NEXT:    v_mad_u32_u24 v1, v2, v0, v1
4683; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
4684; GFX7-NEXT:    v_mad_u32_u24 v1, v10, v6, v1
4685; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v0, v1
4686; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4687; GFX7-NEXT:    s_endpgm
4688;
4689; GFX8-LABEL: udot4_acc32_multi:
4690; GFX8:       ; %bb.0: ; %entry
4691; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4692; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
4693; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
4694; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4695; GFX8-NEXT:    v_mov_b32_e32 v1, s1
4696; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
4697; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4698; GFX8-NEXT:    v_mov_b32_e32 v3, s3
4699; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
4700; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
4701; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
4702; GFX8-NEXT:    flat_load_dword v2, v[2:3]
4703; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
4704; GFX8-NEXT:    s_waitcnt vmcnt(1)
4705; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v0
4706; GFX8-NEXT:    s_waitcnt vmcnt(0)
4707; GFX8-NEXT:    v_and_b32_e32 v4, 0xff, v2
4708; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 8
4709; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 8
4710; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4711; GFX8-NEXT:    v_mad_u32_u24 v3, v3, v4, s0
4712; GFX8-NEXT:    v_and_b32_e32 v9, 0xff, v1
4713; GFX8-NEXT:    v_mad_u32_u24 v3, v7, v8, v3
4714; GFX8-NEXT:    v_bfe_u32 v11, v1, 16, 8
4715; GFX8-NEXT:    v_mad_u32_u24 v3, v9, v4, v3
4716; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
4717; GFX8-NEXT:    v_bfe_u32 v6, v2, 8, 8
4718; GFX8-NEXT:    v_mad_u32_u24 v3, v11, v8, v3
4719; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
4720; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
4721; GFX8-NEXT:    v_mad_u32_u24 v3, v5, v6, v3
4722; GFX8-NEXT:    v_bfe_u32 v10, v1, 8, 8
4723; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v2, v3
4724; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
4725; GFX8-NEXT:    v_mad_u32_u24 v0, v10, v6, v0
4726; GFX8-NEXT:    v_mad_u32_u24 v2, v1, v2, v0
4727; GFX8-NEXT:    v_mov_b32_e32 v0, s4
4728; GFX8-NEXT:    v_mov_b32_e32 v1, s5
4729; GFX8-NEXT:    flat_store_dword v[0:1], v2
4730; GFX8-NEXT:    s_endpgm
4731;
4732; GFX9-NODL-LABEL: udot4_acc32_multi:
4733; GFX9-NODL:       ; %bb.0: ; %entry
4734; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4735; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
4736; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
4737; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
4738; GFX9-NODL-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
4739; GFX9-NODL-NEXT:    global_load_dword v3, v2, s[2:3]
4740; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
4741; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, 0
4742; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
4743; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
4744; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
4745; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
4746; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
4747; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v7, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
4748; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v8, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
4749; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v9, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
4750; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
4751; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
4752; GFX9-NODL-NEXT:    v_add3_u32 v3, v4, s0, v6
4753; GFX9-NODL-NEXT:    v_add3_u32 v3, v3, v7, v9
4754; GFX9-NODL-NEXT:    v_add3_u32 v0, v5, v3, v0
4755; GFX9-NODL-NEXT:    v_add3_u32 v0, v0, v8, v1
4756; GFX9-NODL-NEXT:    global_store_dword v2, v0, s[6:7]
4757; GFX9-NODL-NEXT:    s_endpgm
4758;
4759; GFX9-DL-LABEL: udot4_acc32_multi:
4760; GFX9-DL:       ; %bb.0: ; %entry
4761; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4762; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
4763; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
4764; GFX9-DL-NEXT:    s_mov_b32 s4, 0x3010301
4765; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
4766; GFX9-DL-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
4767; GFX9-DL-NEXT:    global_load_dword v3, v2, s[2:3]
4768; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
4769; GFX9-DL-NEXT:    s_mov_b32 s0, 0x6040200
4770; GFX9-DL-NEXT:    s_mov_b32 s1, 0x2000200
4771; GFX9-DL-NEXT:    s_mov_b32 s2, 0x7050301
4772; GFX9-DL-NEXT:    v_mov_b32_e32 v2, 0
4773; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
4774; GFX9-DL-NEXT:    v_perm_b32 v4, v1, v0, s0
4775; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
4776; GFX9-DL-NEXT:    v_perm_b32 v5, v3, v3, s1
4777; GFX9-DL-NEXT:    v_perm_b32 v0, v1, v0, s2
4778; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
4779; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v4, v5, s3
4780; GFX9-DL-NEXT:    v_perm_b32 v3, v3, v3, s4
4781; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v0, v3, v1
4782; GFX9-DL-NEXT:    global_store_dword v2, v0, s[6:7]
4783; GFX9-DL-NEXT:    s_endpgm
4784;
4785; GFX10-DL-LABEL: udot4_acc32_multi:
4786; GFX10-DL:       ; %bb.0: ; %entry
4787; GFX10-DL-NEXT:    s_clause 0x1
4788; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4789; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
4790; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
4791; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
4792; GFX10-DL-NEXT:    s_clause 0x1
4793; GFX10-DL-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
4794; GFX10-DL-NEXT:    global_load_dword v3, v2, s[2:3]
4795; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
4796; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
4797; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
4798; GFX10-DL-NEXT:    v_perm_b32 v2, v1, v0, 0x6040200
4799; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
4800; GFX10-DL-NEXT:    v_perm_b32 v4, v3, v3, 0x2000200
4801; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v0, 0x7050301
4802; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
4803; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v2, v4, s0
4804; GFX10-DL-NEXT:    v_perm_b32 v2, v3, v3, 0x3010301
4805; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
4806; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v0, v2, v1
4807; GFX10-DL-NEXT:    global_store_dword v3, v0, s[6:7]
4808; GFX10-DL-NEXT:    s_endpgm
4809;
4810; GFX11-DL-LABEL: udot4_acc32_multi:
4811; GFX11-DL:       ; %bb.0: ; %entry
4812; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
4813; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
4814; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
4815; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4816; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
4817; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
4818; GFX11-DL-NEXT:    s_clause 0x1
4819; GFX11-DL-NEXT:    global_load_b64 v[0:1], v2, s[0:1]
4820; GFX11-DL-NEXT:    global_load_b32 v2, v2, s[2:3]
4821; GFX11-DL-NEXT:    s_load_b32 s0, s[4:5], 0x0
4822; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
4823; GFX11-DL-NEXT:    v_perm_b32 v3, v1, v0, 0x6040200
4824; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
4825; GFX11-DL-NEXT:    v_perm_b32 v4, v2, v2, 0x2000200
4826; GFX11-DL-NEXT:    v_perm_b32 v0, v1, v0, 0x7050301
4827; GFX11-DL-NEXT:    v_perm_b32 v2, v2, v2, 0x3010301
4828; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
4829; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4830; GFX11-DL-NEXT:    v_dot4_u32_u8 v1, v3, v4, s0
4831; GFX11-DL-NEXT:    v_mov_b32_e32 v3, 0
4832; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v2, v1
4833; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[4:5]
4834; GFX11-DL-NEXT:    s_endpgm
4835                                       ptr addrspace(1) %src2,
4836                                       ptr addrspace(1) nocapture %dst) {
4837entry:
4838  %idx = call i32 @llvm.amdgcn.workitem.id.x()
4839  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
4840  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
4841  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
4842  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
4843
4844  %v1e0 = extractelement <8 x i8> %vec1, i64 0
4845  %cv1e0 = zext i8 %v1e0 to i32
4846  %v2e0 = extractelement <8 x i8> %vec2, i64 0
4847  %cv2e0 = zext i8 %v2e0 to i32
4848  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
4849
4850  %v1e1 = extractelement <8 x i8> %vec1, i64 1
4851  %cv1e1 = zext i8 %v1e1 to i32
4852  %v2e1 = extractelement <8 x i8> %vec2, i64 1
4853  %cv2e1 = zext i8 %v2e1 to i32
4854  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
4855
4856  %v1e2 = extractelement <8 x i8> %vec1, i64 2
4857  %cv1e2 = zext i8 %v1e2 to i32
4858  %v2e2 = extractelement <8 x i8> %vec2, i64 2
4859  %cv2e2 = zext i8 %v2e2 to i32
4860  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
4861
4862  %v1e3 = extractelement <8 x i8> %vec1, i64 3
4863  %cv1e3 = zext i8 %v1e3 to i32
4864  %v2e3 = extractelement <8 x i8> %vec2, i64 3
4865  %cv2e3 = zext i8 %v2e3 to i32
4866  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
4867
4868  %v1e4 = extractelement <8 x i8> %vec1, i64 4
4869  %cv1e4 = zext i8 %v1e4 to i32
4870  %v2e4 = extractelement <8 x i8> %vec2, i64 4
4871  %cv2e4 = zext i8 %v2e4 to i32
4872  %mul5 = mul nuw nsw i32 %cv1e4, %cv2e0
4873
4874  %v1e5 = extractelement <8 x i8> %vec1, i64 5
4875  %cv1e5 = zext i8 %v1e5 to i32
4876  %v2e5 = extractelement <8 x i8> %vec2, i64 5
4877  %cv2e5 = zext i8 %v2e5 to i32
4878  %mul6 = mul nuw nsw i32 %cv1e5, %cv2e1
4879
4880  %v1e6 = extractelement <8 x i8> %vec1, i64 6
4881  %cv1e6 = zext i8 %v1e6 to i32
4882  %v2e6 = extractelement <8 x i8> %vec2, i64 6
4883  %cv2e6 = zext i8 %v2e6 to i32
4884  %mul7 = mul nuw nsw i32 %cv1e6, %cv2e2
4885
4886  %v1e7 = extractelement <8 x i8> %vec1, i64 7
4887  %cv1e7 = zext i8 %v1e7 to i32
4888  %v2e7 = extractelement <8 x i8> %vec2, i64 7
4889  %cv2e7 = zext i8 %v2e7 to i32
4890  %mul8 = mul nuw nsw i32 %cv1e7, %cv2e3
4891
4892  %acc = load i32, ptr addrspace(1) %dst, align 4
4893  %mad11 = add i32 %mul1, %acc
4894  %mad21 = add i32 %mad11, %mul3
4895  %mad31 = add i32 %mad21, %mul5
4896  %mad41 = add i32 %mad31, %mul7
4897  %mad12 = add i32 %mul2, %mad41
4898  %mad22 = add i32 %mad12, %mul4
4899  %mad32 = add i32 %mad22, %mul6
4900  %mad42 = add i32 %mad32, %mul8
4901
4902  store i32 %mad42, ptr addrspace(1) %dst, align 4
4903  ret void
4904}
4905
; idot4_acc32_hilo: each workitem loads one <8 x i8> from %src1 and one from
; %src2 (both indexed by workitem.id.x), zero-extends the selected bytes to
; i32, multiplies elements 4..7 of the first vector with elements 0..3 of the
; second, and stores the sum of the four products to %dst. The sum starts from
; a literal 0, so the previous contents of %dst are never read.
; Despite the "idot4" name the inputs are zext'ed: the assertions below show
; the gfx906, gfx1011/gfx1012 and gfx1100 runs selecting a single unsigned
; v_dot4_u32_u8 (with the %src1 load offset by 4 bytes), while the gfx700,
; gfx803 and gfx900 runs expand to v_mul_u32_u24 / v_mad_u32_u24 sequences.
4906define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
4907; GFX7-LABEL: idot4_acc32_hilo:
4908; GFX7:       ; %bb.0: ; %entry
4909; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
4910; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
4911; GFX7-NEXT:    s_mov_b32 s3, 0xf000
4912; GFX7-NEXT:    s_mov_b32 s6, 0
4913; GFX7-NEXT:    s_mov_b32 s7, s3
4914; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
4915; GFX7-NEXT:    v_mov_b32_e32 v1, 0
4916; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4917; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
4918; GFX7-NEXT:    s_mov_b64 s[10:11], s[6:7]
4919; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
4920; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
4921; GFX7-NEXT:    s_mov_b32 s2, -1
4922; GFX7-NEXT:    s_waitcnt vmcnt(1)
4923; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
4924; GFX7-NEXT:    s_waitcnt vmcnt(0)
4925; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
4926; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
4927; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v0
4928; GFX7-NEXT:    v_mul_u32_u24_e32 v4, v4, v5
4929; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 8
4930; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
4931; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, v4
4932; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
4933; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
4934; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v7, v1
4935; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
4936; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4937; GFX7-NEXT:    s_endpgm
4938;
4939; GFX8-LABEL: idot4_acc32_hilo:
4940; GFX8:       ; %bb.0: ; %entry
4941; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4942; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
4943; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
4944; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4945; GFX8-NEXT:    v_mov_b32_e32 v1, s1
4946; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
4947; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
4948; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4949; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
4950; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4951; GFX8-NEXT:    flat_load_dword v4, v[0:1]
4952; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v2
4953; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
4954; GFX8-NEXT:    flat_load_dword v2, v[0:1]
4955; GFX8-NEXT:    v_mov_b32_e32 v0, s4
4956; GFX8-NEXT:    v_mov_b32_e32 v1, s5
4957; GFX8-NEXT:    s_waitcnt vmcnt(1)
4958; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v4
4959; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 8
4960; GFX8-NEXT:    s_waitcnt vmcnt(0)
4961; GFX8-NEXT:    v_and_b32_e32 v6, 0xff, v2
4962; GFX8-NEXT:    v_mul_u32_u24_sdwa v7, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
4963; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 8
4964; GFX8-NEXT:    v_mad_u32_u24 v3, v6, v3, v7
4965; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
4966; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v4
4967; GFX8-NEXT:    v_mad_u32_u24 v3, v8, v5, v3
4968; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v4, v3
4969; GFX8-NEXT:    flat_store_dword v[0:1], v2
4970; GFX8-NEXT:    s_endpgm
4971;
4972; GFX9-NODL-LABEL: idot4_acc32_hilo:
4973; GFX9-NODL:       ; %bb.0: ; %entry
4974; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4975; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
4976; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
4977; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
4978; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1] offset:4
4979; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
4980; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
4981; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
4982; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
4983; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
4984; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v2
4985; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
4986; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
4987; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
4988; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, v4, v5
4989; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
4990; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
4991; GFX9-NODL-NEXT:    s_endpgm
4992;
4993; GFX9-DL-LABEL: idot4_acc32_hilo:
4994; GFX9-DL:       ; %bb.0: ; %entry
4995; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4996; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
4997; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
4998; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
4999; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1] offset:4
5000; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
5001; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
5002; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
5003; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, 0
5004; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
5005; GFX9-DL-NEXT:    s_endpgm
5006;
5007; GFX10-DL-LABEL: idot4_acc32_hilo:
5008; GFX10-DL:       ; %bb.0: ; %entry
5009; GFX10-DL-NEXT:    s_clause 0x1
5010; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5011; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5012; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5013; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
5014; GFX10-DL-NEXT:    s_clause 0x1
5015; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1] offset:4
5016; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
5017; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
5018; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
5019; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, 0
5020; GFX10-DL-NEXT:    global_store_dword v0, v1, s[6:7]
5021; GFX10-DL-NEXT:    s_endpgm
5022;
5023; GFX11-DL-LABEL: idot4_acc32_hilo:
5024; GFX11-DL:       ; %bb.0: ; %entry
5025; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
5026; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
5027; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
5028; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
5029; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
5030; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5031; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
5032; GFX11-DL-NEXT:    s_clause 0x1
5033; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1] offset:4
5034; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
5035; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
5036; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, 0
5037; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[4:5]
5038; GFX11-DL-NEXT:    s_endpgm
5039                                       ptr addrspace(1) %src2,
5040                                       ptr addrspace(1) nocapture %dst) {
5041entry:
  ; Per-workitem addressing: one 8-byte vector from each source buffer.
5042  %idx = call i32 @llvm.amdgcn.workitem.id.x()
5043  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
5044  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
5045  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
5046  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
5047
  ; Lane k of the dot product pairs %vec1[4 + k] with %vec2[k], k = 0..3.
5048  %v1e0 = extractelement <8 x i8> %vec1, i64 4
5049  %cv1e0 = zext i8 %v1e0 to i32
5050  %v2e0 = extractelement <8 x i8> %vec2, i64 0
5051  %cv2e0 = zext i8 %v2e0 to i32
5052  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
5053
5054  %v1e1 = extractelement <8 x i8> %vec1, i64 5
5055  %cv1e1 = zext i8 %v1e1 to i32
5056  %v2e1 = extractelement <8 x i8> %vec2, i64 1
5057  %cv2e1 = zext i8 %v2e1 to i32
5058  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
5059
5060  %v1e2 = extractelement <8 x i8> %vec1, i64 6
5061  %cv1e2 = zext i8 %v1e2 to i32
5062  %v2e2 = extractelement <8 x i8> %vec2, i64 2
5063  %cv2e2 = zext i8 %v2e2 to i32
5064  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
5065
5066  %v1e3 = extractelement <8 x i8> %vec1, i64 7
5067  %cv1e3 = zext i8 %v1e3 to i32
5068  %v2e3 = extractelement <8 x i8> %vec2, i64 3
5069  %cv2e3 = zext i8 %v2e3 to i32
5070  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
5071
  ; Chain the four products onto a constant-zero initial addend.
5072  %add1 = add i32 %mul1, 0
5073  %add2 = add i32 %add1, %mul2
5074  %add3 = add i32 %add2, %mul3
5075  %add4 = add i32 %add3, %mul4
5076  store i32 %add4, ptr addrspace(1) %dst, align 4
5077  ret void
5078}
5079
; idot4_acc32_lohi: each workitem loads one <8 x i8> from %src1 and one from
; %src2 (both indexed by workitem.id.x), zero-extends the selected bytes to
; i32, multiplies elements 0..3 of the first vector with elements 7..4 of the
; second (i.e. in reversed order), and stores the sum of the four products to
; %dst. The sum starts from a literal 0, so %dst is never read.
; The assertions below show the gfx906, gfx1011/gfx1012 and gfx1100 runs
; permuting both operands with v_perm_b32 before one unsigned v_dot4_u32_u8
; (the %src2 load is offset by 4 bytes); the gfx700, gfx803 and gfx900 runs
; expand to v_mul_u32_u24 / v_mad_u32_u24 sequences instead.
5080define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
5081; GFX7-LABEL: idot4_acc32_lohi:
5082; GFX7:       ; %bb.0: ; %entry
5083; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
5084; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
5085; GFX7-NEXT:    s_mov_b32 s3, 0xf000
5086; GFX7-NEXT:    s_mov_b32 s6, 0
5087; GFX7-NEXT:    s_mov_b32 s7, s3
5088; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5089; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
5090; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5091; GFX7-NEXT:    v_mov_b32_e32 v1, 0
5092; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
5093; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
5094; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4
5095; GFX7-NEXT:    s_mov_b32 s2, -1
5096; GFX7-NEXT:    s_waitcnt vmcnt(1)
5097; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
5098; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
5099; GFX7-NEXT:    s_waitcnt vmcnt(0)
5100; GFX7-NEXT:    v_bfe_u32 v6, v0, 16, 8
5101; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
5102; GFX7-NEXT:    v_mul_u32_u24_e32 v3, v3, v6
5103; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
5104; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
5105; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, v3
5106; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
5107; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
5108; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
5109; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
5110; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5111; GFX7-NEXT:    s_endpgm
5112;
5113; GFX8-LABEL: idot4_acc32_lohi:
5114; GFX8:       ; %bb.0: ; %entry
5115; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5116; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
5117; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
5118; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5119; GFX8-NEXT:    v_mov_b32_e32 v1, s1
5120; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
5121; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5122; GFX8-NEXT:    v_mov_b32_e32 v3, s3
5123; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
5124; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
5125; GFX8-NEXT:    flat_load_dword v4, v[0:1]
5126; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v2
5127; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
5128; GFX8-NEXT:    flat_load_dword v2, v[0:1]
5129; GFX8-NEXT:    v_mov_b32_e32 v0, s4
5130; GFX8-NEXT:    v_mov_b32_e32 v1, s5
5131; GFX8-NEXT:    s_waitcnt vmcnt(1)
5132; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v4
5133; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 8
5134; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
5135; GFX8-NEXT:    s_waitcnt vmcnt(0)
5136; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
5137; GFX8-NEXT:    v_mul_u32_u24_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_2
5138; GFX8-NEXT:    v_bfe_u32 v8, v2, 8, 8
5139; GFX8-NEXT:    v_mad_u32_u24 v3, v3, v7, v4
5140; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v2
5141; GFX8-NEXT:    v_mad_u32_u24 v3, v5, v8, v3
5142; GFX8-NEXT:    v_mad_u32_u24 v2, v6, v2, v3
5143; GFX8-NEXT:    flat_store_dword v[0:1], v2
5144; GFX8-NEXT:    s_endpgm
5145;
5146; GFX9-NODL-LABEL: idot4_acc32_lohi:
5147; GFX9-NODL:       ; %bb.0: ; %entry
5148; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5149; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5150; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5151; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
5152; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
5153; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3] offset:4
5154; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
5155; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
5156; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
5157; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
5158; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
5159; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_2
5160; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
5161; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_0
5162; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, v4, v5
5163; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
5164; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
5165; GFX9-NODL-NEXT:    s_endpgm
5166;
5167; GFX9-DL-LABEL: idot4_acc32_lohi:
5168; GFX9-DL:       ; %bb.0: ; %entry
5169; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5170; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5171; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5172; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
5173; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3] offset:4
5174; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
5175; GFX9-DL-NEXT:    s_mov_b32 s0, 0x10302
5176; GFX9-DL-NEXT:    s_mov_b32 s1, 0x3020001
5177; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
5178; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
5179; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s0
5180; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
5181; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
5182; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, 0
5183; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
5184; GFX9-DL-NEXT:    s_endpgm
5185;
5186; GFX10-DL-LABEL: idot4_acc32_lohi:
5187; GFX10-DL:       ; %bb.0: ; %entry
5188; GFX10-DL-NEXT:    s_clause 0x1
5189; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5190; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5191; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5192; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
5193; GFX10-DL-NEXT:    s_clause 0x1
5194; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3] offset:4
5195; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
5196; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
5197; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0x10302
5198; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
5199; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0x3020001
5200; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
5201; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, 0
5202; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
5203; GFX10-DL-NEXT:    s_endpgm
5204;
5205; GFX11-DL-LABEL: idot4_acc32_lohi:
5206; GFX11-DL:       ; %bb.0: ; %entry
5207; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
5208; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
5209; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
5210; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
5211; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
5212; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5213; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
5214; GFX11-DL-NEXT:    s_clause 0x1
5215; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3] offset:4
5216; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
5217; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
5218; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0x10302
5219; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
5220; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0x3020001
5221; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5222; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, 0
5223; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[4:5]
5224; GFX11-DL-NEXT:    s_endpgm
5225                                       ptr addrspace(1) %src2,
5226                                       ptr addrspace(1) nocapture %dst) {
5227entry:
  ; Per-workitem addressing: one 8-byte vector from each source buffer.
5228  %idx = call i32 @llvm.amdgcn.workitem.id.x()
5229  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
5230  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
5231  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
5232  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
5233
  ; Lane k of the dot product pairs %vec1[k] with %vec2[7 - k], k = 0..3.
5234  %v1e0 = extractelement <8 x i8> %vec1, i64 0
5235  %cv1e0 = zext i8 %v1e0 to i32
5236  %v2e0 = extractelement <8 x i8> %vec2, i64 7
5237  %cv2e0 = zext i8 %v2e0 to i32
5238  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
5239
5240  %v1e1 = extractelement <8 x i8> %vec1, i64 1
5241  %cv1e1 = zext i8 %v1e1 to i32
5242  %v2e1 = extractelement <8 x i8> %vec2, i64 6
5243  %cv2e1 = zext i8 %v2e1 to i32
5244  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
5245
5246  %v1e2 = extractelement <8 x i8> %vec1, i64 2
5247  %cv1e2 = zext i8 %v1e2 to i32
5248  %v2e2 = extractelement <8 x i8> %vec2, i64 5
5249  %cv2e2 = zext i8 %v2e2 to i32
5250  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
5251
5252  %v1e3 = extractelement <8 x i8> %vec1, i64 3
5253  %cv1e3 = zext i8 %v1e3 to i32
5254  %v2e3 = extractelement <8 x i8> %vec2, i64 4
5255  %cv2e3 = zext i8 %v2e3 to i32
5256  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
5257
  ; Chain the four products onto a constant-zero initial addend.
5258  %add1 = add i32 %mul1, 0
5259  %add2 = add i32 %add1, %mul2
5260  %add3 = add i32 %add2, %mul3
5261  %add4 = add i32 %add3, %mul4
5262  store i32 %add4, ptr addrspace(1) %dst, align 4
5263  ret void
5264}
5265
; idot4_acc32_hihi: each workitem loads one <8 x i8> from %src1 and one from
; %src2 (both indexed by workitem.id.x) and forms a four-term dot product
; using only elements 4..7 of both vectors, paired in a shuffled order:
; (%vec1 index, %vec2 index) = (4,6), (6,4), (5,7), (7,5). Bytes are
; zero-extended to i32 before multiplying; the sum starts from a literal 0
; and is stored to %dst, which is never read.
; The assertions below show both loads offset by 4 bytes on every run. The
; gfx906, gfx1011/gfx1012 and gfx1100 runs permute both operands with
; v_perm_b32 before one unsigned v_dot4_u32_u8; the gfx700, gfx803 and gfx900
; runs expand to v_mul_u32_u24 / v_mad_u32_u24 sequences instead.
5266define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
5267; GFX7-LABEL: idot4_acc32_hihi:
5268; GFX7:       ; %bb.0: ; %entry
5269; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
5270; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
5271; GFX7-NEXT:    s_mov_b32 s3, 0xf000
5272; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5273; GFX7-NEXT:    v_mov_b32_e32 v1, 0
5274; GFX7-NEXT:    s_mov_b32 s6, 0
5275; GFX7-NEXT:    s_mov_b32 s7, s3
5276; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5277; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
5278; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:4
5279; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
5280; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4
5281; GFX7-NEXT:    s_mov_b32 s2, -1
5282; GFX7-NEXT:    s_waitcnt vmcnt(1)
5283; GFX7-NEXT:    v_bfe_u32 v3, v2, 16, 8
5284; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
5285; GFX7-NEXT:    s_waitcnt vmcnt(0)
5286; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
5287; GFX7-NEXT:    v_bfe_u32 v5, v0, 16, 8
5288; GFX7-NEXT:    v_mul_u32_u24_e32 v3, v3, v6
5289; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
5290; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
5291; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, v3
5292; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
5293; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
5294; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
5295; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
5296; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5297; GFX7-NEXT:    s_endpgm
5298;
5299; GFX8-LABEL: idot4_acc32_hihi:
5300; GFX8:       ; %bb.0: ; %entry
5301; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5302; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
5303; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5304; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5305; GFX8-NEXT:    v_mov_b32_e32 v1, s1
5306; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
5307; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5308; GFX8-NEXT:    v_mov_b32_e32 v3, s3
5309; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s2, v0
5310; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
5311; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v2
5312; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5313; GFX8-NEXT:    flat_load_dword v2, v[0:1]
5314; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v4
5315; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
5316; GFX8-NEXT:    flat_load_dword v3, v[0:1]
5317; GFX8-NEXT:    v_mov_b32_e32 v0, s4
5318; GFX8-NEXT:    v_mov_b32_e32 v1, s5
5319; GFX8-NEXT:    s_waitcnt vmcnt(1)
5320; GFX8-NEXT:    v_and_b32_e32 v4, 0xff, v2
5321; GFX8-NEXT:    v_bfe_u32 v7, v2, 8, 8
5322; GFX8-NEXT:    s_waitcnt vmcnt(0)
5323; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 8
5324; GFX8-NEXT:    v_mul_u32_u24_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
5325; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 24, v3
5326; GFX8-NEXT:    v_mad_u32_u24 v4, v4, v5, v6
5327; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
5328; GFX8-NEXT:    v_bfe_u32 v3, v3, 8, 8
5329; GFX8-NEXT:    v_mad_u32_u24 v4, v7, v8, v4
5330; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v3, v4
5331; GFX8-NEXT:    flat_store_dword v[0:1], v2
5332; GFX8-NEXT:    s_endpgm
5333;
5334; GFX9-NODL-LABEL: idot4_acc32_hihi:
5335; GFX9-NODL:       ; %bb.0: ; %entry
5336; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5337; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5338; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5339; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
5340; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1] offset:4
5341; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3] offset:4
5342; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
5343; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
5344; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
5345; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
5346; GFX9-NODL-NEXT:    v_bfe_u32 v4, v2, 16, 8
5347; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
5348; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_3
5349; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
5350; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v3, v4, v5
5351; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
5352; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
5353; GFX9-NODL-NEXT:    s_endpgm
5354;
5355; GFX9-DL-LABEL: idot4_acc32_hihi:
5356; GFX9-DL:       ; %bb.0: ; %entry
5357; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5358; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5359; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5360; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
5361; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3] offset:4
5362; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1] offset:4
5363; GFX9-DL-NEXT:    s_mov_b32 s0, 0x1030200
5364; GFX9-DL-NEXT:    s_mov_b32 s1, 0x3010002
5365; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
5366; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
5367; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s0
5368; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
5369; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
5370; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, 0
5371; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
5372; GFX9-DL-NEXT:    s_endpgm
5373;
5374; GFX10-DL-LABEL: idot4_acc32_hihi:
5375; GFX10-DL:       ; %bb.0: ; %entry
5376; GFX10-DL-NEXT:    s_clause 0x1
5377; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5378; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5379; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5380; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
5381; GFX10-DL-NEXT:    s_clause 0x1
5382; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3] offset:4
5383; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1] offset:4
5384; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
5385; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0x1030200
5386; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
5387; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0x3010002
5388; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
5389; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, 0
5390; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
5391; GFX10-DL-NEXT:    s_endpgm
5392;
5393; GFX11-DL-LABEL: idot4_acc32_hihi:
5394; GFX11-DL:       ; %bb.0: ; %entry
5395; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
5396; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
5397; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
5398; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
5399; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
5400; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5401; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
5402; GFX11-DL-NEXT:    s_clause 0x1
5403; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3] offset:4
5404; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1] offset:4
5405; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
5406; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0x1030200
5407; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
5408; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0x3010002
5409; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5410; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, 0
5411; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[4:5]
5412; GFX11-DL-NEXT:    s_endpgm
5413                                       ptr addrspace(1) %src2,
5414                                       ptr addrspace(1) nocapture %dst) {
5415entry:
  ; Per-workitem addressing: one 8-byte vector from each source buffer.
5416  %idx = call i32 @llvm.amdgcn.workitem.id.x()
5417  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
5418  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
5419  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
5420  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
5421
  ; Lane pairing (%vec1 index, %vec2 index): (4,6), (6,4), (5,7), (7,5).
5422  %v1e0 = extractelement <8 x i8> %vec1, i64 4
5423  %cv1e0 = zext i8 %v1e0 to i32
5424  %v2e0 = extractelement <8 x i8> %vec2, i64 6
5425  %cv2e0 = zext i8 %v2e0 to i32
5426  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
5427
5428  %v1e1 = extractelement <8 x i8> %vec1, i64 6
5429  %cv1e1 = zext i8 %v1e1 to i32
5430  %v2e1 = extractelement <8 x i8> %vec2, i64 4
5431  %cv2e1 = zext i8 %v2e1 to i32
5432  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
5433
5434  %v1e2 = extractelement <8 x i8> %vec1, i64 5
5435  %cv1e2 = zext i8 %v1e2 to i32
5436  %v2e2 = extractelement <8 x i8> %vec2, i64 7
5437  %cv2e2 = zext i8 %v2e2 to i32
5438  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
5439
5440  %v1e3 = extractelement <8 x i8> %vec1, i64 7
5441  %cv1e3 = zext i8 %v1e3 to i32
5442  %v2e3 = extractelement <8 x i8> %vec2, i64 5
5443  %cv2e3 = zext i8 %v2e3 to i32
5444  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
5445
  ; Chain the four products onto a constant-zero initial addend.
5446  %add1 = add i32 %mul1, 0
5447  %add2 = add i32 %add1, %mul2
5448  %add3 = add i32 %add2, %mul3
5449  %add4 = add i32 %add3, %mul4
5450  store i32 %add4, ptr addrspace(1) %dst, align 4
5451  ret void
5452}
5453
5454define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1,
5455; GFX7-LABEL: idot4_acc32_v8i8:
5456; GFX7:       ; %bb.0: ; %entry
5457; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
5458; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
5459; GFX7-NEXT:    s_mov_b32 s7, 0xf000
5460; GFX7-NEXT:    s_mov_b32 s2, 0
5461; GFX7-NEXT:    s_mov_b32 s3, s7
5462; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5463; GFX7-NEXT:    v_mov_b32_e32 v1, 0
5464; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5465; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
5466; GFX7-NEXT:    s_mov_b32 s6, -1
5467; GFX7-NEXT:    s_waitcnt vmcnt(0)
5468; GFX7-NEXT:    v_bfe_u32 v4, v0, 8, 8
5469; GFX7-NEXT:    v_bfe_u32 v5, v1, 8, 8
5470; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v0
5471; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v1
5472; GFX7-NEXT:    v_mul_u32_u24_e32 v4, v4, v5
5473; GFX7-NEXT:    v_bfe_u32 v6, v0, 16, 8
5474; GFX7-NEXT:    v_bfe_u32 v7, v1, 16, 8
5475; GFX7-NEXT:    v_mad_u32_u24 v2, v2, v3, v4
5476; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
5477; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
5478; GFX7-NEXT:    v_mad_u32_u24 v2, v6, v7, v2
5479; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v1, v2
5480; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5481; GFX7-NEXT:    s_endpgm
5482;
5483; GFX8-LABEL: idot4_acc32_v8i8:
5484; GFX8:       ; %bb.0: ; %entry
5485; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
5486; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
5487; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5488; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5489; GFX8-NEXT:    v_mov_b32_e32 v1, s1
5490; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
5491; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5492; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
5493; GFX8-NEXT:    s_waitcnt vmcnt(0)
5494; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
5495; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v1
5496; GFX8-NEXT:    v_mul_u32_u24_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
5497; GFX8-NEXT:    v_bfe_u32 v5, v0, 16, 8
5498; GFX8-NEXT:    v_bfe_u32 v6, v1, 16, 8
5499; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v3, v4
5500; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
5501; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
5502; GFX8-NEXT:    v_mad_u32_u24 v2, v5, v6, v2
5503; GFX8-NEXT:    v_mad_u32_u24 v2, v0, v1, v2
5504; GFX8-NEXT:    v_mov_b32_e32 v0, s2
5505; GFX8-NEXT:    v_mov_b32_e32 v1, s3
5506; GFX8-NEXT:    flat_store_dword v[0:1], v2
5507; GFX8-NEXT:    s_endpgm
5508;
5509; GFX9-NODL-LABEL: idot4_acc32_v8i8:
5510; GFX9-NODL:       ; %bb.0: ; %entry
5511; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
5512; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
5513; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5514; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, 0
5515; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
5516; GFX9-NODL-NEXT:    global_load_dwordx2 v[0:1], v0, s[0:1]
5517; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
5518; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v0
5519; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v1
5520; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
5521; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
5522; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
5523; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v3, v4, v5
5524; GFX9-NODL-NEXT:    v_add3_u32 v0, v1, v6, v0
5525; GFX9-NODL-NEXT:    global_store_dword v2, v0, s[2:3]
5526; GFX9-NODL-NEXT:    s_endpgm
5527;
5528; GFX9-DL-LABEL: idot4_acc32_v8i8:
5529; GFX9-DL:       ; %bb.0: ; %entry
5530; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
5531; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
5532; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5533; GFX9-DL-NEXT:    v_mov_b32_e32 v2, 0
5534; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
5535; GFX9-DL-NEXT:    global_load_dwordx2 v[0:1], v0, s[0:1]
5536; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
5537; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, 0
5538; GFX9-DL-NEXT:    global_store_dword v2, v0, s[2:3]
5539; GFX9-DL-NEXT:    s_endpgm
5540;
5541; GFX10-DL-LABEL: idot4_acc32_v8i8:
5542; GFX10-DL:       ; %bb.0: ; %entry
5543; GFX10-DL-NEXT:    s_clause 0x1
5544; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
5545; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
5546; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5547; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
5548; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
5549; GFX10-DL-NEXT:    global_load_dwordx2 v[0:1], v0, s[0:1]
5550; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
5551; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, 0
5552; GFX10-DL-NEXT:    global_store_dword v2, v0, s[2:3]
5553; GFX10-DL-NEXT:    s_endpgm
5554;
5555; GFX11-DL-LABEL: idot4_acc32_v8i8:
5556; GFX11-DL:       ; %bb.0: ; %entry
5557; GFX11-DL-NEXT:    s_clause 0x1
5558; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
5559; GFX11-DL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x34
5560; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
5561; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
5562; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
5563; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5564; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
5565; GFX11-DL-NEXT:    global_load_b64 v[0:1], v0, s[0:1]
5566; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
5567; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, 0
5568; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[2:3]
5569; GFX11-DL-NEXT:    s_endpgm
5570                                       ptr addrspace(1) %src2,
5571                                       ptr addrspace(1) nocapture %dst) {
5572entry:
5573  %idx = call i32 @llvm.amdgcn.workitem.id.x()
5574  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
5575  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
5576
5577
5578  %v1e0 = extractelement <8 x i8> %vec1, i64 0
5579  %cv1e0 = zext i8 %v1e0 to i32
5580  %v2e0 = extractelement <8 x i8> %vec1, i64 4
5581  %cv2e0 = zext i8 %v2e0 to i32
5582  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
5583
5584  %v1e1 = extractelement <8 x i8> %vec1, i64 1
5585  %cv1e1 = zext i8 %v1e1 to i32
5586  %v2e1 = extractelement <8 x i8> %vec1, i64 5
5587  %cv2e1 = zext i8 %v2e1 to i32
5588  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
5589
5590  %v1e2 = extractelement <8 x i8> %vec1, i64 2
5591  %cv1e2 = zext i8 %v1e2 to i32
5592  %v2e2 = extractelement <8 x i8> %vec1, i64 6
5593  %cv2e2 = zext i8 %v2e2 to i32
5594  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
5595
5596  %v1e3 = extractelement <8 x i8> %vec1, i64 3
5597  %cv1e3 = zext i8 %v1e3 to i32
5598  %v2e3 = extractelement <8 x i8> %vec1, i64 7
5599  %cv2e3 = zext i8 %v2e3 to i32
5600  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
5601
5602  %add1 = add i32 %mul1, 0
5603  %add2 = add i32 %add1, %mul2
5604  %add3 = add i32 %add2, %mul3
5605  %add4 = add i32 %add3, %mul4
5606  store i32 %add4, ptr addrspace(1) %dst, align 4
5607  ret void
5608}
5609
; Unsigned 4-way byte dot product whose first operand is picked out of a
; <16 x i8> load at non-contiguous element indices (8, 10, 13, 15) and whose
; second operand is elements 0..3 of an <8 x i8> load. The running sum starts
; from the constant 0 and the i32 result is stored to %dst.
; The autogenerated assertions below show the configurations checked under the
; DL prefixes rearranging the operand bytes with v_perm_b32 and then issuing a
; single v_dot4_u32_u8, while the remaining configurations expand to
; v_mul_u32_u24 / v_mad_u32_u24 sequences.
5610define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
5611; GFX7-LABEL: idot4_acc32_v16i8:
5612; GFX7:       ; %bb.0: ; %entry
5613; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
5614; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
5615; GFX7-NEXT:    s_mov_b32 s3, 0xf000
5616; GFX7-NEXT:    s_mov_b32 s6, 0
5617; GFX7-NEXT:    s_mov_b32 s7, s3
5618; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5619; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
5620; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
5621; GFX7-NEXT:    v_mov_b32_e32 v2, 0
5622; GFX7-NEXT:    s_mov_b64 s[8:9], s[10:11]
5623; GFX7-NEXT:    s_mov_b64 s[10:11], s[6:7]
5624; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
5625; GFX7-NEXT:    v_mov_b32_e32 v5, v2
5626; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[1:2], s[4:7], 0 addr64
5627; GFX7-NEXT:    buffer_load_dword v0, v[4:5], s[8:11], 0 addr64
5628; GFX7-NEXT:    s_mov_b32 s2, -1
5629; GFX7-NEXT:    s_waitcnt vmcnt(1)
5630; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
5631; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
5632; GFX7-NEXT:    s_waitcnt vmcnt(0)
5633; GFX7-NEXT:    v_bfe_u32 v5, v0, 8, 8
5634; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
5635; GFX7-NEXT:    v_mul_u32_u24_e32 v2, v2, v5
5636; GFX7-NEXT:    v_bfe_u32 v6, v3, 8, 8
5637; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
5638; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v4, v2
5639; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
5640; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
5641; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v7, v1
5642; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v0, v1
5643; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5644; GFX7-NEXT:    s_endpgm
5645;
5646; GFX8-LABEL: idot4_acc32_v16i8:
5647; GFX8:       ; %bb.0: ; %entry
5648; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5649; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
5650; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
5651; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5652; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5653; GFX8-NEXT:    v_mov_b32_e32 v2, s1
5654; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
5655; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
5656; GFX8-NEXT:    v_mov_b32_e32 v3, s3
5657; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s2, v0
5658; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
5659; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[1:2]
5660; GFX8-NEXT:    flat_load_dword v4, v[4:5]
5661; GFX8-NEXT:    s_waitcnt vmcnt(1)
5662; GFX8-NEXT:    v_mov_b32_e32 v0, s4
5663; GFX8-NEXT:    v_mov_b32_e32 v1, s5
5664; GFX8-NEXT:    v_and_b32_e32 v5, 0xff, v2
5665; GFX8-NEXT:    s_waitcnt vmcnt(0)
5666; GFX8-NEXT:    v_and_b32_e32 v6, 0xff, v4
5667; GFX8-NEXT:    v_mul_u32_u24_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
5668; GFX8-NEXT:    v_bfe_u32 v7, v3, 8, 8
5669; GFX8-NEXT:    v_bfe_u32 v8, v4, 16, 8
5670; GFX8-NEXT:    v_mad_u32_u24 v2, v5, v6, v2
5671; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
5672; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v4
5673; GFX8-NEXT:    v_mad_u32_u24 v2, v7, v8, v2
5674; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v4, v2
5675; GFX8-NEXT:    flat_store_dword v[0:1], v2
5676; GFX8-NEXT:    s_endpgm
5677;
5678; GFX9-NODL-LABEL: idot4_acc32_v16i8:
5679; GFX9-NODL:       ; %bb.0: ; %entry
5680; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5681; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5682; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
5683; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
5684; GFX9-NODL-NEXT:    ; kill: killed $vgpr4
5685; GFX9-NODL-NEXT:    ; kill: killed $vgpr5
5686; GFX9-NODL-NEXT:    ; kill: killed $sgpr0_sgpr1_sgpr2 killed $sgpr3
5687; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
5688; GFX9-NODL-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1]
5689; GFX9-NODL-NEXT:    global_load_dword v0, v5, s[2:3]
5690; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
5691; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, 0
5692; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v2
5693; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
5694; GFX9-NODL-NEXT:    v_and_b32_e32 v5, 0xff, v0
5695; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
5696; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_2
5697; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
5698; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
5699; GFX9-NODL-NEXT:    v_add3_u32 v0, v2, v6, v0
5700; GFX9-NODL-NEXT:    global_store_dword v1, v0, s[6:7]
5701; GFX9-NODL-NEXT:    s_endpgm
5702;
5703; GFX9-DL-LABEL: idot4_acc32_v16i8:
5704; GFX9-DL:       ; %bb.0: ; %entry
5705; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5706; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5707; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
5708; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
5709; GFX9-DL-NEXT:    ; kill: killed $vgpr4
5710; GFX9-DL-NEXT:    ; kill: killed $vgpr5
5711; GFX9-DL-NEXT:    ; kill: killed $sgpr0_sgpr1_sgpr2 killed $sgpr3
5712; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
5713; GFX9-DL-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1]
5714; GFX9-DL-NEXT:    global_load_dword v0, v5, s[2:3]
5715; GFX9-DL-NEXT:    s_mov_b32 s0, 0x7050002
5716; GFX9-DL-NEXT:    s_mov_b32 s1, 0x3020001
5717; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
5718; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
5719; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v2, s0
5720; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
5721; GFX9-DL-NEXT:    v_perm_b32 v0, v0, v0, s1
5722; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v2, v0, 0
5723; GFX9-DL-NEXT:    global_store_dword v1, v0, s[6:7]
5724; GFX9-DL-NEXT:    s_endpgm
5725;
5726; GFX10-DL-LABEL: idot4_acc32_v16i8:
5727; GFX10-DL:       ; %bb.0: ; %entry
5728; GFX10-DL-NEXT:    s_clause 0x1
5729; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5730; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5731; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
5732; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
5733; GFX10-DL-NEXT:    ; kill: killed $vgpr4
5734; GFX10-DL-NEXT:    ; kill: killed $sgpr0_sgpr1_sgpr2 killed $sgpr3
5735; GFX10-DL-NEXT:    ; kill: killed $vgpr5
5736; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
5737; GFX10-DL-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1]
5738; GFX10-DL-NEXT:    global_load_dword v0, v5, s[2:3]
5739; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
5740; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v2, 0x7050002
5741; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
5742; GFX10-DL-NEXT:    v_perm_b32 v0, v0, v0, 0x3020001
5743; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
5744; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, 0
5745; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
5746; GFX10-DL-NEXT:    s_endpgm
5747;
5748; GFX11-DL-LABEL: idot4_acc32_v16i8:
5749; GFX11-DL:       ; %bb.0: ; %entry
5750; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
5751; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
5752; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
5753; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5754; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
5755; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
5756; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
5757; GFX11-DL-NEXT:    global_load_b128 v[0:3], v1, s[0:1]
5758; GFX11-DL-NEXT:    global_load_b32 v0, v4, s[2:3]
5759; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
5760; GFX11-DL-NEXT:    v_perm_b32 v1, v3, v2, 0x7050002
5761; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
5762; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
5763; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0x3020001
5764; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5765; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, 0
5766; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[4:5]
5767; GFX11-DL-NEXT:    s_endpgm
5768                                       ptr addrspace(1) %src2,
5769                                       ptr addrspace(1) nocapture %dst) {
5770entry:
  ; Each work item indexes both inputs by its x id: a <16 x i8> element of
  ; %src1 and an <8 x i8> element of %src2.
5771  %idx = call i32 @llvm.amdgcn.workitem.id.x()
5772  %gep1 = getelementptr <16 x i8>, ptr addrspace(1) %src1, i32 %idx
5773  %vec1 = load <16 x i8>, ptr addrspace(1) %gep1
5774  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
5775  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
5776
  ; Product 1: zext(vec1[8]) * zext(vec2[0]).
5777  %v1e0 = extractelement <16 x i8> %vec1, i64 8
5778  %cv1e0 = zext i8 %v1e0 to i32
5779  %v2e0 = extractelement <8 x i8> %vec2, i64 0
5780  %cv2e0 = zext i8 %v2e0 to i32
5781  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
5782
  ; Product 2: zext(vec1[10]) * zext(vec2[1]).
5783  %v1e1 = extractelement <16 x i8> %vec1, i64 10
5784  %cv1e1 = zext i8 %v1e1 to i32
5785  %v2e1 = extractelement <8 x i8> %vec2, i64 1
5786  %cv2e1 = zext i8 %v2e1 to i32
5787  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
5788
  ; Product 3: zext(vec1[13]) * zext(vec2[2]).
5789  %v1e2 = extractelement <16 x i8> %vec1, i64 13
5790  %cv1e2 = zext i8 %v1e2 to i32
5791  %v2e2 = extractelement <8 x i8> %vec2, i64 2
5792  %cv2e2 = zext i8 %v2e2 to i32
5793  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
5794
  ; Product 4: zext(vec1[15]) * zext(vec2[3]).
5795  %v1e3 = extractelement <16 x i8> %vec1, i64 15
5796  %cv1e3 = zext i8 %v1e3 to i32
5797  %v2e3 = extractelement <8 x i8> %vec2, i64 3
5798  %cv2e3 = zext i8 %v2e3 to i32
5799  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
5800
  ; Chain the four products onto a constant-zero start value; %dst is only
  ; written, never read, in this kernel.
5801  %add1 = add i32 %mul1, 0
5802  %add2 = add i32 %add1, %mul2
5803  %add3 = add i32 %add2, %mul3
5804  %add4 = add i32 %add3, %mul4
5805  store i32 %add4, ptr addrspace(1) %dst, align 4
5806  ret void
5807}
5808
; Unsigned 4-way byte dot product whose first operand comes from the last four
; elements of a <256 x i8> load, taken in the order 255, 254, 252, 253, and
; whose second operand is elements 0..3 of an <8 x i8> load. The running sum
; starts from the constant 0 and the i32 result is stored to %dst.
; The autogenerated assertions below show that only one dword of the wide
; vector is fetched (a dword load at byte offset 252). The configurations
; checked under the DL prefixes reorder the bytes with v_perm_b32 before a
; single v_dot4_u32_u8; the others expand to multiply/multiply-add sequences.
5809define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
5810; GFX7-LABEL: idot4_acc32_v256i8:
5811; GFX7:       ; %bb.0: ; %entry
5812; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
5813; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
5814; GFX7-NEXT:    s_mov_b32 s3, 0xf000
5815; GFX7-NEXT:    s_mov_b32 s6, 0
5816; GFX7-NEXT:    s_mov_b32 s7, s3
5817; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
5818; GFX7-NEXT:    v_mov_b32_e32 v2, 0
5819; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5820; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
5821; GFX7-NEXT:    s_mov_b64 s[10:11], s[6:7]
5822; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
5823; GFX7-NEXT:    v_mov_b32_e32 v4, v2
5824; GFX7-NEXT:    buffer_load_dword v0, v[1:2], s[8:11], 0 addr64 offset:252
5825; GFX7-NEXT:    buffer_load_dword v1, v[3:4], s[4:7], 0 addr64
5826; GFX7-NEXT:    s_mov_b32 s2, -1
5827; GFX7-NEXT:    s_waitcnt vmcnt(1)
5828; GFX7-NEXT:    v_bfe_u32 v4, v0, 16, 8
5829; GFX7-NEXT:    s_waitcnt vmcnt(0)
5830; GFX7-NEXT:    v_bfe_u32 v5, v1, 8, 8
5831; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
5832; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v1
5833; GFX7-NEXT:    v_mul_u32_u24_e32 v4, v4, v5
5834; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
5835; GFX7-NEXT:    v_bfe_u32 v7, v1, 16, 8
5836; GFX7-NEXT:    v_mad_u32_u24 v2, v2, v3, v4
5837; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
5838; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
5839; GFX7-NEXT:    v_mad_u32_u24 v2, v6, v7, v2
5840; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v1, v2
5841; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5842; GFX7-NEXT:    s_endpgm
5843;
5844; GFX8-LABEL: idot4_acc32_v256i8:
5845; GFX8:       ; %bb.0: ; %entry
5846; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5847; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
5848; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
5849; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5850; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5851; GFX8-NEXT:    v_mov_b32_e32 v2, s1
5852; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s0, v1
5853; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
5854; GFX8-NEXT:    v_mov_b32_e32 v1, s3
5855; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
5856; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5857; GFX8-NEXT:    s_movk_i32 s0, 0xfc
5858; GFX8-NEXT:    flat_load_dword v4, v[0:1]
5859; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v3
5860; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
5861; GFX8-NEXT:    flat_load_dword v2, v[0:1]
5862; GFX8-NEXT:    v_mov_b32_e32 v0, s4
5863; GFX8-NEXT:    v_mov_b32_e32 v1, s5
5864; GFX8-NEXT:    s_waitcnt vmcnt(1)
5865; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v4
5866; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 8
5867; GFX8-NEXT:    s_waitcnt vmcnt(0)
5868; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
5869; GFX8-NEXT:    v_mul_u32_u24_sdwa v7, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
5870; GFX8-NEXT:    v_and_b32_e32 v8, 0xff, v2
5871; GFX8-NEXT:    v_mad_u32_u24 v3, v6, v3, v7
5872; GFX8-NEXT:    v_bfe_u32 v2, v2, 8, 8
5873; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 24, v4
5874; GFX8-NEXT:    v_mad_u32_u24 v3, v8, v5, v3
5875; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v4, v3
5876; GFX8-NEXT:    flat_store_dword v[0:1], v2
5877; GFX8-NEXT:    s_endpgm
5878;
5879; GFX9-NODL-LABEL: idot4_acc32_v256i8:
5880; GFX9-NODL:       ; %bb.0: ; %entry
5881; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5882; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5883; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
5884; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5885; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
5886; GFX9-NODL-NEXT:    global_load_dword v2, v1, s[0:1] offset:252
5887; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[2:3]
5888; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
5889; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
5890; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 24, v2
5891; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
5892; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v3
5893; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1
5894; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
5895; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_3
5896; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v1, v4, v5
5897; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v6, v2
5898; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
5899; GFX9-NODL-NEXT:    s_endpgm
5900;
5901; GFX9-DL-LABEL: idot4_acc32_v256i8:
5902; GFX9-DL:       ; %bb.0: ; %entry
5903; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5904; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5905; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
5906; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
5907; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
5908; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
5909; GFX9-DL-NEXT:    global_load_dword v3, v1, s[0:1] offset:252
5910; GFX9-DL-NEXT:    s_mov_b32 s0, 0x3020001
5911; GFX9-DL-NEXT:    s_mov_b32 s1, 0x1000302
5912; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
5913; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
5914; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v2, s0
5915; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
5916; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s1
5917; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v2, v1, 0
5918; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
5919; GFX9-DL-NEXT:    s_endpgm
5920;
5921; GFX10-DL-LABEL: idot4_acc32_v256i8:
5922; GFX10-DL:       ; %bb.0: ; %entry
5923; GFX10-DL-NEXT:    s_clause 0x1
5924; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5925; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5926; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
5927; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
5928; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
5929; GFX10-DL-NEXT:    global_load_dword v2, v1, s[2:3]
5930; GFX10-DL-NEXT:    global_load_dword v3, v0, s[0:1] offset:252
5931; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
5932; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v2, 0x3020001
5933; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
5934; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0x1000302
5935; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
5936; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, 0
5937; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
5938; GFX10-DL-NEXT:    s_endpgm
5939;
5940; GFX11-DL-LABEL: idot4_acc32_v256i8:
5941; GFX11-DL:       ; %bb.0: ; %entry
5942; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
5943; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
5944; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
5945; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5946; GFX11-DL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 3, v0
5947; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
5948; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
5949; GFX11-DL-NEXT:    global_load_b32 v1, v1, s[2:3]
5950; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1] offset:252
5951; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
5952; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0x3020001
5953; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
5954; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0x1000302
5955; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5956; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v0, v1, 0
5957; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[4:5]
5958; GFX11-DL-NEXT:    s_endpgm
5959                                       ptr addrspace(1) %src2,
5960                                       ptr addrspace(1) nocapture %dst) {
5961entry:
  ; Each work item indexes both inputs by its x id: a <256 x i8> element of
  ; %src1 and an <8 x i8> element of %src2.
5962  %idx = call i32 @llvm.amdgcn.workitem.id.x()
5963  %gep1 = getelementptr <256 x i8>, ptr addrspace(1) %src1, i32 %idx
5964  %vec1 = load <256 x i8>, ptr addrspace(1) %gep1
5965  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
5966  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
5967
  ; Product 1: zext(vec1[255]) * zext(vec2[0]).
5968  %v1e0 = extractelement <256 x i8> %vec1, i64 255
5969  %cv1e0 = zext i8 %v1e0 to i32
5970  %v2e0 = extractelement <8 x i8> %vec2, i64 0
5971  %cv2e0 = zext i8 %v2e0 to i32
5972  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
5973
  ; Product 2: zext(vec1[254]) * zext(vec2[1]).
5974  %v1e1 = extractelement <256 x i8> %vec1, i64 254
5975  %cv1e1 = zext i8 %v1e1 to i32
5976  %v2e1 = extractelement <8 x i8> %vec2, i64 1
5977  %cv2e1 = zext i8 %v2e1 to i32
5978  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
5979
  ; Product 3: zext(vec1[252]) * zext(vec2[2]). Note that 252 comes before 253
  ; here, so the vec1 indices are not monotonic.
5980  %v1e2 = extractelement <256 x i8> %vec1, i64 252
5981  %cv1e2 = zext i8 %v1e2 to i32
5982  %v2e2 = extractelement <8 x i8> %vec2, i64 2
5983  %cv2e2 = zext i8 %v2e2 to i32
5984  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
5985
  ; Product 4: zext(vec1[253]) * zext(vec2[3]).
5986  %v1e3 = extractelement <256 x i8> %vec1, i64 253
5987  %cv1e3 = zext i8 %v1e3 to i32
5988  %v2e3 = extractelement <8 x i8> %vec2, i64 3
5989  %cv2e3 = zext i8 %v2e3 to i32
5990  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
5991
  ; Chain the four products onto a constant-zero start value; %dst is only
  ; written, never read, in this kernel.
5992  %add1 = add i32 %mul1, 0
5993  %add2 = add i32 %add1, %mul2
5994  %add3 = add i32 %add2, %mul3
5995  %add4 = add i32 %add3, %mul4
5996  store i32 %add4, ptr addrspace(1) %dst, align 4
5997  ret void
5998}
5999
; Two-product byte multiply-accumulate over <4 x i8> inputs where most of the
; byte-to-i32 widenings are written as "sext then and 255" instead of zext.
; Masking with 255 discards the sign-extended high bits, so each such value
; equals the zero-extended byte. The accumulator is loaded from %dst and the
; sum is stored back to %dst.
; The autogenerated assertions below show the configurations checked under the
; DL prefixes still selecting v_dot4_u32_u8, using v_perm_b32 masks 0xc0c0100
; and 0xc0c0500 to build the two operands; the others use multiply/multiply-add.
; NOTE(review): %cv2e0 below is derived from %cv1e0t, not %cv2e0t, so %mul1 is
; the square of vec1[0] and %cv2e0t is unused. This looks like a copy-paste
; slip, but the assertions encode it (e.g. "v_mad_u32_u24 v1, v1, v1, s0");
; confirm intent, and regenerate the checks if the IR is ever changed.
6000define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
6001; GFX7-LABEL: idot4_acc32_anyext:
6002; GFX7:       ; %bb.0: ; %entry
6003; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
6004; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
6005; GFX7-NEXT:    s_mov_b32 s7, 0xf000
6006; GFX7-NEXT:    s_mov_b32 s10, 0
6007; GFX7-NEXT:    s_mov_b32 s11, s7
6008; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6009; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
6010; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
6011; GFX7-NEXT:    v_mov_b32_e32 v1, 0
6012; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
6013; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
6014; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
6015; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
6016; GFX7-NEXT:    s_mov_b32 s6, -1
6017; GFX7-NEXT:    s_waitcnt vmcnt(1)
6018; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
6019; GFX7-NEXT:    v_bfe_u32 v2, v2, 8, 8
6020; GFX7-NEXT:    s_waitcnt vmcnt(0)
6021; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
6022; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
6023; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v1, s0
6024; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
6025; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6026; GFX7-NEXT:    s_endpgm
6027;
6028; GFX8-LABEL: idot4_acc32_anyext:
6029; GFX8:       ; %bb.0: ; %entry
6030; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6031; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
6032; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
6033; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6034; GFX8-NEXT:    v_mov_b32_e32 v1, s1
6035; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
6036; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6037; GFX8-NEXT:    flat_load_dword v3, v[0:1]
6038; GFX8-NEXT:    v_mov_b32_e32 v1, s3
6039; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
6040; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6041; GFX8-NEXT:    flat_load_dword v0, v[0:1]
6042; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
6043; GFX8-NEXT:    s_waitcnt vmcnt(1)
6044; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
6045; GFX8-NEXT:    v_bfe_u32 v2, v3, 8, 8
6046; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
6047; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v1, s0
6048; GFX8-NEXT:    s_waitcnt vmcnt(0)
6049; GFX8-NEXT:    v_bfe_u32 v0, v0, 8, 8
6050; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v0, v1
6051; GFX8-NEXT:    v_mov_b32_e32 v0, s4
6052; GFX8-NEXT:    v_mov_b32_e32 v1, s5
6053; GFX8-NEXT:    flat_store_dword v[0:1], v2
6054; GFX8-NEXT:    s_endpgm
6055;
6056; GFX9-NODL-LABEL: idot4_acc32_anyext:
6057; GFX9-NODL:       ; %bb.0: ; %entry
6058; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6059; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
6060; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
6061; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
6062; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
6063; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
6064; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
6065; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
6066; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
6067; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
6068; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
6069; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
6070; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
6071; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, s0, v1
6072; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
6073; GFX9-NODL-NEXT:    s_endpgm
6074;
6075; GFX9-DL-LABEL: idot4_acc32_anyext:
6076; GFX9-DL:       ; %bb.0: ; %entry
6077; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6078; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
6079; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
6080; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
6081; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
6082; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
6083; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
6084; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0500
6085; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc0c0100
6086; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
6087; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
6088; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v1, s1
6089; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s2
6090; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
6091; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s0
6092; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
6093; GFX9-DL-NEXT:    s_endpgm
6094;
6095; GFX10-DL-LABEL: idot4_acc32_anyext:
6096; GFX10-DL:       ; %bb.0: ; %entry
6097; GFX10-DL-NEXT:    s_clause 0x1
6098; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6099; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
6100; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
6101; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
6102; GFX10-DL-NEXT:    s_clause 0x1
6103; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
6104; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
6105; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
6106; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
6107; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
6108; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0xc0c0500
6109; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0100
6110; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
6111; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
6112; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s0
6113; GFX10-DL-NEXT:    global_store_dword v2, v0, s[6:7]
6114; GFX10-DL-NEXT:    s_endpgm
6115;
6116; GFX11-DL-LABEL: idot4_acc32_anyext:
6117; GFX11-DL:       ; %bb.0: ; %entry
6118; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
6119; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
6120; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
6121; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
6122; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
6123; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
6124; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
6125; GFX11-DL-NEXT:    s_clause 0x1
6126; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
6127; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
6128; GFX11-DL-NEXT:    s_load_b32 s0, s[4:5], 0x0
6129; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
6130; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v1, 0xc0c0500
6131; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0100
6132; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
6133; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6134; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s0
6135; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[4:5]
6136; GFX11-DL-NEXT:    s_endpgm
6137                                       ptr addrspace(1) %src2,
6138                                       ptr addrspace(1) nocapture %dst) {
6139entry:
  ; Each work item loads one <4 x i8> from each input, indexed by its x id.
6140  %idx = call i32 @llvm.amdgcn.workitem.id.x()
6141  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
6142  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
6143  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
6144  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
6145
  ; Product 1: both factors are (sext & 255) of vec1[0]; %cv2e0 reads %cv1e0t,
  ; which leaves %cv2e0t (from vec2[0]) dead.
6146  %v1e0 = extractelement <4 x i8> %vec1, i64 0
6147  %cv1e0t = sext i8 %v1e0 to i32
6148  %cv1e0 = and i32 %cv1e0t, 255
6149  %v2e0 = extractelement <4 x i8> %vec2, i64 0
6150  %cv2e0t = sext i8 %v2e0 to i32
6151  %cv2e0 = and i32 %cv1e0t, 255
6152  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
6153
  ; Product 2: zext(vec1[1]) * (sext(vec2[1]) & 255), mixing the two widening
  ; spellings in one multiply.
6154  %v1e1 = extractelement <4 x i8> %vec1, i64 1
6155  %cv1e1 = zext i8 %v1e1 to i32
6156  %v2e1 = extractelement <4 x i8> %vec2, i64 1
6157  %cv2e1t = sext i8 %v2e1 to i32
6158  %cv2e1 = and i32 %cv2e1t, 255
6159  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
6160
  ; Accumulate onto the i32 currently stored at %dst and write the sum back.
6161  %acc = load i32, ptr addrspace(1) %dst, align 4
6162  %add1 = add i32 %mul1, %acc
6163  %add2 = add i32 %add1, %mul2
6164  store i32 %add2, ptr addrspace(1) %dst, align 4
6165  ret void
6166}
6167
; Intrinsic called by the kernels above to obtain the i32 %idx used as the
; getelementptr index for their input loads.
6168declare i32 @llvm.amdgcn.workitem.id.x()
6169