xref: /llvm-project/llvm/test/CodeGen/AMDGPU/idot8u.ll (revision 5a3299a684d7d8c40f48d732e5b80a8bd29aa882)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefixes=GFX10-DL %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GFX10-DL %s
8
; udot8_acc32 -- unsigned 8-lane, 4-bit dot product with a 32-bit accumulator.
; Each work-item loads one <8 x i4> from %src1 and one from %src2 (indexed by
; workitem.id.x), zero-extends every lane to i32, multiplies lane-wise, and
; adds the eight products to the i32 read from %dst before storing it back.
; Per the assertions below, the gfx906/gfx1011/gfx1012 runs select a single
; v_dot8_u32_u4, while the gfx700/gfx803/gfx900 runs expand to bitfield
; extracts followed by mad or mul+add3 chains.
9define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
10; GFX7-LABEL: udot8_acc32:
11; GFX7:       ; %bb.0: ; %entry
12; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
13; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
14; GFX7-NEXT:    s_mov_b32 s14, -1
15; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
16; GFX7-NEXT:    s_add_u32 s12, s12, s11
17; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
18; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
19; GFX7-NEXT:    s_mov_b32 s3, 0xf000
20; GFX7-NEXT:    s_mov_b32 s6, 0
21; GFX7-NEXT:    s_mov_b32 s7, s3
22; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
23; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
24; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
25; GFX7-NEXT:    v_mov_b32_e32 v1, 0
26; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
27; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
28; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
29; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
30; GFX7-NEXT:    s_mov_b32 s2, -1
31; GFX7-NEXT:    s_addc_u32 s13, s13, 0
32; GFX7-NEXT:    s_waitcnt vmcnt(1)
33; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 28, v2
34; GFX7-NEXT:    v_bfe_u32 v3, v2, 24, 4
35; GFX7-NEXT:    v_bfe_u32 v4, v2, 20, 4
36; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 4
37; GFX7-NEXT:    v_bfe_u32 v6, v2, 12, 4
38; GFX7-NEXT:    v_bfe_u32 v7, v2, 8, 4
39; GFX7-NEXT:    v_bfe_u32 v8, v2, 4, 4
40; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
41; GFX7-NEXT:    s_waitcnt vmcnt(0)
42; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
43; GFX7-NEXT:    v_bfe_u32 v10, v0, 24, 4
44; GFX7-NEXT:    v_bfe_u32 v11, v0, 20, 4
45; GFX7-NEXT:    v_bfe_u32 v12, v0, 16, 4
46; GFX7-NEXT:    v_bfe_u32 v13, v0, 12, 4
47; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 4
48; GFX7-NEXT:    v_bfe_u32 v15, v0, 4, 4
49; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
50; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
51; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, s4
52; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
53; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
54; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
55; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
56; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
57; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
58; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v9, v0
59; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
60; GFX7-NEXT:    s_endpgm
61;
62; GFX8-LABEL: udot8_acc32:
63; GFX8:       ; %bb.0: ; %entry
64; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
65; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
66; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
67; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
68; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
69; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
70; GFX8-NEXT:    v_mov_b32_e32 v1, s1
71; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
72; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
73; GFX8-NEXT:    flat_load_dword v3, v[0:1]
74; GFX8-NEXT:    v_mov_b32_e32 v1, s3
75; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
76; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
77; GFX8-NEXT:    flat_load_dword v0, v[0:1]
78; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
79; GFX8-NEXT:    s_mov_b32 s14, -1
80; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
81; GFX8-NEXT:    s_add_u32 s12, s12, s11
82; GFX8-NEXT:    s_addc_u32 s13, s13, 0
83; GFX8-NEXT:    s_waitcnt vmcnt(1)
84; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 28, v3
85; GFX8-NEXT:    v_bfe_u32 v2, v3, 24, 4
86; GFX8-NEXT:    v_bfe_u32 v4, v3, 20, 4
87; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 4
88; GFX8-NEXT:    v_bfe_u32 v6, v3, 12, 4
89; GFX8-NEXT:    v_bfe_u32 v7, v3, 8, 4
90; GFX8-NEXT:    v_bfe_u32 v8, v3, 4, 4
91; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
92; GFX8-NEXT:    s_waitcnt vmcnt(0)
93; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
94; GFX8-NEXT:    v_bfe_u32 v10, v0, 24, 4
95; GFX8-NEXT:    v_bfe_u32 v11, v0, 20, 4
96; GFX8-NEXT:    v_bfe_u32 v12, v0, 16, 4
97; GFX8-NEXT:    v_bfe_u32 v13, v0, 12, 4
98; GFX8-NEXT:    v_bfe_u32 v14, v0, 8, 4
99; GFX8-NEXT:    v_bfe_u32 v15, v0, 4, 4
100; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
101; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
102; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v0, s0
103; GFX8-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
104; GFX8-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
105; GFX8-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
106; GFX8-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
107; GFX8-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
108; GFX8-NEXT:    v_mad_u32_u24 v0, v2, v10, v0
109; GFX8-NEXT:    v_mad_u32_u24 v2, v1, v9, v0
110; GFX8-NEXT:    v_mov_b32_e32 v0, s4
111; GFX8-NEXT:    v_mov_b32_e32 v1, s5
112; GFX8-NEXT:    flat_store_dword v[0:1], v2
113; GFX8-NEXT:    s_endpgm
114;
115; GFX9-LABEL: udot8_acc32:
116; GFX9:       ; %bb.0: ; %entry
117; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
118; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
119; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
120; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
121; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
122; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
123; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
124; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
125; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
126; GFX9-NEXT:    s_mov_b32 s14, -1
127; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
128; GFX9-NEXT:    s_add_u32 s12, s12, s11
129; GFX9-NEXT:    v_mov_b32_e32 v0, 0
130; GFX9-NEXT:    s_addc_u32 s13, s13, 0
131; GFX9-NEXT:    s_waitcnt vmcnt(1)
132; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 28, v1
133; GFX9-NEXT:    v_bfe_u32 v4, v1, 24, 4
134; GFX9-NEXT:    v_bfe_u32 v5, v1, 20, 4
135; GFX9-NEXT:    v_bfe_u32 v6, v1, 16, 4
136; GFX9-NEXT:    v_bfe_u32 v7, v1, 12, 4
137; GFX9-NEXT:    v_bfe_u32 v8, v1, 8, 4
138; GFX9-NEXT:    v_bfe_u32 v9, v1, 4, 4
139; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
140; GFX9-NEXT:    s_waitcnt vmcnt(0)
141; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
142; GFX9-NEXT:    v_bfe_u32 v11, v2, 24, 4
143; GFX9-NEXT:    v_bfe_u32 v12, v2, 20, 4
144; GFX9-NEXT:    v_bfe_u32 v13, v2, 16, 4
145; GFX9-NEXT:    v_bfe_u32 v14, v2, 12, 4
146; GFX9-NEXT:    v_bfe_u32 v15, v2, 8, 4
147; GFX9-NEXT:    v_bfe_u32 v16, v2, 4, 4
148; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
149; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
150; GFX9-NEXT:    v_mul_u32_u24_e32 v2, v9, v16
151; GFX9-NEXT:    v_mul_u32_u24_e32 v8, v8, v15
152; GFX9-NEXT:    v_mul_u32_u24_e32 v7, v7, v14
153; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
154; GFX9-NEXT:    v_add3_u32 v1, v1, s0, v2
155; GFX9-NEXT:    v_mul_u32_u24_e32 v6, v6, v13
156; GFX9-NEXT:    v_mul_u32_u24_e32 v5, v5, v12
157; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v7
158; GFX9-NEXT:    v_mul_u32_u24_e32 v4, v4, v11
159; GFX9-NEXT:    v_mul_u32_u24_e32 v3, v3, v10
160; GFX9-NEXT:    v_add3_u32 v1, v1, v6, v5
161; GFX9-NEXT:    v_add3_u32 v1, v1, v4, v3
162; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
163; GFX9-NEXT:    s_endpgm
164;
165; GFX9-DL-LABEL: udot8_acc32:
166; GFX9-DL:       ; %bb.0: ; %entry
167; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
168; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
169; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
170; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
171; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
172; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
173; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
174; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
175; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
176; GFX9-DL-NEXT:    s_mov_b32 s14, -1
177; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
178; GFX9-DL-NEXT:    s_add_u32 s12, s12, s11
179; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
180; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
181; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
182; GFX9-DL-NEXT:    v_dot8_u32_u4 v1, v1, v2, s0
183; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
184; GFX9-DL-NEXT:    s_endpgm
185;
186; GFX10-DL-LABEL: udot8_acc32:
187; GFX10-DL:       ; %bb.0: ; %entry
188; GFX10-DL-NEXT:    s_clause 0x1
189; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
190; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
191; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
192; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
193; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
194; GFX10-DL-NEXT:    s_mov_b32 s14, -1
195; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
196; GFX10-DL-NEXT:    s_add_u32 s12, s12, s11
197; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
198; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
199; GFX10-DL-NEXT:    s_clause 0x1
200; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
201; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
202; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
203; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
204; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
205; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
206; GFX10-DL-NEXT:    v_dot8_u32_u4 v1, v1, v2, s0
207; GFX10-DL-NEXT:    global_store_dword v0, v1, s[6:7]
208; GFX10-DL-NEXT:    s_endpgm
209                                       ptr addrspace(1) %src2,
210                                       ptr addrspace(1) nocapture %dst) {
211entry:
  ; Each work-item picks its own <8 x i4> element (one packed dword in the
  ; generated code) from both source buffers, indexed by workitem.id.x.
212  %idx = call i32 @llvm.amdgcn.workitem.id.x()
213  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
214  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
215  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
216  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
217
  ; Lanes 0..7: extract lane k from both vectors, zero-extend to i32, multiply.
  ; Both operands are at most 15, so each product is at most 225 and the
  ; nuw/nsw flags on the multiplies hold.
218  %v1e0 = extractelement <8 x i4> %vec1, i64 0
219  %cv1e0 = zext i4 %v1e0 to i32
220  %v2e0 = extractelement <8 x i4> %vec2, i64 0
221  %cv2e0 = zext i4 %v2e0 to i32
222  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
223
224  %v1e1 = extractelement <8 x i4> %vec1, i64 1
225  %cv1e1 = zext i4 %v1e1 to i32
226  %v2e1 = extractelement <8 x i4> %vec2, i64 1
227  %cv2e1 = zext i4 %v2e1 to i32
228  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
229
230  %v1e2 = extractelement <8 x i4> %vec1, i64 2
231  %cv1e2 = zext i4 %v1e2 to i32
232  %v2e2 = extractelement <8 x i4> %vec2, i64 2
233  %cv2e2 = zext i4 %v2e2 to i32
234  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
235
236  %v1e3 = extractelement <8 x i4> %vec1, i64 3
237  %cv1e3 = zext i4 %v1e3 to i32
238  %v2e3 = extractelement <8 x i4> %vec2, i64 3
239  %cv2e3 = zext i4 %v2e3 to i32
240  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
241
242  %v1e4 = extractelement <8 x i4> %vec1, i64 4
243  %cv1e4 = zext i4 %v1e4 to i32
244  %v2e4 = extractelement <8 x i4> %vec2, i64 4
245  %cv2e4 = zext i4 %v2e4 to i32
246  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
247
248  %v1e5 = extractelement <8 x i4> %vec1, i64 5
249  %cv1e5 = zext i4 %v1e5 to i32
250  %v2e5 = extractelement <8 x i4> %vec2, i64 5
251  %cv2e5 = zext i4 %v2e5 to i32
252  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
253
254  %v1e6 = extractelement <8 x i4> %vec1, i64 6
255  %cv1e6 = zext i4 %v1e6 to i32
256  %v2e6 = extractelement <8 x i4> %vec2, i64 6
257  %cv2e6 = zext i4 %v2e6 to i32
258  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
259
260  %v1e7 = extractelement <8 x i4> %vec1, i64 7
261  %cv1e7 = zext i4 %v1e7 to i32
262  %v2e7 = extractelement <8 x i4> %vec2, i64 7
263  %cv2e7 = zext i4 %v2e7 to i32
264  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
265
  ; Seed the sum with the i32 currently stored at %dst, then add the eight
  ; products in lane order (0 through 7).
266  %acc = load i32, ptr addrspace(1) %dst, align 4
267  %add1 = add i32 %mul0, %acc
268  %add2 = add i32 %add1, %mul1
269  %add3 = add i32 %add2, %mul2
270  %add4 = add i32 %add3, %mul3
271  %add5 = add i32 %add4, %mul4
272  %add6 = add i32 %add5, %mul5
273  %add7 = add i32 %add6, %mul6
274  %add8 = add i32 %add7, %mul7
275
  ; Write the accumulated result back to the location the seed was read from.
276  store i32 %add8, ptr addrspace(1) %dst, align 4
277  ret void
278}
279
280; TODO: Remove the unnecessary instruction (the one zero-extending the
281; 2nd MAD) so that the pattern recognizer can kick in.
; udot8_acc16 -- unsigned 8-lane, 4-bit dot product with a 16-bit accumulator.
; Each work-item loads one <8 x i4> from %src1 and one from %src2 (indexed by
; workitem.id.x), zero-extends every lane to i16, multiplies lane-wise, and
; adds the eight products to the i16 read from %dst before storing it back.
; Per the assertions below, no run line selects v_dot8_u32_u4 for this
; kernel; every target expands to per-lane extracts feeding a chain of
; 16-bit or 24-bit MAD instructions.
282define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
283; GFX7-LABEL: udot8_acc16:
284; GFX7:       ; %bb.0: ; %entry
285; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
286; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
287; GFX7-NEXT:    s_mov_b32 s14, -1
288; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
289; GFX7-NEXT:    s_add_u32 s12, s12, s11
290; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
291; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
292; GFX7-NEXT:    s_mov_b32 s3, 0xf000
293; GFX7-NEXT:    s_mov_b32 s6, 0
294; GFX7-NEXT:    s_mov_b32 s7, s3
295; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
296; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
297; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
298; GFX7-NEXT:    v_mov_b32_e32 v1, 0
299; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
300; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
301; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
302; GFX7-NEXT:    s_mov_b32 s2, -1
303; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
304; GFX7-NEXT:    s_addc_u32 s13, s13, 0
305; GFX7-NEXT:    s_waitcnt vmcnt(2)
306; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 28, v2
307; GFX7-NEXT:    v_bfe_u32 v4, v2, 24, 4
308; GFX7-NEXT:    v_bfe_u32 v5, v2, 20, 4
309; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 4
310; GFX7-NEXT:    v_bfe_u32 v7, v2, 12, 4
311; GFX7-NEXT:    v_bfe_u32 v8, v2, 8, 4
312; GFX7-NEXT:    v_bfe_u32 v9, v2, 4, 4
313; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
314; GFX7-NEXT:    s_waitcnt vmcnt(1)
315; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 28, v0
316; GFX7-NEXT:    v_bfe_u32 v11, v0, 24, 4
317; GFX7-NEXT:    v_bfe_u32 v12, v0, 20, 4
318; GFX7-NEXT:    v_bfe_u32 v13, v0, 16, 4
319; GFX7-NEXT:    v_bfe_u32 v14, v0, 12, 4
320; GFX7-NEXT:    v_bfe_u32 v15, v0, 8, 4
321; GFX7-NEXT:    v_bfe_u32 v16, v0, 4, 4
322; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
323; GFX7-NEXT:    s_waitcnt vmcnt(0)
324; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
325; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
326; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
327; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
328; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
329; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
330; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
331; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
332; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
333; GFX7-NEXT:    s_endpgm
334;
335; GFX8-LABEL: udot8_acc16:
336; GFX8:       ; %bb.0: ; %entry
337; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
338; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
339; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
340; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
341; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
342; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
343; GFX8-NEXT:    v_mov_b32_e32 v1, s1
344; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
345; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
346; GFX8-NEXT:    flat_load_dword v3, v[0:1]
347; GFX8-NEXT:    v_mov_b32_e32 v1, s3
348; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
349; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
350; GFX8-NEXT:    flat_load_dword v2, v[0:1]
351; GFX8-NEXT:    v_mov_b32_e32 v0, s4
352; GFX8-NEXT:    v_mov_b32_e32 v1, s5
353; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
354; GFX8-NEXT:    s_mov_b32 s14, -1
355; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
356; GFX8-NEXT:    s_add_u32 s12, s12, s11
357; GFX8-NEXT:    s_addc_u32 s13, s13, 0
358; GFX8-NEXT:    s_waitcnt vmcnt(2)
359; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
360; GFX8-NEXT:    v_bfe_u32 v6, v3, 24, 4
361; GFX8-NEXT:    v_bfe_u32 v7, v3, 20, 4
362; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 4
363; GFX8-NEXT:    v_bfe_u32 v9, v3, 12, 4
364; GFX8-NEXT:    v_bfe_u32 v10, v3, 8, 4
365; GFX8-NEXT:    v_bfe_u32 v11, v3, 4, 4
366; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
367; GFX8-NEXT:    s_waitcnt vmcnt(1)
368; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
369; GFX8-NEXT:    v_bfe_u32 v13, v2, 24, 4
370; GFX8-NEXT:    v_bfe_u32 v14, v2, 20, 4
371; GFX8-NEXT:    v_bfe_u32 v15, v2, 16, 4
372; GFX8-NEXT:    v_bfe_u32 v16, v2, 12, 4
373; GFX8-NEXT:    v_bfe_u32 v17, v2, 8, 4
374; GFX8-NEXT:    v_bfe_u32 v18, v2, 4, 4
375; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
376; GFX8-NEXT:    s_waitcnt vmcnt(0)
377; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
378; GFX8-NEXT:    v_mad_u16 v2, v11, v18, v2
379; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
380; GFX8-NEXT:    v_mad_u16 v2, v9, v16, v2
381; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
382; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
383; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
384; GFX8-NEXT:    v_mad_u16 v2, v5, v12, v2
385; GFX8-NEXT:    flat_store_short v[0:1], v2
386; GFX8-NEXT:    s_endpgm
387;
388; GFX9-LABEL: udot8_acc16:
389; GFX9:       ; %bb.0: ; %entry
390; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
391; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
392; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
393; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
394; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
395; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
396; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
397; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
398; GFX9-NEXT:    v_mov_b32_e32 v0, 0
399; GFX9-NEXT:    global_load_ushort v3, v0, s[6:7]
400; GFX9-NEXT:    s_mov_b32 s14, -1
401; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
402; GFX9-NEXT:    s_add_u32 s12, s12, s11
403; GFX9-NEXT:    s_addc_u32 s13, s13, 0
404; GFX9-NEXT:    s_waitcnt vmcnt(2)
405; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
406; GFX9-NEXT:    v_bfe_u32 v5, v1, 24, 4
407; GFX9-NEXT:    v_bfe_u32 v6, v1, 20, 4
408; GFX9-NEXT:    v_bfe_u32 v7, v1, 16, 4
409; GFX9-NEXT:    v_bfe_u32 v8, v1, 12, 4
410; GFX9-NEXT:    v_bfe_u32 v9, v1, 8, 4
411; GFX9-NEXT:    v_bfe_u32 v10, v1, 4, 4
412; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
413; GFX9-NEXT:    s_waitcnt vmcnt(1)
414; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
415; GFX9-NEXT:    v_bfe_u32 v12, v2, 24, 4
416; GFX9-NEXT:    v_bfe_u32 v13, v2, 20, 4
417; GFX9-NEXT:    v_bfe_u32 v14, v2, 16, 4
418; GFX9-NEXT:    v_bfe_u32 v15, v2, 12, 4
419; GFX9-NEXT:    v_bfe_u32 v16, v2, 8, 4
420; GFX9-NEXT:    v_bfe_u32 v17, v2, 4, 4
421; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
422; GFX9-NEXT:    s_waitcnt vmcnt(0)
423; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
424; GFX9-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
425; GFX9-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
426; GFX9-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
427; GFX9-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
428; GFX9-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
429; GFX9-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
430; GFX9-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
431; GFX9-NEXT:    global_store_short v0, v1, s[6:7]
432; GFX9-NEXT:    s_endpgm
433;
434; GFX9-DL-LABEL: udot8_acc16:
435; GFX9-DL:       ; %bb.0: ; %entry
436; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
437; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
438; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
439; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
440; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
441; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
442; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
443; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
444; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
445; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[6:7]
446; GFX9-DL-NEXT:    s_mov_b32 s14, -1
447; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
448; GFX9-DL-NEXT:    s_add_u32 s12, s12, s11
449; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
450; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
451; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
452; GFX9-DL-NEXT:    v_bfe_u32 v5, v1, 24, 4
453; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 20, 4
454; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 16, 4
455; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 12, 4
456; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
457; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 4, 4
458; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
459; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
460; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
461; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 24, 4
462; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
463; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 16, 4
464; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 12, 4
465; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 8, 4
466; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 4, 4
467; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
468; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
469; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
470; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
471; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
472; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
473; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
474; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
475; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
476; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
477; GFX9-DL-NEXT:    global_store_short v0, v1, s[6:7]
478; GFX9-DL-NEXT:    s_endpgm
479;
480; GFX10-DL-LABEL: udot8_acc16:
481; GFX10-DL:       ; %bb.0: ; %entry
482; GFX10-DL-NEXT:    s_clause 0x1
483; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
484; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
485; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
486; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
487; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
488; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
489; GFX10-DL-NEXT:    s_mov_b32 s14, -1
490; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
491; GFX10-DL-NEXT:    s_add_u32 s12, s12, s11
492; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
493; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
494; GFX10-DL-NEXT:    s_clause 0x1
495; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
496; GFX10-DL-NEXT:    global_load_dword v3, v0, s[2:3]
497; GFX10-DL-NEXT:    global_load_ushort v4, v1, s[6:7]
498; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
499; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
500; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
501; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
502; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
503; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
504; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
505; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
506; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
507; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
508; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
509; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
510; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
511; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
512; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
513; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
514; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
515; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
516; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
517; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
518; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 24, 4
519; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 24, 4
520; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
521; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
522; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
523; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
524; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
525; GFX10-DL-NEXT:    global_store_short v1, v0, s[6:7]
526; GFX10-DL-NEXT:    s_endpgm
527                                       ptr addrspace(1) %src2,
528                                       ptr addrspace(1) nocapture %dst) {
529entry:
  ; Each work-item picks its own <8 x i4> element (one packed dword in the
  ; generated code) from both source buffers, indexed by workitem.id.x.
530  %idx = call i32 @llvm.amdgcn.workitem.id.x()
531  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
532  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
533  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
534  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
535
  ; Lanes 0..7: extract lane k from both vectors, zero-extend to i16, multiply.
  ; Both operands are at most 15, so each product is at most 225 and the
  ; nuw/nsw flags on the i16 multiplies hold.
536  %v1e0 = extractelement <8 x i4> %vec1, i64 0
537  %cv1e0 = zext i4 %v1e0 to i16
538  %v2e0 = extractelement <8 x i4> %vec2, i64 0
539  %cv2e0 = zext i4 %v2e0 to i16
540  %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0
541
542  %v1e1 = extractelement <8 x i4> %vec1, i64 1
543  %cv1e1 = zext i4 %v1e1 to i16
544  %v2e1 = extractelement <8 x i4> %vec2, i64 1
545  %cv2e1 = zext i4 %v2e1 to i16
546  %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1
547
548  %v1e2 = extractelement <8 x i4> %vec1, i64 2
549  %cv1e2 = zext i4 %v1e2 to i16
550  %v2e2 = extractelement <8 x i4> %vec2, i64 2
551  %cv2e2 = zext i4 %v2e2 to i16
552  %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2
553
554  %v1e3 = extractelement <8 x i4> %vec1, i64 3
555  %cv1e3 = zext i4 %v1e3 to i16
556  %v2e3 = extractelement <8 x i4> %vec2, i64 3
557  %cv2e3 = zext i4 %v2e3 to i16
558  %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3
559
560  %v1e4 = extractelement <8 x i4> %vec1, i64 4
561  %cv1e4 = zext i4 %v1e4 to i16
562  %v2e4 = extractelement <8 x i4> %vec2, i64 4
563  %cv2e4 = zext i4 %v2e4 to i16
564  %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4
565
566  %v1e5 = extractelement <8 x i4> %vec1, i64 5
567  %cv1e5 = zext i4 %v1e5 to i16
568  %v2e5 = extractelement <8 x i4> %vec2, i64 5
569  %cv2e5 = zext i4 %v2e5 to i16
570  %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5
571
572  %v1e6 = extractelement <8 x i4> %vec1, i64 6
573  %cv1e6 = zext i4 %v1e6 to i16
574  %v2e6 = extractelement <8 x i4> %vec2, i64 6
575  %cv2e6 = zext i4 %v2e6 to i16
576  %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6
577
578  %v1e7 = extractelement <8 x i4> %vec1, i64 7
579  %cv1e7 = zext i4 %v1e7 to i16
580  %v2e7 = extractelement <8 x i4> %vec2, i64 7
581  %cv2e7 = zext i4 %v2e7 to i16
582  %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7
583
  ; Seed the sum with the i16 currently stored at %dst, then add the eight
  ; products in lane order (0 through 7). The adds carry no nuw/nsw flags.
  ; Note the 2-byte access is declared with align 4.
584  %acc = load i16, ptr addrspace(1) %dst, align 4
585  %add1 = add i16 %mul0, %acc
586  %add2 = add i16 %add1, %mul1
587  %add3 = add i16 %add2, %mul2
588  %add4 = add i16 %add3, %mul3
589  %add5 = add i16 %add4, %mul4
590  %add6 = add i16 %add5, %mul5
591  %add7 = add i16 %add6, %mul6
592  %add8 = add i16 %add7, %mul7
593
  ; Write the accumulated result back to the location the seed was read from.
594  store i16 %add8, ptr addrspace(1) %dst, align 4
595  ret void
596}
597
598; TODO: Remove the unnecessary instruction (the one zero-extending the
599; 2nd MAD) so that the pattern recognizer can kick in.
600define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
601; GFX7-LABEL: udot8_acc8:
602; GFX7:       ; %bb.0: ; %entry
603; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
604; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
605; GFX7-NEXT:    s_mov_b32 s14, -1
606; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
607; GFX7-NEXT:    s_add_u32 s12, s12, s11
608; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
609; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
610; GFX7-NEXT:    s_mov_b32 s3, 0xf000
611; GFX7-NEXT:    s_mov_b32 s6, 0
612; GFX7-NEXT:    s_mov_b32 s7, s3
613; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
614; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
615; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
616; GFX7-NEXT:    v_mov_b32_e32 v1, 0
617; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
618; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
619; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
620; GFX7-NEXT:    s_mov_b32 s2, -1
621; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
622; GFX7-NEXT:    s_addc_u32 s13, s13, 0
623; GFX7-NEXT:    s_waitcnt vmcnt(2)
624; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 28, v2
625; GFX7-NEXT:    v_bfe_u32 v4, v2, 24, 4
626; GFX7-NEXT:    v_bfe_u32 v5, v2, 20, 4
627; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 4
628; GFX7-NEXT:    v_bfe_u32 v7, v2, 12, 4
629; GFX7-NEXT:    v_bfe_u32 v8, v2, 8, 4
630; GFX7-NEXT:    v_bfe_u32 v9, v2, 4, 4
631; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
632; GFX7-NEXT:    s_waitcnt vmcnt(1)
633; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 28, v0
634; GFX7-NEXT:    v_bfe_u32 v11, v0, 24, 4
635; GFX7-NEXT:    v_bfe_u32 v12, v0, 20, 4
636; GFX7-NEXT:    v_bfe_u32 v13, v0, 16, 4
637; GFX7-NEXT:    v_bfe_u32 v14, v0, 12, 4
638; GFX7-NEXT:    v_bfe_u32 v15, v0, 8, 4
639; GFX7-NEXT:    v_bfe_u32 v16, v0, 4, 4
640; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
641; GFX7-NEXT:    s_waitcnt vmcnt(0)
642; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
643; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
644; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
645; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
646; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
647; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
648; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
649; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
650; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
651; GFX7-NEXT:    s_endpgm
652;
653; GFX8-LABEL: udot8_acc8:
654; GFX8:       ; %bb.0: ; %entry
655; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
656; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
657; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
658; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
659; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
660; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
661; GFX8-NEXT:    v_mov_b32_e32 v1, s1
662; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
663; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
664; GFX8-NEXT:    flat_load_dword v3, v[0:1]
665; GFX8-NEXT:    v_mov_b32_e32 v1, s3
666; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
667; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
668; GFX8-NEXT:    flat_load_dword v2, v[0:1]
669; GFX8-NEXT:    v_mov_b32_e32 v0, s4
670; GFX8-NEXT:    v_mov_b32_e32 v1, s5
671; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
672; GFX8-NEXT:    s_mov_b32 s14, -1
673; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
674; GFX8-NEXT:    s_add_u32 s12, s12, s11
675; GFX8-NEXT:    s_addc_u32 s13, s13, 0
676; GFX8-NEXT:    s_waitcnt vmcnt(2)
677; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
678; GFX8-NEXT:    v_bfe_u32 v6, v3, 24, 4
679; GFX8-NEXT:    v_bfe_u32 v7, v3, 20, 4
680; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 4
681; GFX8-NEXT:    v_bfe_u32 v9, v3, 12, 4
682; GFX8-NEXT:    v_bfe_u32 v10, v3, 8, 4
683; GFX8-NEXT:    v_bfe_u32 v11, v3, 4, 4
684; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
685; GFX8-NEXT:    s_waitcnt vmcnt(1)
686; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
687; GFX8-NEXT:    v_bfe_u32 v13, v2, 24, 4
688; GFX8-NEXT:    v_bfe_u32 v14, v2, 20, 4
689; GFX8-NEXT:    v_bfe_u32 v15, v2, 16, 4
690; GFX8-NEXT:    v_bfe_u32 v16, v2, 12, 4
691; GFX8-NEXT:    v_bfe_u32 v17, v2, 8, 4
692; GFX8-NEXT:    v_bfe_u32 v18, v2, 4, 4
693; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
694; GFX8-NEXT:    s_waitcnt vmcnt(0)
695; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
696; GFX8-NEXT:    v_mad_u16 v2, v11, v18, v2
697; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
698; GFX8-NEXT:    v_mad_u16 v2, v9, v16, v2
699; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
700; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
701; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
702; GFX8-NEXT:    v_mad_u16 v2, v5, v12, v2
703; GFX8-NEXT:    flat_store_byte v[0:1], v2
704; GFX8-NEXT:    s_endpgm
705;
706; GFX9-LABEL: udot8_acc8:
707; GFX9:       ; %bb.0: ; %entry
708; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
709; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
710; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
711; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
712; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
713; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
714; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
715; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
716; GFX9-NEXT:    v_mov_b32_e32 v0, 0
717; GFX9-NEXT:    global_load_ubyte v3, v0, s[6:7]
718; GFX9-NEXT:    s_mov_b32 s14, -1
719; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
720; GFX9-NEXT:    s_add_u32 s12, s12, s11
721; GFX9-NEXT:    s_addc_u32 s13, s13, 0
722; GFX9-NEXT:    s_waitcnt vmcnt(2)
723; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
724; GFX9-NEXT:    v_bfe_u32 v5, v1, 24, 4
725; GFX9-NEXT:    v_bfe_u32 v6, v1, 20, 4
726; GFX9-NEXT:    v_bfe_u32 v7, v1, 16, 4
727; GFX9-NEXT:    v_bfe_u32 v8, v1, 12, 4
728; GFX9-NEXT:    v_bfe_u32 v9, v1, 8, 4
729; GFX9-NEXT:    v_bfe_u32 v10, v1, 4, 4
730; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
731; GFX9-NEXT:    s_waitcnt vmcnt(1)
732; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
733; GFX9-NEXT:    v_bfe_u32 v12, v2, 24, 4
734; GFX9-NEXT:    v_bfe_u32 v13, v2, 20, 4
735; GFX9-NEXT:    v_bfe_u32 v14, v2, 16, 4
736; GFX9-NEXT:    v_bfe_u32 v15, v2, 12, 4
737; GFX9-NEXT:    v_bfe_u32 v16, v2, 8, 4
738; GFX9-NEXT:    v_bfe_u32 v17, v2, 4, 4
739; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
740; GFX9-NEXT:    s_waitcnt vmcnt(0)
741; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
742; GFX9-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
743; GFX9-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
744; GFX9-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
745; GFX9-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
746; GFX9-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
747; GFX9-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
748; GFX9-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
749; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
750; GFX9-NEXT:    s_endpgm
751;
752; GFX9-DL-LABEL: udot8_acc8:
753; GFX9-DL:       ; %bb.0: ; %entry
754; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
755; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
756; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
757; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
758; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
759; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
760; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
761; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
762; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
763; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[6:7]
764; GFX9-DL-NEXT:    s_mov_b32 s14, -1
765; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
766; GFX9-DL-NEXT:    s_add_u32 s12, s12, s11
767; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
768; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
769; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
770; GFX9-DL-NEXT:    v_bfe_u32 v5, v1, 24, 4
771; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 20, 4
772; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 16, 4
773; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 12, 4
774; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
775; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 4, 4
776; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
777; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
778; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
779; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 24, 4
780; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
781; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 16, 4
782; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 12, 4
783; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 8, 4
784; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 4, 4
785; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
786; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
787; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
788; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
789; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
790; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
791; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
792; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
793; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
794; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
795; GFX9-DL-NEXT:    global_store_byte v0, v1, s[6:7]
796; GFX9-DL-NEXT:    s_endpgm
797;
798; GFX10-DL-LABEL: udot8_acc8:
799; GFX10-DL:       ; %bb.0: ; %entry
800; GFX10-DL-NEXT:    s_clause 0x1
801; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
802; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
803; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
804; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
805; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
806; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
807; GFX10-DL-NEXT:    s_mov_b32 s14, -1
808; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
809; GFX10-DL-NEXT:    s_add_u32 s12, s12, s11
810; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
811; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
812; GFX10-DL-NEXT:    s_clause 0x1
813; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
814; GFX10-DL-NEXT:    global_load_dword v3, v0, s[2:3]
815; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[6:7]
816; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
817; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
818; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
819; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
820; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
821; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
822; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
823; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
824; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
825; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
826; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
827; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
828; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
829; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
830; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
831; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
832; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
833; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
834; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
835; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
836; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 24, 4
837; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 24, 4
838; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
839; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
840; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
841; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
842; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
843; GFX10-DL-NEXT:    global_store_byte v1, v0, s[6:7]
844; GFX10-DL-NEXT:    s_endpgm
845                                      ptr addrspace(1) %src2,
846                                      ptr addrspace(1) nocapture %dst) {
847entry:
848  %idx = call i32 @llvm.amdgcn.workitem.id.x()
849  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
850  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
851  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
852  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
853
854  %v1e0 = extractelement <8 x i4> %vec1, i64 0
855  %cv1e0 = zext i4 %v1e0 to i8
856  %v2e0 = extractelement <8 x i4> %vec2, i64 0
857  %cv2e0 = zext i4 %v2e0 to i8
858  %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0
859
860  %v1e1 = extractelement <8 x i4> %vec1, i64 1
861  %cv1e1 = zext i4 %v1e1 to i8
862  %v2e1 = extractelement <8 x i4> %vec2, i64 1
863  %cv2e1 = zext i4 %v2e1 to i8
864  %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1
865
866  %v1e2 = extractelement <8 x i4> %vec1, i64 2
867  %cv1e2 = zext i4 %v1e2 to i8
868  %v2e2 = extractelement <8 x i4> %vec2, i64 2
869  %cv2e2 = zext i4 %v2e2 to i8
870  %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2
871
872  %v1e3 = extractelement <8 x i4> %vec1, i64 3
873  %cv1e3 = zext i4 %v1e3 to i8
874  %v2e3 = extractelement <8 x i4> %vec2, i64 3
875  %cv2e3 = zext i4 %v2e3 to i8
876  %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3
877
878  %v1e4 = extractelement <8 x i4> %vec1, i64 4
879  %cv1e4 = zext i4 %v1e4 to i8
880  %v2e4 = extractelement <8 x i4> %vec2, i64 4
881  %cv2e4 = zext i4 %v2e4 to i8
882  %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4
883
884  %v1e5 = extractelement <8 x i4> %vec1, i64 5
885  %cv1e5 = zext i4 %v1e5 to i8
886  %v2e5 = extractelement <8 x i4> %vec2, i64 5
887  %cv2e5 = zext i4 %v2e5 to i8
888  %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5
889
890  %v1e6 = extractelement <8 x i4> %vec1, i64 6
891  %cv1e6 = zext i4 %v1e6 to i8
892  %v2e6 = extractelement <8 x i4> %vec2, i64 6
893  %cv2e6 = zext i4 %v2e6 to i8
894  %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6
895
896  %v1e7 = extractelement <8 x i4> %vec1, i64 7
897  %cv1e7 = zext i4 %v1e7 to i8
898  %v2e7 = extractelement <8 x i4> %vec2, i64 7
899  %cv2e7 = zext i4 %v2e7 to i8
900  %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7
901
902  %acc = load i8, ptr addrspace(1) %dst, align 4
903  %add1 = add i8 %mul0, %acc
904  %add2 = add i8 %add1, %mul1
905  %add3 = add i8 %add2, %mul2
906  %add4 = add i8 %add3, %mul3
907  %add5 = add i8 %add4, %mul4
908  %add6 = add i8 %add5, %mul5
909  %add7 = add i8 %add6, %mul6
910  %add8 = add i8 %add7, %mul7
911
912  store i8 %add8, ptr addrspace(1) %dst, align 4
913  ret void
914}
915
916; TODO: Remove the two unnecessary instructions (and + add after the 2nd MAD)
917; so that the pattern recognizer can kick in.
;
; udot8_acc4: each work-item loads one <8 x i4> vector from %src1 and one from
; %src2 (both indexed by workitem.id.x), multiplies them element-wise in i4,
; adds the eight products to the i4 value loaded from %dst, and stores the i4
; sum back to %dst.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
; For every prefix they currently expect a chain of eight MAD instructions
; followed by a mask with 15, not a single dot instruction.
918define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
919; GFX7-LABEL: udot8_acc4:
920; GFX7:       ; %bb.0: ; %entry
921; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
922; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
923; GFX7-NEXT:    s_mov_b32 s14, -1
924; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
925; GFX7-NEXT:    s_add_u32 s12, s12, s11
926; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
927; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
928; GFX7-NEXT:    s_mov_b32 s3, 0xf000
929; GFX7-NEXT:    s_mov_b32 s6, 0
930; GFX7-NEXT:    s_mov_b32 s7, s3
931; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
932; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
933; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
934; GFX7-NEXT:    v_mov_b32_e32 v1, 0
935; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
936; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
937; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
938; GFX7-NEXT:    s_mov_b32 s2, -1
939; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
940; GFX7-NEXT:    s_addc_u32 s13, s13, 0
941; GFX7-NEXT:    s_waitcnt vmcnt(2)
942; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 28, v2
943; GFX7-NEXT:    v_bfe_u32 v4, v2, 24, 4
944; GFX7-NEXT:    v_bfe_u32 v5, v2, 20, 4
945; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 4
946; GFX7-NEXT:    v_bfe_u32 v7, v2, 12, 4
947; GFX7-NEXT:    v_bfe_u32 v8, v2, 8, 4
948; GFX7-NEXT:    v_bfe_u32 v9, v2, 4, 4
949; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
950; GFX7-NEXT:    s_waitcnt vmcnt(1)
951; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 28, v0
952; GFX7-NEXT:    v_bfe_u32 v11, v0, 24, 4
953; GFX7-NEXT:    v_bfe_u32 v12, v0, 20, 4
954; GFX7-NEXT:    v_bfe_u32 v13, v0, 16, 4
955; GFX7-NEXT:    v_bfe_u32 v14, v0, 12, 4
956; GFX7-NEXT:    v_bfe_u32 v15, v0, 8, 4
957; GFX7-NEXT:    v_bfe_u32 v16, v0, 4, 4
958; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
959; GFX7-NEXT:    s_waitcnt vmcnt(0)
960; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
961; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
962; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
963; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
964; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
965; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
966; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
967; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
968; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
969; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
970; GFX7-NEXT:    s_endpgm
971;
972; GFX8-LABEL: udot8_acc4:
973; GFX8:       ; %bb.0: ; %entry
974; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
975; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
976; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
977; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
978; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
979; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
980; GFX8-NEXT:    v_mov_b32_e32 v1, s1
981; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
982; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
983; GFX8-NEXT:    flat_load_dword v3, v[0:1]
984; GFX8-NEXT:    v_mov_b32_e32 v1, s3
985; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
986; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
987; GFX8-NEXT:    flat_load_dword v2, v[0:1]
988; GFX8-NEXT:    v_mov_b32_e32 v0, s4
989; GFX8-NEXT:    v_mov_b32_e32 v1, s5
990; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
991; GFX8-NEXT:    s_mov_b32 s14, -1
992; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
993; GFX8-NEXT:    s_add_u32 s12, s12, s11
994; GFX8-NEXT:    s_addc_u32 s13, s13, 0
995; GFX8-NEXT:    s_waitcnt vmcnt(2)
996; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
997; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
998; GFX8-NEXT:    v_bfe_u32 v7, v3, 20, 4
999; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 4
1000; GFX8-NEXT:    v_bfe_u32 v9, v3, 12, 4
1001; GFX8-NEXT:    v_bfe_u32 v10, v3, 8, 4
1002; GFX8-NEXT:    v_bfe_u32 v11, v3, 4, 4
1003; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
1004; GFX8-NEXT:    s_waitcnt vmcnt(1)
1005; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
1006; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
1007; GFX8-NEXT:    v_bfe_u32 v14, v2, 20, 4
1008; GFX8-NEXT:    v_bfe_u32 v15, v2, 16, 4
1009; GFX8-NEXT:    v_bfe_u32 v16, v2, 12, 4
1010; GFX8-NEXT:    v_bfe_u32 v17, v2, 8, 4
1011; GFX8-NEXT:    v_bfe_u32 v18, v2, 4, 4
1012; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
1013; GFX8-NEXT:    s_waitcnt vmcnt(0)
1014; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
1015; GFX8-NEXT:    v_mad_u16 v2, v11, v18, v2
1016; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
1017; GFX8-NEXT:    v_mad_u16 v2, v9, v16, v2
1018; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
1019; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
1020; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
1021; GFX8-NEXT:    v_mad_u16 v2, v5, v12, v2
1022; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
1023; GFX8-NEXT:    flat_store_byte v[0:1], v2
1024; GFX8-NEXT:    s_endpgm
1025;
1026; GFX9-LABEL: udot8_acc4:
1027; GFX9:       ; %bb.0: ; %entry
1028; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1029; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1030; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1031; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1032; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1033; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1034; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
1035; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
1036; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1037; GFX9-NEXT:    global_load_ubyte v3, v0, s[6:7]
1038; GFX9-NEXT:    s_mov_b32 s14, -1
1039; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
1040; GFX9-NEXT:    s_add_u32 s12, s12, s11
1041; GFX9-NEXT:    s_addc_u32 s13, s13, 0
1042; GFX9-NEXT:    s_waitcnt vmcnt(2)
1043; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
1044; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
1045; GFX9-NEXT:    v_bfe_u32 v6, v1, 20, 4
1046; GFX9-NEXT:    v_bfe_u32 v7, v1, 16, 4
1047; GFX9-NEXT:    v_bfe_u32 v8, v1, 12, 4
1048; GFX9-NEXT:    v_bfe_u32 v9, v1, 8, 4
1049; GFX9-NEXT:    v_bfe_u32 v10, v1, 4, 4
1050; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
1051; GFX9-NEXT:    s_waitcnt vmcnt(1)
1052; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
1053; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
1054; GFX9-NEXT:    v_bfe_u32 v13, v2, 20, 4
1055; GFX9-NEXT:    v_bfe_u32 v14, v2, 16, 4
1056; GFX9-NEXT:    v_bfe_u32 v15, v2, 12, 4
1057; GFX9-NEXT:    v_bfe_u32 v16, v2, 8, 4
1058; GFX9-NEXT:    v_bfe_u32 v17, v2, 4, 4
1059; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
1060; GFX9-NEXT:    s_waitcnt vmcnt(0)
1061; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1062; GFX9-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
1063; GFX9-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
1064; GFX9-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
1065; GFX9-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
1066; GFX9-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
1067; GFX9-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
1068; GFX9-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
1069; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
1070; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
1071; GFX9-NEXT:    s_endpgm
1072;
1073; GFX9-DL-LABEL: udot8_acc4:
1074; GFX9-DL:       ; %bb.0: ; %entry
1075; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1076; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1077; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1078; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1079; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1080; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1081; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1082; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1083; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1084; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[6:7]
1085; GFX9-DL-NEXT:    s_mov_b32 s14, -1
1086; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
1087; GFX9-DL-NEXT:    s_add_u32 s12, s12, s11
1088; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
1089; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
1090; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
1091; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
1092; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 20, 4
1093; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 16, 4
1094; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 12, 4
1095; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
1096; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 4, 4
1097; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
1098; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1099; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
1100; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
1101; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
1102; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 16, 4
1103; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 12, 4
1104; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 8, 4
1105; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 4, 4
1106; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
1107; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1108; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1109; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
1110; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
1111; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
1112; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
1113; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
1114; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
1115; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
1116; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
1117; GFX9-DL-NEXT:    global_store_byte v0, v1, s[6:7]
1118; GFX9-DL-NEXT:    s_endpgm
1119;
1120; GFX10-DL-LABEL: udot8_acc4:
1121; GFX10-DL:       ; %bb.0: ; %entry
1122; GFX10-DL-NEXT:    s_clause 0x1
1123; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1124; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1125; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1126; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
1127; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1128; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1129; GFX10-DL-NEXT:    s_mov_b32 s14, -1
1130; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
1131; GFX10-DL-NEXT:    s_add_u32 s12, s12, s11
1132; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
1133; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1134; GFX10-DL-NEXT:    s_clause 0x1
1135; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
1136; GFX10-DL-NEXT:    global_load_dword v3, v0, s[2:3]
1137; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[6:7]
1138; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
1139; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
1140; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1141; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
1142; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
1143; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
1144; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1145; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
1146; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
1147; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
1148; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
1149; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
1150; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
1151; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
1152; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
1153; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
1154; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
1155; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
1156; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
1157; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
1158; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
1159; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v3
1160; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
1161; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
1162; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
1163; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
1164; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
1165; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v0
1166; GFX10-DL-NEXT:    global_store_byte v1, v0, s[6:7]
1167; GFX10-DL-NEXT:    s_endpgm
1168                                      ptr addrspace(1) %src2,
1169                                      ptr addrspace(1) nocapture %dst) {
1170entry:
  ; Each work-item picks the <8 x i4> element at index workitem.id.x from
  ; both source buffers.
1171  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1172  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
1173  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
1174  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
1175  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
1176
  ; Lane-wise products %mul0..%mul7. The multiplies operate directly on the
  ; i4 elements (no extension first) and carry the nuw/nsw flags.
1177  %v1e0 = extractelement <8 x i4> %vec1, i64 0
1178  %v2e0 = extractelement <8 x i4> %vec2, i64 0
1179  %mul0 = mul nuw nsw i4 %v1e0, %v2e0
1180
1181  %v1e1 = extractelement <8 x i4> %vec1, i64 1
1182  %v2e1 = extractelement <8 x i4> %vec2, i64 1
1183  %mul1 = mul nuw nsw i4 %v1e1, %v2e1
1184
1185  %v1e2 = extractelement <8 x i4> %vec1, i64 2
1186  %v2e2 = extractelement <8 x i4> %vec2, i64 2
1187  %mul2 = mul nuw nsw i4 %v1e2, %v2e2
1188
1189  %v1e3 = extractelement <8 x i4> %vec1, i64 3
1190  %v2e3 = extractelement <8 x i4> %vec2, i64 3
1191  %mul3 = mul nuw nsw i4 %v1e3, %v2e3
1192
1193  %v1e4 = extractelement <8 x i4> %vec1, i64 4
1194  %v2e4 = extractelement <8 x i4> %vec2, i64 4
1195  %mul4 = mul nuw nsw i4 %v1e4, %v2e4
1196
1197  %v1e5 = extractelement <8 x i4> %vec1, i64 5
1198  %v2e5 = extractelement <8 x i4> %vec2, i64 5
1199  %mul5 = mul nuw nsw i4 %v1e5, %v2e5
1200
1201  %v1e6 = extractelement <8 x i4> %vec1, i64 6
1202  %v2e6 = extractelement <8 x i4> %vec2, i64 6
1203  %mul6 = mul nuw nsw i4 %v1e6, %v2e6
1204
1205  %v1e7 = extractelement <8 x i4> %vec1, i64 7
1206  %v2e7 = extractelement <8 x i4> %vec2, i64 7
1207  %mul7 = mul nuw nsw i4 %v1e7, %v2e7
1208
  ; Start from the i4 accumulator stored at %dst and add the products in
  ; lane order (lane 0 first); these adds carry no wrap flags.
1209  %acc = load i4, ptr addrspace(1) %dst, align 4
1210  %add1 = add i4 %mul0, %acc
1211  %add2 = add i4 %add1, %mul1
1212  %add3 = add i4 %add2, %mul2
1213  %add4 = add i4 %add3, %mul3
1214  %add5 = add i4 %add4, %mul4
1215  %add6 = add i4 %add5, %mul5
1216  %add7 = add i4 %add6, %mul6
1217  %add8 = add i4 %add7, %mul7
1218
  ; Write the i4 result back over the accumulator.
1219  store i4 %add8, ptr addrspace(1) %dst, align 4
1220  ret void
1221}
1222
1223; TODO: Currently, permutation of udot8 is turned off due to a huge increase
1224; in the compile time.
1225define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
1226; GFX7-LABEL: udot8_CommutationInsideMAD:
1227; GFX7:       ; %bb.0: ; %entry
1228; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1229; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1230; GFX7-NEXT:    s_mov_b32 s14, -1
1231; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
1232; GFX7-NEXT:    s_add_u32 s12, s12, s11
1233; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
1234; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
1235; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1236; GFX7-NEXT:    s_mov_b32 s6, 0
1237; GFX7-NEXT:    s_mov_b32 s7, s3
1238; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1239; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
1240; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1241; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1242; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1243; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
1244; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1245; GFX7-NEXT:    s_mov_b32 s2, -1
1246; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
1247; GFX7-NEXT:    s_addc_u32 s13, s13, 0
1248; GFX7-NEXT:    s_waitcnt vmcnt(2)
1249; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 28, v2
1250; GFX7-NEXT:    v_bfe_u32 v4, v2, 24, 4
1251; GFX7-NEXT:    v_bfe_u32 v5, v2, 20, 4
1252; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 4
1253; GFX7-NEXT:    v_bfe_u32 v7, v2, 12, 4
1254; GFX7-NEXT:    v_bfe_u32 v8, v2, 8, 4
1255; GFX7-NEXT:    v_bfe_u32 v9, v2, 4, 4
1256; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
1257; GFX7-NEXT:    s_waitcnt vmcnt(1)
1258; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 28, v0
1259; GFX7-NEXT:    v_bfe_u32 v11, v0, 24, 4
1260; GFX7-NEXT:    v_bfe_u32 v12, v0, 20, 4
1261; GFX7-NEXT:    v_bfe_u32 v13, v0, 16, 4
1262; GFX7-NEXT:    v_bfe_u32 v14, v0, 12, 4
1263; GFX7-NEXT:    v_bfe_u32 v15, v0, 8, 4
1264; GFX7-NEXT:    v_bfe_u32 v16, v0, 4, 4
1265; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
1266; GFX7-NEXT:    s_waitcnt vmcnt(0)
1267; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
1268; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
1269; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
1270; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
1271; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
1272; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
1273; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
1274; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
1275; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
1276; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1277; GFX7-NEXT:    s_endpgm
1278;
1279; GFX8-LABEL: udot8_CommutationInsideMAD:
1280; GFX8:       ; %bb.0: ; %entry
1281; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1282; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1283; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1284; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1285; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1286; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1287; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1288; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1289; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1290; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1291; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1292; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1293; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1294; GFX8-NEXT:    flat_load_dword v2, v[0:1]
1295; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1296; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1297; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
1298; GFX8-NEXT:    s_mov_b32 s14, -1
1299; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
1300; GFX8-NEXT:    s_add_u32 s12, s12, s11
1301; GFX8-NEXT:    s_addc_u32 s13, s13, 0
1302; GFX8-NEXT:    s_waitcnt vmcnt(2)
1303; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
1304; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
1305; GFX8-NEXT:    v_bfe_u32 v7, v3, 20, 4
1306; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 4
1307; GFX8-NEXT:    v_bfe_u32 v9, v3, 12, 4
1308; GFX8-NEXT:    v_bfe_u32 v10, v3, 8, 4
1309; GFX8-NEXT:    v_bfe_u32 v11, v3, 4, 4
1310; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
1311; GFX8-NEXT:    s_waitcnt vmcnt(1)
1312; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
1313; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
1314; GFX8-NEXT:    v_bfe_u32 v14, v2, 20, 4
1315; GFX8-NEXT:    v_bfe_u32 v15, v2, 16, 4
1316; GFX8-NEXT:    v_bfe_u32 v16, v2, 12, 4
1317; GFX8-NEXT:    v_bfe_u32 v17, v2, 8, 4
1318; GFX8-NEXT:    v_bfe_u32 v18, v2, 4, 4
1319; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
1320; GFX8-NEXT:    s_waitcnt vmcnt(0)
1321; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
1322; GFX8-NEXT:    v_mad_u16 v2, v11, v18, v2
1323; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
1324; GFX8-NEXT:    v_mad_u16 v2, v9, v16, v2
1325; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
1326; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
1327; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
1328; GFX8-NEXT:    v_mad_u16 v2, v5, v12, v2
1329; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
1330; GFX8-NEXT:    flat_store_byte v[0:1], v2
1331; GFX8-NEXT:    s_endpgm
1332;
1333; GFX9-LABEL: udot8_CommutationInsideMAD:
1334; GFX9:       ; %bb.0: ; %entry
1335; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1336; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1337; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1338; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1339; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1340; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1341; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
1342; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
1343; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1344; GFX9-NEXT:    global_load_ubyte v3, v0, s[6:7]
1345; GFX9-NEXT:    s_mov_b32 s14, -1
1346; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
1347; GFX9-NEXT:    s_add_u32 s12, s12, s11
1348; GFX9-NEXT:    s_addc_u32 s13, s13, 0
1349; GFX9-NEXT:    s_waitcnt vmcnt(2)
1350; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
1351; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
1352; GFX9-NEXT:    v_bfe_u32 v6, v1, 20, 4
1353; GFX9-NEXT:    v_bfe_u32 v7, v1, 16, 4
1354; GFX9-NEXT:    v_bfe_u32 v8, v1, 12, 4
1355; GFX9-NEXT:    v_bfe_u32 v9, v1, 8, 4
1356; GFX9-NEXT:    v_bfe_u32 v10, v1, 4, 4
1357; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
1358; GFX9-NEXT:    s_waitcnt vmcnt(1)
1359; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
1360; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
1361; GFX9-NEXT:    v_bfe_u32 v13, v2, 20, 4
1362; GFX9-NEXT:    v_bfe_u32 v14, v2, 16, 4
1363; GFX9-NEXT:    v_bfe_u32 v15, v2, 12, 4
1364; GFX9-NEXT:    v_bfe_u32 v16, v2, 8, 4
1365; GFX9-NEXT:    v_bfe_u32 v17, v2, 4, 4
1366; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
1367; GFX9-NEXT:    s_waitcnt vmcnt(0)
1368; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1369; GFX9-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
1370; GFX9-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
1371; GFX9-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
1372; GFX9-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
1373; GFX9-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
1374; GFX9-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
1375; GFX9-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
1376; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
1377; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
1378; GFX9-NEXT:    s_endpgm
1379;
1380; GFX9-DL-LABEL: udot8_CommutationInsideMAD:
1381; GFX9-DL:       ; %bb.0: ; %entry
1382; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1383; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1384; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1385; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1386; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1387; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1388; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1389; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1390; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1391; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[6:7]
1392; GFX9-DL-NEXT:    s_mov_b32 s14, -1
1393; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
1394; GFX9-DL-NEXT:    s_add_u32 s12, s12, s11
1395; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
1396; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
1397; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
1398; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
1399; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 20, 4
1400; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 16, 4
1401; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 12, 4
1402; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
1403; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 4, 4
1404; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
1405; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1406; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
1407; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
1408; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
1409; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 16, 4
1410; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 12, 4
1411; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 8, 4
1412; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 4, 4
1413; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
1414; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1415; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1416; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
1417; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
1418; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
1419; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
1420; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
1421; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
1422; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
1423; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
1424; GFX9-DL-NEXT:    global_store_byte v0, v1, s[6:7]
1425; GFX9-DL-NEXT:    s_endpgm
1426;
1427; GFX10-DL-LABEL: udot8_CommutationInsideMAD:
1428; GFX10-DL:       ; %bb.0: ; %entry
1429; GFX10-DL-NEXT:    s_clause 0x1
1430; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1431; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1432; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1433; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
1434; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1435; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1436; GFX10-DL-NEXT:    s_mov_b32 s14, -1
1437; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
1438; GFX10-DL-NEXT:    s_add_u32 s12, s12, s11
1439; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
1440; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1441; GFX10-DL-NEXT:    s_clause 0x1
1442; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
1443; GFX10-DL-NEXT:    global_load_dword v3, v0, s[2:3]
1444; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[6:7]
1445; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
1446; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
1447; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1448; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
1449; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
1450; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
1451; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1452; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
1453; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
1454; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
1455; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
1456; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
1457; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
1458; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
1459; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
1460; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
1461; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
1462; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
1463; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
1464; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
1465; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
1466; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v3
1467; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
1468; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
1469; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
1470; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
1471; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
1472; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v0
1473; GFX10-DL-NEXT:    global_store_byte v1, v0, s[6:7]
1474; GFX10-DL-NEXT:    s_endpgm
1475                                                      ptr addrspace(1) %src2,
1476                                                      ptr addrspace(1) nocapture %dst) {
1477entry:
1478  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1479  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
1480  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
1481  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
1482  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
1483
1484  %v1e0 = extractelement <8 x i4> %vec1, i64 0
1485  %v2e0 = extractelement <8 x i4> %vec2, i64 0
1486  %mul0 = mul nuw nsw i4 %v1e0, %v2e0
1487
1488  %v1e1 = extractelement <8 x i4> %vec1, i64 1
1489  %v2e1 = extractelement <8 x i4> %vec2, i64 1
1490  %mul1 = mul nuw nsw i4 %v1e1, %v2e1
1491
1492  %v1e2 = extractelement <8 x i4> %vec1, i64 2
1493  %v2e2 = extractelement <8 x i4> %vec2, i64 2
1494  %mul2 = mul nuw nsw i4 %v1e2, %v2e2
1495
1496  %v1e3 = extractelement <8 x i4> %vec1, i64 3
1497  %v2e3 = extractelement <8 x i4> %vec2, i64 3
1498  %mul3 = mul nuw nsw i4 %v1e3, %v2e3
1499
1500  %v1e4 = extractelement <8 x i4> %vec1, i64 4
1501  %v2e4 = extractelement <8 x i4> %vec2, i64 4
1502  %mul4 = mul nuw nsw i4 %v1e4, %v2e4
1503
1504  %v1e5 = extractelement <8 x i4> %vec1, i64 5
1505  %v2e5 = extractelement <8 x i4> %vec2, i64 5
1506  %mul5 = mul nuw nsw i4 %v1e5, %v2e5
1507
1508  %v1e6 = extractelement <8 x i4> %vec1, i64 6
1509  %v2e6 = extractelement <8 x i4> %vec2, i64 6
1510  %mul6 = mul nuw nsw i4 %v1e6, %v2e6
1511
1512  %v1e7 = extractelement <8 x i4> %vec1, i64 7
1513  %v2e7 = extractelement <8 x i4> %vec2, i64 7
1514  %mul7 = mul nuw nsw i4 %v1e7, %v2e7
1515
1516  %acc = load i4, ptr addrspace(1) %dst, align 4
1517  %add1 = add i4 %mul0, %acc
1518  %add2 = add i4 %mul1, %add1
1519  %add3 = add i4 %mul2, %add2
1520  %add4 = add i4 %mul3, %add3
1521  %add5 = add i4 %mul4, %add4
1522  %add6 = add i4 %mul5, %add5
1523  %add7 = add i4 %mul6, %add6
1524  %add8 = add i4 %mul7, %add7
1525
1526  store i4 %add8, ptr addrspace(1) %dst, align 4
1527  ret void
1528}
1529
; udot8_multiuses_mul1: unsigned 8 x i4 dot product accumulated into an i32
; loaded from %dst, where the first product (%mul0) and the first partial sum
; (%add1) each have more than one use.
; The assertions below show every run line, including the GFX9-DL and GFX10-DL
; ones, expanding this into per-element bit-field extracts followed by
; mul/mad/add sequences; no v_dot8_u32_u4 appears in any of them.
1530define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
1531; GFX7-LABEL: udot8_multiuses_mul1:
1532; GFX7:       ; %bb.0: ; %entry
1533; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1534; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1535; GFX7-NEXT:    s_mov_b32 s14, -1
1536; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
1537; GFX7-NEXT:    s_add_u32 s12, s12, s11
1538; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
1539; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
1540; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1541; GFX7-NEXT:    s_mov_b32 s6, 0
1542; GFX7-NEXT:    s_mov_b32 s7, s3
1543; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1544; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
1545; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1546; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1547; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1548; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
1549; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1550; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1551; GFX7-NEXT:    s_mov_b32 s2, -1
1552; GFX7-NEXT:    s_addc_u32 s13, s13, 0
1553; GFX7-NEXT:    s_waitcnt vmcnt(1)
1554; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 28, v2
1555; GFX7-NEXT:    v_bfe_u32 v3, v2, 24, 4
1556; GFX7-NEXT:    v_bfe_u32 v4, v2, 20, 4
1557; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 4
1558; GFX7-NEXT:    v_bfe_u32 v6, v2, 12, 4
1559; GFX7-NEXT:    v_bfe_u32 v7, v2, 8, 4
1560; GFX7-NEXT:    v_bfe_u32 v8, v2, 4, 4
1561; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
1562; GFX7-NEXT:    s_waitcnt vmcnt(0)
1563; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
1564; GFX7-NEXT:    v_bfe_u32 v10, v0, 24, 4
1565; GFX7-NEXT:    v_bfe_u32 v11, v0, 20, 4
1566; GFX7-NEXT:    v_bfe_u32 v12, v0, 16, 4
1567; GFX7-NEXT:    v_bfe_u32 v13, v0, 12, 4
1568; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 4
1569; GFX7-NEXT:    v_bfe_u32 v15, v0, 4, 4
1570; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
1571; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1572; GFX7-NEXT:    v_mad_u32_u24 v16, v2, v0, s4
1573; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v16
1574; GFX7-NEXT:    v_mad_u32_u24 v2, v8, v15, v16
1575; GFX7-NEXT:    v_mad_u32_u24 v2, v7, v14, v2
1576; GFX7-NEXT:    v_mad_u32_u24 v2, v6, v13, v2
1577; GFX7-NEXT:    v_mad_u32_u24 v2, v5, v12, v2
1578; GFX7-NEXT:    v_mad_u32_u24 v2, v4, v11, v2
1579; GFX7-NEXT:    v_mad_u32_u24 v2, v3, v10, v2
1580; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v9, v2
1581; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1582; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1583; GFX7-NEXT:    s_endpgm
1584;
1585; GFX8-LABEL: udot8_multiuses_mul1:
1586; GFX8:       ; %bb.0: ; %entry
1587; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1588; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1589; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1590; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1591; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1592; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1593; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1594; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1595; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1596; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1597; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1598; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1599; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1600; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1601; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
1602; GFX8-NEXT:    s_mov_b32 s14, -1
1603; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
1604; GFX8-NEXT:    s_add_u32 s12, s12, s11
1605; GFX8-NEXT:    s_addc_u32 s13, s13, 0
1606; GFX8-NEXT:    s_waitcnt vmcnt(1)
1607; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 28, v3
1608; GFX8-NEXT:    v_bfe_u32 v2, v3, 24, 4
1609; GFX8-NEXT:    v_bfe_u32 v4, v3, 20, 4
1610; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 4
1611; GFX8-NEXT:    v_bfe_u32 v6, v3, 12, 4
1612; GFX8-NEXT:    v_bfe_u32 v7, v3, 8, 4
1613; GFX8-NEXT:    v_bfe_u32 v8, v3, 4, 4
1614; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
1615; GFX8-NEXT:    s_waitcnt vmcnt(0)
1616; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
1617; GFX8-NEXT:    v_bfe_u32 v10, v0, 24, 4
1618; GFX8-NEXT:    v_bfe_u32 v11, v0, 20, 4
1619; GFX8-NEXT:    v_bfe_u32 v12, v0, 16, 4
1620; GFX8-NEXT:    v_bfe_u32 v13, v0, 12, 4
1621; GFX8-NEXT:    v_bfe_u32 v14, v0, 8, 4
1622; GFX8-NEXT:    v_bfe_u32 v15, v0, 4, 4
1623; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
1624; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1625; GFX8-NEXT:    v_mad_u32_u24 v16, v3, v0, s0
1626; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v0, v16
1627; GFX8-NEXT:    v_mad_u32_u24 v3, v8, v15, v16
1628; GFX8-NEXT:    v_mad_u32_u24 v3, v7, v14, v3
1629; GFX8-NEXT:    v_mad_u32_u24 v3, v6, v13, v3
1630; GFX8-NEXT:    v_mad_u32_u24 v3, v5, v12, v3
1631; GFX8-NEXT:    v_mad_u32_u24 v3, v4, v11, v3
1632; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v10, v3
1633; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v9, v2
1634; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v1
1635; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1636; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1637; GFX8-NEXT:    flat_store_dword v[0:1], v2
1638; GFX8-NEXT:    s_endpgm
1639;
1640; GFX9-LABEL: udot8_multiuses_mul1:
1641; GFX9:       ; %bb.0: ; %entry
1642; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1643; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1644; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1645; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1646; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1647; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1648; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
1649; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
1650; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
1651; GFX9-NEXT:    s_mov_b32 s14, -1
1652; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
1653; GFX9-NEXT:    s_add_u32 s12, s12, s11
1654; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1655; GFX9-NEXT:    s_addc_u32 s13, s13, 0
1656; GFX9-NEXT:    s_waitcnt vmcnt(1)
1657; GFX9-NEXT:    v_bfe_u32 v3, v1, 4, 4
1658; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
1659; GFX9-NEXT:    v_bfe_u32 v5, v1, 24, 4
1660; GFX9-NEXT:    v_bfe_u32 v6, v1, 20, 4
1661; GFX9-NEXT:    v_bfe_u32 v7, v1, 16, 4
1662; GFX9-NEXT:    v_bfe_u32 v8, v1, 12, 4
1663; GFX9-NEXT:    v_bfe_u32 v9, v1, 8, 4
1664; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
1665; GFX9-NEXT:    s_waitcnt vmcnt(0)
1666; GFX9-NEXT:    v_bfe_u32 v10, v2, 4, 4
1667; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
1668; GFX9-NEXT:    v_bfe_u32 v12, v2, 24, 4
1669; GFX9-NEXT:    v_bfe_u32 v13, v2, 20, 4
1670; GFX9-NEXT:    v_bfe_u32 v14, v2, 16, 4
1671; GFX9-NEXT:    v_bfe_u32 v15, v2, 12, 4
1672; GFX9-NEXT:    v_bfe_u32 v16, v2, 8, 4
1673; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
1674; GFX9-NEXT:    v_mul_u32_u24_e32 v17, v1, v2
1675; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1676; GFX9-NEXT:    v_mad_u32_u24 v1, v1, v2, s0
1677; GFX9-NEXT:    v_mul_u32_u24_e32 v9, v9, v16
1678; GFX9-NEXT:    v_mul_u32_u24_e32 v8, v8, v15
1679; GFX9-NEXT:    v_mad_u32_u24 v2, v3, v10, v1
1680; GFX9-NEXT:    v_mul_u32_u24_e32 v7, v7, v14
1681; GFX9-NEXT:    v_mul_u32_u24_e32 v6, v6, v13
1682; GFX9-NEXT:    v_add3_u32 v2, v2, v9, v8
1683; GFX9-NEXT:    v_mul_u32_u24_e32 v5, v5, v12
1684; GFX9-NEXT:    v_mul_u32_u24_e32 v4, v4, v11
1685; GFX9-NEXT:    v_add3_u32 v2, v2, v7, v6
1686; GFX9-NEXT:    v_add3_u32 v2, v2, v5, v4
1687; GFX9-NEXT:    v_add3_u32 v1, v17, v1, v2
1688; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
1689; GFX9-NEXT:    s_endpgm
1690;
1691; GFX9-DL-LABEL: udot8_multiuses_mul1:
1692; GFX9-DL:       ; %bb.0: ; %entry
1693; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1694; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1695; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1696; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1697; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1698; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1699; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1700; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1701; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1702; GFX9-DL-NEXT:    s_mov_b32 s14, -1
1703; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
1704; GFX9-DL-NEXT:    s_add_u32 s12, s12, s11
1705; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1706; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
1707; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1708; GFX9-DL-NEXT:    v_bfe_u32 v3, v1, 4, 4
1709; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
1710; GFX9-DL-NEXT:    v_bfe_u32 v5, v1, 24, 4
1711; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 20, 4
1712; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 16, 4
1713; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 12, 4
1714; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
1715; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
1716; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1717; GFX9-DL-NEXT:    v_bfe_u32 v10, v2, 4, 4
1718; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
1719; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 24, 4
1720; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
1721; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 16, 4
1722; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 12, 4
1723; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 8, 4
1724; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
1725; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v17, v1, v2
1726; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1727; GFX9-DL-NEXT:    v_mad_u32_u24 v1, v1, v2, s0
1728; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v9, v9, v16
1729; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v8, v8, v15
1730; GFX9-DL-NEXT:    v_mad_u32_u24 v2, v3, v10, v1
1731; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v7, v7, v14
1732; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v6, v6, v13
1733; GFX9-DL-NEXT:    v_add3_u32 v2, v2, v9, v8
1734; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v5, v5, v12
1735; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v4, v4, v11
1736; GFX9-DL-NEXT:    v_add3_u32 v2, v2, v7, v6
1737; GFX9-DL-NEXT:    v_add3_u32 v2, v2, v5, v4
1738; GFX9-DL-NEXT:    v_add3_u32 v1, v17, v1, v2
1739; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
1740; GFX9-DL-NEXT:    s_endpgm
1741;
1742; GFX10-DL-LABEL: udot8_multiuses_mul1:
1743; GFX10-DL:       ; %bb.0: ; %entry
1744; GFX10-DL-NEXT:    s_clause 0x1
1745; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1746; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1747; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1748; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1749; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1750; GFX10-DL-NEXT:    s_mov_b32 s14, -1
1751; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
1752; GFX10-DL-NEXT:    s_add_u32 s12, s12, s11
1753; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
1754; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1755; GFX10-DL-NEXT:    s_clause 0x1
1756; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1757; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1758; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
1759; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1760; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1761; GFX10-DL-NEXT:    v_and_b32_e32 v8, 15, v1
1762; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1763; GFX10-DL-NEXT:    v_and_b32_e32 v9, 15, v2
1764; GFX10-DL-NEXT:    v_bfe_u32 v0, v1, 4, 4
1765; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v1
1766; GFX10-DL-NEXT:    v_bfe_u32 v4, v1, 24, 4
1767; GFX10-DL-NEXT:    v_bfe_u32 v5, v1, 20, 4
1768; GFX10-DL-NEXT:    v_bfe_u32 v6, v1, 16, 4
1769; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 12, 4
1770; GFX10-DL-NEXT:    v_bfe_u32 v1, v1, 8, 4
1771; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 4, 4
1772; GFX10-DL-NEXT:    v_bfe_u32 v11, v2, 8, 4
1773; GFX10-DL-NEXT:    v_bfe_u32 v12, v2, 12, 4
1774; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1775; GFX10-DL-NEXT:    v_mad_u32_u24 v13, v8, v9, s0
1776; GFX10-DL-NEXT:    v_bfe_u32 v14, v2, 20, 4
1777; GFX10-DL-NEXT:    v_bfe_u32 v15, v2, 16, 4
1778; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v1, v1, v11
1779; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v7, v7, v12
1780; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, v10, v13
1781; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
1782; GFX10-DL-NEXT:    v_bfe_u32 v2, v2, 24, 4
1783; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v6, v6, v15
1784; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v5, v5, v14
1785; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v7
1786; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v1, v4, v2
1787; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v2, v3, v10
1788; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v3, v8, v9
1789; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v6, v5
1790; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v2
1791; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
1792; GFX10-DL-NEXT:    v_add3_u32 v0, v3, v13, v0
1793; GFX10-DL-NEXT:    global_store_dword v1, v0, s[6:7]
1794; GFX10-DL-NEXT:    s_endpgm
1795                                                ptr addrspace(1) %src2,
1796                                                ptr addrspace(1) nocapture %dst) {
1797entry:
  ; Per work-item: load one <8 x i4> vector from each source pointer, indexed
  ; by the workitem id.
1798  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1799  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
1800  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
1801  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
1802  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
1803
  ; Lane by lane: zero-extend both i4 operands to i32, then multiply.
1804  %v1e0 = extractelement <8 x i4> %vec1, i64 0
1805  %cv1e0 = zext i4 %v1e0 to i32
1806  %v2e0 = extractelement <8 x i4> %vec2, i64 0
1807  %cv2e0 = zext i4 %v2e0 to i32
1808  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
1809
1810  %v1e1 = extractelement <8 x i4> %vec1, i64 1
1811  %cv1e1 = zext i4 %v1e1 to i32
1812  %v2e1 = extractelement <8 x i4> %vec2, i64 1
1813  %cv2e1 = zext i4 %v2e1 to i32
1814  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
1815
1816  %v1e2 = extractelement <8 x i4> %vec1, i64 2
1817  %cv1e2 = zext i4 %v1e2 to i32
1818  %v2e2 = extractelement <8 x i4> %vec2, i64 2
1819  %cv2e2 = zext i4 %v2e2 to i32
1820  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
1821
1822  %v1e3 = extractelement <8 x i4> %vec1, i64 3
1823  %cv1e3 = zext i4 %v1e3 to i32
1824  %v2e3 = extractelement <8 x i4> %vec2, i64 3
1825  %cv2e3 = zext i4 %v2e3 to i32
1826  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
1827
1828  %v1e4 = extractelement <8 x i4> %vec1, i64 4
1829  %cv1e4 = zext i4 %v1e4 to i32
1830  %v2e4 = extractelement <8 x i4> %vec2, i64 4
1831  %cv2e4 = zext i4 %v2e4 to i32
1832  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
1833
1834  %v1e5 = extractelement <8 x i4> %vec1, i64 5
1835  %cv1e5 = zext i4 %v1e5 to i32
1836  %v2e5 = extractelement <8 x i4> %vec2, i64 5
1837  %cv2e5 = zext i4 %v2e5 to i32
1838  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
1839
1840  %v1e6 = extractelement <8 x i4> %vec1, i64 6
1841  %cv1e6 = zext i4 %v1e6 to i32
1842  %v2e6 = extractelement <8 x i4> %vec2, i64 6
1843  %cv2e6 = zext i4 %v2e6 to i32
1844  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
1845
1846  %v1e7 = extractelement <8 x i4> %vec1, i64 7
1847  %cv1e7 = zext i4 %v1e7 to i32
1848  %v2e7 = extractelement <8 x i4> %vec2, i64 7
1849  %cv2e7 = zext i4 %v2e7 to i32
1850  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
1851
  ; Accumulate on top of the i32 currently stored at %dst. %mul0 is used by
  ; both %add1 and %add, and %add1 is used by both %add and %add2.
1852  %acc = load i32, ptr addrspace(1) %dst, align 4
1853  %add1 = add i32 %mul0, %acc
1854  %add = add i32  %mul0, %add1
1855  %add2 = add i32 %add1, %mul1
1856  %add3 = add i32 %add2, %mul2
1857  %add4 = add i32 %add3, %mul3
1858  %add5 = add i32 %add4, %mul4
1859  %add6 = add i32 %add5, %mul5
1860  %add7 = add i32 %add6, %mul6
1861  %add8 = add i32 %add7, %mul7
1862
  ; Combine the side sum %add with the end of the main chain and store it back.
1863  %res = add i32 %add, %add8
1864  store i32 %res, ptr addrspace(1) %dst, align 4
1865  ret void
1866}
1867
; udot8_acc32_vecMul: unsigned 8 x i4 dot product with an i32 accumulator in
; which the multiply is done once on whole <8 x i32> vectors (after a vector
; zext) and the lanes are extracted afterwards.
; The GFX9-DL and GFX10-DL assertions expect a single v_dot8_u32_u4; the GFX7,
; GFX8 and GFX9 assertions expect per-lane extracts plus mad / mul+add3 chains.
1868define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
1869; GFX7-LABEL: udot8_acc32_vecMul:
1870; GFX7:       ; %bb.0: ; %entry
1871; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1872; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1873; GFX7-NEXT:    s_mov_b32 s14, -1
1874; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
1875; GFX7-NEXT:    s_add_u32 s12, s12, s11
1876; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
1877; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
1878; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1879; GFX7-NEXT:    s_mov_b32 s6, 0
1880; GFX7-NEXT:    s_mov_b32 s7, s3
1881; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1882; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
1883; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1884; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1885; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1886; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
1887; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1888; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1889; GFX7-NEXT:    s_mov_b32 s2, -1
1890; GFX7-NEXT:    s_addc_u32 s13, s13, 0
1891; GFX7-NEXT:    s_waitcnt vmcnt(1)
1892; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 28, v2
1893; GFX7-NEXT:    v_bfe_u32 v3, v2, 24, 4
1894; GFX7-NEXT:    v_bfe_u32 v4, v2, 20, 4
1895; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 4
1896; GFX7-NEXT:    v_bfe_u32 v6, v2, 12, 4
1897; GFX7-NEXT:    v_bfe_u32 v7, v2, 8, 4
1898; GFX7-NEXT:    v_bfe_u32 v8, v2, 4, 4
1899; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
1900; GFX7-NEXT:    s_waitcnt vmcnt(0)
1901; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
1902; GFX7-NEXT:    v_bfe_u32 v10, v0, 24, 4
1903; GFX7-NEXT:    v_bfe_u32 v11, v0, 20, 4
1904; GFX7-NEXT:    v_bfe_u32 v12, v0, 16, 4
1905; GFX7-NEXT:    v_bfe_u32 v13, v0, 12, 4
1906; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 4
1907; GFX7-NEXT:    v_bfe_u32 v15, v0, 4, 4
1908; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
1909; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1910; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, s4
1911; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
1912; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
1913; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
1914; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
1915; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
1916; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
1917; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v9, v0
1918; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1919; GFX7-NEXT:    s_endpgm
1920;
1921; GFX8-LABEL: udot8_acc32_vecMul:
1922; GFX8:       ; %bb.0: ; %entry
1923; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1924; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1925; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1926; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1927; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1928; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1929; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1930; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1931; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1932; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1933; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1934; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1935; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1936; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1937; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
1938; GFX8-NEXT:    s_mov_b32 s14, -1
1939; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
1940; GFX8-NEXT:    s_add_u32 s12, s12, s11
1941; GFX8-NEXT:    s_addc_u32 s13, s13, 0
1942; GFX8-NEXT:    s_waitcnt vmcnt(1)
1943; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 28, v3
1944; GFX8-NEXT:    v_bfe_u32 v2, v3, 24, 4
1945; GFX8-NEXT:    v_bfe_u32 v4, v3, 20, 4
1946; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 4
1947; GFX8-NEXT:    v_bfe_u32 v6, v3, 12, 4
1948; GFX8-NEXT:    v_bfe_u32 v7, v3, 8, 4
1949; GFX8-NEXT:    v_bfe_u32 v8, v3, 4, 4
1950; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
1951; GFX8-NEXT:    s_waitcnt vmcnt(0)
1952; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
1953; GFX8-NEXT:    v_bfe_u32 v10, v0, 24, 4
1954; GFX8-NEXT:    v_bfe_u32 v11, v0, 20, 4
1955; GFX8-NEXT:    v_bfe_u32 v12, v0, 16, 4
1956; GFX8-NEXT:    v_bfe_u32 v13, v0, 12, 4
1957; GFX8-NEXT:    v_bfe_u32 v14, v0, 8, 4
1958; GFX8-NEXT:    v_bfe_u32 v15, v0, 4, 4
1959; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
1960; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1961; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v0, s0
1962; GFX8-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
1963; GFX8-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
1964; GFX8-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
1965; GFX8-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
1966; GFX8-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
1967; GFX8-NEXT:    v_mad_u32_u24 v0, v2, v10, v0
1968; GFX8-NEXT:    v_mad_u32_u24 v2, v1, v9, v0
1969; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1970; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1971; GFX8-NEXT:    flat_store_dword v[0:1], v2
1972; GFX8-NEXT:    s_endpgm
1973;
1974; GFX9-LABEL: udot8_acc32_vecMul:
1975; GFX9:       ; %bb.0: ; %entry
1976; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1977; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1978; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1979; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1980; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1981; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1982; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
1983; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
1984; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
1985; GFX9-NEXT:    s_mov_b32 s14, -1
1986; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
1987; GFX9-NEXT:    s_add_u32 s12, s12, s11
1988; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1989; GFX9-NEXT:    s_addc_u32 s13, s13, 0
1990; GFX9-NEXT:    s_waitcnt vmcnt(1)
1991; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 28, v1
1992; GFX9-NEXT:    v_bfe_u32 v4, v1, 24, 4
1993; GFX9-NEXT:    v_bfe_u32 v5, v1, 20, 4
1994; GFX9-NEXT:    v_bfe_u32 v6, v1, 16, 4
1995; GFX9-NEXT:    v_bfe_u32 v7, v1, 12, 4
1996; GFX9-NEXT:    v_bfe_u32 v8, v1, 8, 4
1997; GFX9-NEXT:    v_bfe_u32 v9, v1, 4, 4
1998; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
1999; GFX9-NEXT:    s_waitcnt vmcnt(0)
2000; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
2001; GFX9-NEXT:    v_bfe_u32 v11, v2, 24, 4
2002; GFX9-NEXT:    v_bfe_u32 v12, v2, 20, 4
2003; GFX9-NEXT:    v_bfe_u32 v13, v2, 16, 4
2004; GFX9-NEXT:    v_bfe_u32 v14, v2, 12, 4
2005; GFX9-NEXT:    v_bfe_u32 v15, v2, 8, 4
2006; GFX9-NEXT:    v_bfe_u32 v16, v2, 4, 4
2007; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
2008; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
2009; GFX9-NEXT:    v_mul_u32_u24_e32 v2, v9, v16
2010; GFX9-NEXT:    v_mul_u32_u24_e32 v8, v8, v15
2011; GFX9-NEXT:    v_mul_u32_u24_e32 v7, v7, v14
2012; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2013; GFX9-NEXT:    v_add3_u32 v1, v1, s0, v2
2014; GFX9-NEXT:    v_mul_u32_u24_e32 v6, v6, v13
2015; GFX9-NEXT:    v_mul_u32_u24_e32 v5, v5, v12
2016; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v7
2017; GFX9-NEXT:    v_mul_u32_u24_e32 v4, v4, v11
2018; GFX9-NEXT:    v_mul_u32_u24_e32 v3, v3, v10
2019; GFX9-NEXT:    v_add3_u32 v1, v1, v6, v5
2020; GFX9-NEXT:    v_add3_u32 v1, v1, v4, v3
2021; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
2022; GFX9-NEXT:    s_endpgm
2023;
2024; GFX9-DL-LABEL: udot8_acc32_vecMul:
2025; GFX9-DL:       ; %bb.0: ; %entry
2026; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2027; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2028; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2029; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2030; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2031; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2032; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2033; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2034; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
2035; GFX9-DL-NEXT:    s_mov_b32 s14, -1
2036; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
2037; GFX9-DL-NEXT:    s_add_u32 s12, s12, s11
2038; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2039; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
2040; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2041; GFX9-DL-NEXT:    v_dot8_u32_u4 v1, v1, v2, s0
2042; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
2043; GFX9-DL-NEXT:    s_endpgm
2044;
2045; GFX10-DL-LABEL: udot8_acc32_vecMul:
2046; GFX10-DL:       ; %bb.0: ; %entry
2047; GFX10-DL-NEXT:    s_clause 0x1
2048; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2049; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2050; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2051; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2052; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2053; GFX10-DL-NEXT:    s_mov_b32 s14, -1
2054; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
2055; GFX10-DL-NEXT:    s_add_u32 s12, s12, s11
2056; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
2057; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2058; GFX10-DL-NEXT:    s_clause 0x1
2059; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2060; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2061; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
2062; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
2063; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2064; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2065; GFX10-DL-NEXT:    v_dot8_u32_u4 v1, v1, v2, s0
2066; GFX10-DL-NEXT:    global_store_dword v0, v1, s[6:7]
2067; GFX10-DL-NEXT:    s_endpgm
2068                                              ptr addrspace(1) %src2,
2069                                              ptr addrspace(1) nocapture %dst) {
2070entry:
  ; Per work-item: load one <8 x i4> vector from each source pointer, indexed
  ; by the workitem id.
2071  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2072  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
2073  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
2074  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
2075  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
2076
  ; Widen all eight lanes at once with a vector zext.
2077  %cvec1 = zext <8 x i4> %vec1 to <8 x i32>
2078  %cvec2 = zext <8 x i4> %vec2 to <8 x i32>
2079
  ; One vector multiply; the individual products are extracted afterwards.
2080  %mul = mul <8 x i32> %cvec1, %cvec2
2081  %mul0 = extractelement <8 x i32> %mul, i64 0
2082  %mul1 = extractelement <8 x i32> %mul, i64 1
2083  %mul2 = extractelement <8 x i32> %mul, i64 2
2084  %mul3 = extractelement <8 x i32> %mul, i64 3
2085  %mul4 = extractelement <8 x i32> %mul, i64 4
2086  %mul5 = extractelement <8 x i32> %mul, i64 5
2087  %mul6 = extractelement <8 x i32> %mul, i64 6
2088  %mul7 = extractelement <8 x i32> %mul, i64 7
2089
  ; Linear add chain starting from the i32 loaded from %dst; every product and
  ; every partial sum is used exactly once.
2090  %acc = load i32, ptr addrspace(1) %dst, align 4
2091  %add1 = add i32 %mul0, %acc
2092  %add2 = add i32 %add1, %mul1
2093  %add3 = add i32 %add2, %mul2
2094  %add4 = add i32 %add3, %mul3
2095  %add5 = add i32 %add4, %mul4
2096  %add6 = add i32 %add5, %mul5
2097  %add7 = add i32 %add6, %mul6
2098  %add8 = add i32 %add7, %mul7
2099
2100  store i32 %add8, ptr addrspace(1) %dst, align 4
2101  ret void
2102}
2103
2104; TODO: Clean up the code (by default pk_mad_I16 should be generated), then
2105; support the pattern.
2106define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
2107; GFX7-LABEL: udot8_acc16_vecMul:
2108; GFX7:       ; %bb.0: ; %entry
2109; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2110; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2111; GFX7-NEXT:    s_mov_b32 s14, -1
2112; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
2113; GFX7-NEXT:    s_add_u32 s12, s12, s11
2114; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
2115; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
2116; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2117; GFX7-NEXT:    s_mov_b32 s6, 0
2118; GFX7-NEXT:    s_mov_b32 s7, s3
2119; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2120; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
2121; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2122; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2123; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2124; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
2125; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2126; GFX7-NEXT:    s_mov_b32 s2, -1
2127; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
2128; GFX7-NEXT:    s_addc_u32 s13, s13, 0
2129; GFX7-NEXT:    s_waitcnt vmcnt(2)
2130; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 28, v2
2131; GFX7-NEXT:    v_bfe_u32 v4, v2, 24, 4
2132; GFX7-NEXT:    v_bfe_u32 v5, v2, 20, 4
2133; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 4
2134; GFX7-NEXT:    v_bfe_u32 v7, v2, 12, 4
2135; GFX7-NEXT:    v_bfe_u32 v8, v2, 8, 4
2136; GFX7-NEXT:    v_bfe_u32 v9, v2, 4, 4
2137; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
2138; GFX7-NEXT:    s_waitcnt vmcnt(1)
2139; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 28, v0
2140; GFX7-NEXT:    v_bfe_u32 v11, v0, 24, 4
2141; GFX7-NEXT:    v_bfe_u32 v12, v0, 20, 4
2142; GFX7-NEXT:    v_bfe_u32 v13, v0, 16, 4
2143; GFX7-NEXT:    v_bfe_u32 v14, v0, 12, 4
2144; GFX7-NEXT:    v_bfe_u32 v15, v0, 8, 4
2145; GFX7-NEXT:    v_bfe_u32 v16, v0, 4, 4
2146; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
2147; GFX7-NEXT:    s_waitcnt vmcnt(0)
2148; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2149; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
2150; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
2151; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
2152; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
2153; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
2154; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
2155; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
2156; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
2157; GFX7-NEXT:    s_endpgm
2158;
2159; GFX8-LABEL: udot8_acc16_vecMul:
2160; GFX8:       ; %bb.0: ; %entry
2161; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2162; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
2163; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2164; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2165; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2166; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2167; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2168; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2169; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2170; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2171; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2172; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2173; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2174; GFX8-NEXT:    flat_load_dword v2, v[0:1]
2175; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2176; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2177; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
2178; GFX8-NEXT:    s_mov_b32 s14, -1
2179; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
2180; GFX8-NEXT:    s_add_u32 s12, s12, s11
2181; GFX8-NEXT:    s_addc_u32 s13, s13, 0
2182; GFX8-NEXT:    s_waitcnt vmcnt(2)
2183; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
2184; GFX8-NEXT:    v_bfe_u32 v6, v3, 24, 4
2185; GFX8-NEXT:    v_bfe_u32 v7, v3, 20, 4
2186; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 4
2187; GFX8-NEXT:    v_bfe_u32 v9, v3, 12, 4
2188; GFX8-NEXT:    v_bfe_u32 v10, v3, 8, 4
2189; GFX8-NEXT:    v_bfe_u32 v11, v3, 4, 4
2190; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
2191; GFX8-NEXT:    s_waitcnt vmcnt(1)
2192; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
2193; GFX8-NEXT:    v_bfe_u32 v13, v2, 24, 4
2194; GFX8-NEXT:    v_bfe_u32 v14, v2, 20, 4
2195; GFX8-NEXT:    v_bfe_u32 v15, v2, 16, 4
2196; GFX8-NEXT:    v_bfe_u32 v16, v2, 12, 4
2197; GFX8-NEXT:    v_bfe_u32 v17, v2, 8, 4
2198; GFX8-NEXT:    v_bfe_u32 v18, v2, 4, 4
2199; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
2200; GFX8-NEXT:    s_waitcnt vmcnt(0)
2201; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
2202; GFX8-NEXT:    v_mad_u16 v2, v11, v18, v2
2203; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
2204; GFX8-NEXT:    v_mad_u16 v2, v9, v16, v2
2205; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
2206; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
2207; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
2208; GFX8-NEXT:    v_mad_u16 v2, v5, v12, v2
2209; GFX8-NEXT:    flat_store_short v[0:1], v2
2210; GFX8-NEXT:    s_endpgm
2211;
2212; GFX9-LABEL: udot8_acc16_vecMul:
2213; GFX9:       ; %bb.0: ; %entry
2214; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2215; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2216; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2217; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2218; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2219; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2220; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
2221; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
2222; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2223; GFX9-NEXT:    global_load_ushort v3, v0, s[6:7]
2224; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
2225; GFX9-NEXT:    s_mov_b32 s14, -1
2226; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
2227; GFX9-NEXT:    s_add_u32 s12, s12, s11
2228; GFX9-NEXT:    s_addc_u32 s13, s13, 0
2229; GFX9-NEXT:    s_waitcnt vmcnt(2)
2230; GFX9-NEXT:    v_and_b32_e32 v4, 15, v1
2231; GFX9-NEXT:    v_bfe_u32 v5, v1, 4, 4
2232; GFX9-NEXT:    v_bfe_u32 v6, v1, 8, 4
2233; GFX9-NEXT:    v_bfe_u32 v7, v1, 12, 4
2234; GFX9-NEXT:    s_waitcnt vmcnt(1)
2235; GFX9-NEXT:    v_and_b32_e32 v11, 15, v2
2236; GFX9-NEXT:    v_bfe_u32 v12, v2, 4, 4
2237; GFX9-NEXT:    v_perm_b32 v6, v7, v6, s0
2238; GFX9-NEXT:    v_perm_b32 v7, v12, v11, s0
2239; GFX9-NEXT:    v_perm_b32 v4, v5, v4, s0
2240; GFX9-NEXT:    v_bfe_u32 v8, v1, 16, 4
2241; GFX9-NEXT:    v_bfe_u32 v9, v1, 20, 4
2242; GFX9-NEXT:    v_bfe_u32 v13, v2, 8, 4
2243; GFX9-NEXT:    v_bfe_u32 v14, v2, 12, 4
2244; GFX9-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
2245; GFX9-NEXT:    v_perm_b32 v8, v9, v8, s0
2246; GFX9-NEXT:    v_perm_b32 v9, v14, v13, s0
2247; GFX9-NEXT:    s_waitcnt vmcnt(0)
2248; GFX9-NEXT:    v_add_u16_e32 v3, v4, v3
2249; GFX9-NEXT:    v_bfe_u32 v10, v1, 24, 4
2250; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
2251; GFX9-NEXT:    v_bfe_u32 v15, v2, 16, 4
2252; GFX9-NEXT:    v_bfe_u32 v16, v2, 20, 4
2253; GFX9-NEXT:    v_bfe_u32 v17, v2, 24, 4
2254; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
2255; GFX9-NEXT:    v_pk_mul_lo_u16 v5, v6, v9
2256; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2257; GFX9-NEXT:    v_perm_b32 v2, v2, v17, s0
2258; GFX9-NEXT:    v_perm_b32 v1, v1, v10, s0
2259; GFX9-NEXT:    v_perm_b32 v10, v16, v15, s0
2260; GFX9-NEXT:    v_add_u16_e32 v3, v3, v5
2261; GFX9-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2262; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v8, v10
2263; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2264; GFX9-NEXT:    v_add_u16_e32 v3, v3, v2
2265; GFX9-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2266; GFX9-NEXT:    v_add_u16_e32 v2, v2, v1
2267; GFX9-NEXT:    v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2268; GFX9-NEXT:    global_store_short v0, v1, s[6:7]
2269; GFX9-NEXT:    s_endpgm
2270;
2271; GFX9-DL-LABEL: udot8_acc16_vecMul:
2272; GFX9-DL:       ; %bb.0: ; %entry
2273; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2274; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2275; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2276; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2277; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2278; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2279; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2280; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2281; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2282; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[6:7]
2283; GFX9-DL-NEXT:    s_mov_b32 s0, 0x5040100
2284; GFX9-DL-NEXT:    s_mov_b32 s14, -1
2285; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
2286; GFX9-DL-NEXT:    s_add_u32 s12, s12, s11
2287; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
2288; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
2289; GFX9-DL-NEXT:    v_and_b32_e32 v4, 15, v1
2290; GFX9-DL-NEXT:    v_bfe_u32 v5, v1, 4, 4
2291; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 8, 4
2292; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 12, 4
2293; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2294; GFX9-DL-NEXT:    v_and_b32_e32 v11, 15, v2
2295; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 4, 4
2296; GFX9-DL-NEXT:    v_perm_b32 v6, v7, v6, s0
2297; GFX9-DL-NEXT:    v_perm_b32 v7, v12, v11, s0
2298; GFX9-DL-NEXT:    v_perm_b32 v4, v5, v4, s0
2299; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 16, 4
2300; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 20, 4
2301; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 8, 4
2302; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 12, 4
2303; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
2304; GFX9-DL-NEXT:    v_perm_b32 v8, v9, v8, s0
2305; GFX9-DL-NEXT:    v_perm_b32 v9, v14, v13, s0
2306; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2307; GFX9-DL-NEXT:    v_add_u16_e32 v3, v4, v3
2308; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 24, 4
2309; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
2310; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 16, 4
2311; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 20, 4
2312; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 24, 4
2313; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
2314; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, v6, v9
2315; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2316; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v17, s0
2317; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v10, s0
2318; GFX9-DL-NEXT:    v_perm_b32 v10, v16, v15, s0
2319; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v5
2320; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2321; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v8, v10
2322; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2323; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v2
2324; GFX9-DL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2325; GFX9-DL-NEXT:    v_add_u16_e32 v2, v2, v1
2326; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2327; GFX9-DL-NEXT:    global_store_short v0, v1, s[6:7]
2328; GFX9-DL-NEXT:    s_endpgm
2329;
2330; GFX10-DL-LABEL: udot8_acc16_vecMul:
2331; GFX10-DL:       ; %bb.0: ; %entry
2332; GFX10-DL-NEXT:    s_clause 0x1
2333; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2334; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2335; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2336; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2337; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2338; GFX10-DL-NEXT:    s_mov_b32 s14, -1
2339; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
2340; GFX10-DL-NEXT:    s_add_u32 s12, s12, s11
2341; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
2342; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2343; GFX10-DL-NEXT:    s_clause 0x1
2344; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2345; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2346; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2347; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[6:7]
2348; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
2349; GFX10-DL-NEXT:    v_and_b32_e32 v4, 15, v1
2350; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2351; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v2
2352; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
2353; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 4, 4
2354; GFX10-DL-NEXT:    v_bfe_u32 v8, v2, 12, 4
2355; GFX10-DL-NEXT:    v_bfe_u32 v9, v1, 12, 4
2356; GFX10-DL-NEXT:    v_bfe_u32 v10, v1, 20, 4
2357; GFX10-DL-NEXT:    v_perm_b32 v5, v6, v5, 0x5040100
2358; GFX10-DL-NEXT:    v_perm_b32 v4, v7, v4, 0x5040100
2359; GFX10-DL-NEXT:    v_bfe_u32 v6, v1, 8, 4
2360; GFX10-DL-NEXT:    v_bfe_u32 v7, v2, 8, 4
2361; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
2362; GFX10-DL-NEXT:    v_perm_b32 v6, v9, v6, 0x5040100
2363; GFX10-DL-NEXT:    v_perm_b32 v7, v8, v7, 0x5040100
2364; GFX10-DL-NEXT:    v_bfe_u32 v5, v1, 16, 4
2365; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 20, 4
2366; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
2367; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2368; GFX10-DL-NEXT:    v_add_nc_u16 v3, v4, v3
2369; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
2370; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v6, v6, v7
2371; GFX10-DL-NEXT:    v_perm_b32 v5, v10, v5, 0x5040100
2372; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 24, 4
2373; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v8
2374; GFX10-DL-NEXT:    v_perm_b32 v4, v9, v4, 0x5040100
2375; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
2376; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
2377; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v6
2378; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 24, 4
2379; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
2380; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
2381; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v7, 0x5040100
2382; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v8
2383; GFX10-DL-NEXT:    v_perm_b32 v2, v2, v6, 0x5040100
2384; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
2385; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v4
2386; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2387; GFX10-DL-NEXT:    v_add_nc_u16 v2, v3, v5
2388; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2389; GFX10-DL-NEXT:    v_add_nc_u16 v1, v2, v1
2390; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
2391; GFX10-DL-NEXT:    global_store_short v0, v1, s[6:7]
2392; GFX10-DL-NEXT:    s_endpgm
2393                                              ptr addrspace(1) %src2,
2394                                              ptr addrspace(1) nocapture %dst) {
2395entry:
2396  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2397  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
2398  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
2399  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
2400  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
2401
2402  %cvec1 = zext <8 x i4> %vec1 to <8 x i16>
2403  %cvec2 = zext <8 x i4> %vec2 to <8 x i16>
2404
2405  %mul = mul <8 x i16> %cvec1, %cvec2
2406  %mul0 = extractelement <8 x i16> %mul, i64 0
2407  %mul1 = extractelement <8 x i16> %mul, i64 1
2408  %mul2 = extractelement <8 x i16> %mul, i64 2
2409  %mul3 = extractelement <8 x i16> %mul, i64 3
2410  %mul4 = extractelement <8 x i16> %mul, i64 4
2411  %mul5 = extractelement <8 x i16> %mul, i64 5
2412  %mul6 = extractelement <8 x i16> %mul, i64 6
2413  %mul7 = extractelement <8 x i16> %mul, i64 7
2414
2415  %acc = load i16, ptr addrspace(1) %dst, align 4
2416  %add1 = add i16 %mul0, %acc
2417  %add2 = add i16 %add1, %mul1
2418  %add3 = add i16 %add2, %mul2
2419  %add4 = add i16 %add3, %mul3
2420  %add5 = add i16 %add4, %mul4
2421  %add6 = add i16 %add5, %mul5
2422  %add7 = add i16 %add6, %mul6
2423  %add8 = add i16 %add7, %mul7
2424
2425  store i16 %add8, ptr addrspace(1) %dst, align 4
2426  ret void
2427}
2428
2429; TODO: Clean up the code to generate MAD; the pattern should then be recognized.
; udot8_acc8_vecMul: eight-lane dot product of zero-extended 4-bit values with
; an i8 accumulator, written as one <8 x i8> vector multiply followed by a
; scalar add chain.
; The assertion lines below are autogenerated (see the note on the first line
; of this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; For every run-line prefix the expected output consists of 16-bit or 24-bit
; multiplies, multiply-adds and adds; no prefix expects a dot-product
; instruction.
2430define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
2431; GFX7-LABEL: udot8_acc8_vecMul:
2432; GFX7:       ; %bb.0: ; %entry
2433; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2434; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2435; GFX7-NEXT:    s_mov_b32 s14, -1
2436; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
2437; GFX7-NEXT:    s_add_u32 s12, s12, s11
2438; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
2439; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
2440; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2441; GFX7-NEXT:    s_mov_b32 s6, 0
2442; GFX7-NEXT:    s_mov_b32 s7, s3
2443; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2444; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
2445; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2446; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2447; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2448; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
2449; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2450; GFX7-NEXT:    s_mov_b32 s2, -1
2451; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
2452; GFX7-NEXT:    s_addc_u32 s13, s13, 0
2453; GFX7-NEXT:    s_waitcnt vmcnt(2)
2454; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 28, v2
2455; GFX7-NEXT:    v_bfe_u32 v4, v2, 24, 4
2456; GFX7-NEXT:    v_bfe_u32 v5, v2, 20, 4
2457; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 4
2458; GFX7-NEXT:    v_bfe_u32 v7, v2, 12, 4
2459; GFX7-NEXT:    v_bfe_u32 v8, v2, 8, 4
2460; GFX7-NEXT:    v_bfe_u32 v9, v2, 4, 4
2461; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
2462; GFX7-NEXT:    s_waitcnt vmcnt(1)
2463; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 28, v0
2464; GFX7-NEXT:    v_bfe_u32 v11, v0, 24, 4
2465; GFX7-NEXT:    v_bfe_u32 v12, v0, 20, 4
2466; GFX7-NEXT:    v_bfe_u32 v13, v0, 16, 4
2467; GFX7-NEXT:    v_bfe_u32 v14, v0, 12, 4
2468; GFX7-NEXT:    v_bfe_u32 v15, v0, 8, 4
2469; GFX7-NEXT:    v_bfe_u32 v16, v0, 4, 4
2470; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
2471; GFX7-NEXT:    s_waitcnt vmcnt(0)
2472; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2473; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
2474; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
2475; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
2476; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
2477; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
2478; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
2479; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
2480; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2481; GFX7-NEXT:    s_endpgm
2482;
2483; GFX8-LABEL: udot8_acc8_vecMul:
2484; GFX8:       ; %bb.0: ; %entry
2485; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2486; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
2487; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2488; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2489; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2490; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2491; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2492; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2493; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2494; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2495; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2496; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2497; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2498; GFX8-NEXT:    flat_load_dword v2, v[0:1]
2499; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2500; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2501; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
2502; GFX8-NEXT:    s_mov_b32 s14, -1
2503; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
2504; GFX8-NEXT:    s_add_u32 s12, s12, s11
2505; GFX8-NEXT:    s_addc_u32 s13, s13, 0
2506; GFX8-NEXT:    s_waitcnt vmcnt(2)
2507; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 28, v3
2508; GFX8-NEXT:    v_bfe_u32 v10, v3, 24, 4
2509; GFX8-NEXT:    v_bfe_u32 v11, v3, 20, 4
2510; GFX8-NEXT:    v_bfe_u32 v7, v3, 12, 4
2511; GFX8-NEXT:    v_bfe_u32 v8, v3, 8, 4
2512; GFX8-NEXT:    v_bfe_u32 v12, v3, 16, 4
2513; GFX8-NEXT:    s_waitcnt vmcnt(1)
2514; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 28, v2
2515; GFX8-NEXT:    v_bfe_u32 v17, v2, 24, 4
2516; GFX8-NEXT:    v_bfe_u32 v18, v2, 20, 4
2517; GFX8-NEXT:    v_bfe_u32 v14, v2, 12, 4
2518; GFX8-NEXT:    v_bfe_u32 v15, v2, 8, 4
2519; GFX8-NEXT:    v_bfe_u32 v19, v2, 16, 4
2520; GFX8-NEXT:    v_mul_lo_u16_sdwa v11, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2521; GFX8-NEXT:    v_mul_lo_u16_e32 v18, v10, v17
2522; GFX8-NEXT:    v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2523; GFX8-NEXT:    v_bfe_u32 v5, v3, 4, 4
2524; GFX8-NEXT:    v_and_b32_e32 v6, 15, v3
2525; GFX8-NEXT:    v_bfe_u32 v3, v2, 4, 4
2526; GFX8-NEXT:    v_and_b32_e32 v13, 15, v2
2527; GFX8-NEXT:    v_mul_lo_u16_e32 v2, v12, v19
2528; GFX8-NEXT:    v_mul_lo_u16_e32 v8, v8, v15
2529; GFX8-NEXT:    v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2530; GFX8-NEXT:    v_or_b32_e32 v9, v18, v9
2531; GFX8-NEXT:    v_mul_lo_u16_sdwa v5, v5, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2532; GFX8-NEXT:    v_or_b32_e32 v3, v2, v11
2533; GFX8-NEXT:    v_or_b32_e32 v7, v8, v7
2534; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
2535; GFX8-NEXT:    v_mul_lo_u16_e32 v6, v6, v13
2536; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
2537; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2538; GFX8-NEXT:    v_or_b32_e32 v6, v6, v5
2539; GFX8-NEXT:    v_or_b32_e32 v5, v5, v2
2540; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
2541; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 24, v[2:3]
2542; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
2543; GFX8-NEXT:    s_waitcnt vmcnt(0)
2544; GFX8-NEXT:    v_add_u16_e32 v3, v6, v4
2545; GFX8-NEXT:    v_add_u16_e32 v3, v3, v5
2546; GFX8-NEXT:    v_add_u16_e32 v3, v3, v7
2547; GFX8-NEXT:    v_add_u16_e32 v2, v3, v2
2548; GFX8-NEXT:    v_mad_u16 v2, v12, v19, v2
2549; GFX8-NEXT:    v_add_u16_e32 v2, v2, v8
2550; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v9
2551; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
2552; GFX8-NEXT:    v_add_u16_e32 v2, v2, v9
2553; GFX8-NEXT:    flat_store_byte v[0:1], v2
2554; GFX8-NEXT:    s_endpgm
2555;
2556; GFX9-LABEL: udot8_acc8_vecMul:
2557; GFX9:       ; %bb.0: ; %entry
2558; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2559; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2560; GFX9-NEXT:    s_mov_b32 s14, -1
2561; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
2562; GFX9-NEXT:    s_add_u32 s12, s12, s11
2563; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
2564; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2565; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2566; GFX9-NEXT:    v_mov_b32_e32 v3, 0
2567; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2568; GFX9-NEXT:    global_load_dword v1, v0, s[8:9]
2569; GFX9-NEXT:    global_load_dword v2, v0, s[10:11]
2570; GFX9-NEXT:    global_load_ubyte v4, v3, s[0:1]
2571; GFX9-NEXT:    s_addc_u32 s13, s13, 0
2572; GFX9-NEXT:    s_waitcnt vmcnt(2)
2573; GFX9-NEXT:    v_bfe_u32 v0, v1, 4, 4
2574; GFX9-NEXT:    v_and_b32_e32 v5, 15, v1
2575; GFX9-NEXT:    v_bfe_u32 v6, v1, 12, 4
2576; GFX9-NEXT:    v_bfe_u32 v7, v1, 8, 4
2577; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 28, v1
2578; GFX9-NEXT:    v_bfe_u32 v9, v1, 24, 4
2579; GFX9-NEXT:    v_bfe_u32 v10, v1, 20, 4
2580; GFX9-NEXT:    v_bfe_u32 v11, v1, 16, 4
2581; GFX9-NEXT:    s_waitcnt vmcnt(1)
2582; GFX9-NEXT:    v_bfe_u32 v1, v2, 4, 4
2583; GFX9-NEXT:    v_and_b32_e32 v12, 15, v2
2584; GFX9-NEXT:    v_bfe_u32 v13, v2, 12, 4
2585; GFX9-NEXT:    v_bfe_u32 v14, v2, 8, 4
2586; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 28, v2
2587; GFX9-NEXT:    v_bfe_u32 v16, v2, 24, 4
2588; GFX9-NEXT:    v_bfe_u32 v17, v2, 20, 4
2589; GFX9-NEXT:    v_bfe_u32 v2, v2, 16, 4
2590; GFX9-NEXT:    v_mul_lo_u16_e32 v18, v11, v2
2591; GFX9-NEXT:    v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2592; GFX9-NEXT:    v_mul_lo_u16_e32 v17, v9, v16
2593; GFX9-NEXT:    v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2594; GFX9-NEXT:    v_mul_lo_u16_e32 v7, v7, v14
2595; GFX9-NEXT:    v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2596; GFX9-NEXT:    v_mul_lo_u16_e32 v5, v5, v12
2597; GFX9-NEXT:    v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2598; GFX9-NEXT:    v_or_b32_e32 v0, v18, v10
2599; GFX9-NEXT:    v_or_b32_sdwa v1, v17, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2600; GFX9-NEXT:    v_or_b32_e32 v6, v7, v6
2601; GFX9-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2602; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
2603; GFX9-NEXT:    v_or_b32_e32 v5, v5, v12
2604; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v8
2605; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
2606; GFX9-NEXT:    v_or_b32_e32 v10, v12, v0
2607; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
2608; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
2609; GFX9-NEXT:    s_waitcnt vmcnt(0)
2610; GFX9-NEXT:    v_add_u16_e32 v4, v5, v4
2611; GFX9-NEXT:    v_add_u16_e32 v1, v4, v1
2612; GFX9-NEXT:    v_add_u16_e32 v1, v1, v6
2613; GFX9-NEXT:    v_add_u16_e32 v0, v1, v0
2614; GFX9-NEXT:    v_mad_legacy_u16 v0, v11, v2, v0
2615; GFX9-NEXT:    v_add_u16_e32 v0, v0, v8
2616; GFX9-NEXT:    v_mad_legacy_u16 v0, v9, v16, v0
2617; GFX9-NEXT:    v_add_u16_e32 v0, v0, v7
2618; GFX9-NEXT:    global_store_byte v3, v0, s[0:1]
2619; GFX9-NEXT:    s_endpgm
2620;
2621; GFX9-DL-LABEL: udot8_acc8_vecMul:
2622; GFX9-DL:       ; %bb.0: ; %entry
2623; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2624; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2625; GFX9-DL-NEXT:    s_mov_b32 s14, -1
2626; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
2627; GFX9-DL-NEXT:    s_add_u32 s12, s12, s11
2628; GFX9-DL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
2629; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2630; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2631; GFX9-DL-NEXT:    v_mov_b32_e32 v3, 0
2632; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2633; GFX9-DL-NEXT:    global_load_dword v1, v0, s[8:9]
2634; GFX9-DL-NEXT:    global_load_dword v2, v0, s[10:11]
2635; GFX9-DL-NEXT:    global_load_ubyte v4, v3, s[0:1]
2636; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
2637; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
2638; GFX9-DL-NEXT:    v_bfe_u32 v0, v1, 4, 4
2639; GFX9-DL-NEXT:    v_and_b32_e32 v5, 15, v1
2640; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 12, 4
2641; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 8, 4
2642; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 28, v1
2643; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 24, 4
2644; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 20, 4
2645; GFX9-DL-NEXT:    v_bfe_u32 v11, v1, 16, 4
2646; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2647; GFX9-DL-NEXT:    v_bfe_u32 v1, v2, 4, 4
2648; GFX9-DL-NEXT:    v_and_b32_e32 v12, 15, v2
2649; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 12, 4
2650; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 8, 4
2651; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v15, 28, v2
2652; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 24, 4
2653; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 20, 4
2654; GFX9-DL-NEXT:    v_bfe_u32 v2, v2, 16, 4
2655; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v18, v11, v2
2656; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2657; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v17, v9, v16
2658; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2659; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v7, v7, v14
2660; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2661; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v5, v5, v12
2662; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2663; GFX9-DL-NEXT:    v_or_b32_e32 v0, v18, v10
2664; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v17, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2665; GFX9-DL-NEXT:    v_or_b32_e32 v6, v7, v6
2666; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2667; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
2668; GFX9-DL-NEXT:    v_or_b32_e32 v5, v5, v12
2669; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v8
2670; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
2671; GFX9-DL-NEXT:    v_or_b32_e32 v10, v12, v0
2672; GFX9-DL-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
2673; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
2674; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2675; GFX9-DL-NEXT:    v_add_u16_e32 v4, v5, v4
2676; GFX9-DL-NEXT:    v_add_u16_e32 v1, v4, v1
2677; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v6
2678; GFX9-DL-NEXT:    v_add_u16_e32 v0, v1, v0
2679; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v11, v2, v0
2680; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v8
2681; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v9, v16, v0
2682; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v7
2683; GFX9-DL-NEXT:    global_store_byte v3, v0, s[0:1]
2684; GFX9-DL-NEXT:    s_endpgm
2685;
2686; GFX10-DL-LABEL: udot8_acc8_vecMul:
2687; GFX10-DL:       ; %bb.0: ; %entry
2688; GFX10-DL-NEXT:    s_clause 0x1
2689; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2690; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2691; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2692; GFX10-DL-NEXT:    v_mov_b32_e32 v4, 0
2693; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2694; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2695; GFX10-DL-NEXT:    s_mov_b32 s14, -1
2696; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
2697; GFX10-DL-NEXT:    s_add_u32 s12, s12, s11
2698; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
2699; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2700; GFX10-DL-NEXT:    s_clause 0x1
2701; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2702; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2703; GFX10-DL-NEXT:    global_load_ubyte v3, v4, s[6:7]
2704; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
2705; GFX10-DL-NEXT:    v_bfe_u32 v6, v1, 12, 4
2706; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2707; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 12, 4
2708; GFX10-DL-NEXT:    v_bfe_u32 v0, v1, 4, 4
2709; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v1
2710; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 8, 4
2711; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 28, v1
2712; GFX10-DL-NEXT:    v_bfe_u32 v10, v1, 24, 4
2713; GFX10-DL-NEXT:    v_bfe_u32 v11, v1, 20, 4
2714; GFX10-DL-NEXT:    v_bfe_u32 v12, v1, 16, 4
2715; GFX10-DL-NEXT:    v_bfe_u32 v1, v2, 8, 4
2716; GFX10-DL-NEXT:    v_mul_lo_u16 v6, v6, v9
2717; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 4, 4
2718; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v14, 28, v2
2719; GFX10-DL-NEXT:    v_bfe_u32 v15, v2, 20, 4
2720; GFX10-DL-NEXT:    v_mul_lo_u16 v1, v7, v1
2721; GFX10-DL-NEXT:    v_lshlrev_b16 v6, 8, v6
2722; GFX10-DL-NEXT:    v_and_b32_e32 v13, 15, v2
2723; GFX10-DL-NEXT:    v_mul_lo_u16 v0, v0, v9
2724; GFX10-DL-NEXT:    v_bfe_u32 v7, v2, 16, 4
2725; GFX10-DL-NEXT:    v_bfe_u32 v16, v2, 24, 4
2726; GFX10-DL-NEXT:    v_or_b32_e32 v6, v1, v6
2727; GFX10-DL-NEXT:    v_mul_lo_u16 v2, v11, v15
2728; GFX10-DL-NEXT:    v_mul_lo_u16 v8, v8, v14
2729; GFX10-DL-NEXT:    v_lshlrev_b16 v9, 8, v0
2730; GFX10-DL-NEXT:    v_mul_lo_u16 v5, v5, v13
2731; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
2732; GFX10-DL-NEXT:    v_mul_lo_u16 v1, v12, v7
2733; GFX10-DL-NEXT:    v_mul_lo_u16 v11, v10, v16
2734; GFX10-DL-NEXT:    v_lshlrev_b16 v2, 8, v2
2735; GFX10-DL-NEXT:    v_lshlrev_b16 v8, 8, v8
2736; GFX10-DL-NEXT:    v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2737; GFX10-DL-NEXT:    v_or_b32_e32 v5, v5, v9
2738; GFX10-DL-NEXT:    v_or_b32_e32 v1, v1, v2
2739; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v11, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2740; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v13
2741; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2742; GFX10-DL-NEXT:    v_add_nc_u16 v3, v5, v3
2743; GFX10-DL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2744; GFX10-DL-NEXT:    v_add_nc_u16 v5, v3, v9
2745; GFX10-DL-NEXT:    v_lshrrev_b64 v[2:3], 24, v[0:1]
2746; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
2747; GFX10-DL-NEXT:    v_add_nc_u16 v0, v5, v6
2748; GFX10-DL-NEXT:    v_add_nc_u16 v0, v0, v2
2749; GFX10-DL-NEXT:    v_mad_u16 v0, v12, v7, v0
2750; GFX10-DL-NEXT:    v_add_nc_u16 v0, v0, v1
2751; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
2752; GFX10-DL-NEXT:    v_mad_u16 v0, v10, v16, v0
2753; GFX10-DL-NEXT:    v_add_nc_u16 v0, v0, v1
2754; GFX10-DL-NEXT:    global_store_byte v4, v0, s[6:7]
2755; GFX10-DL-NEXT:    s_endpgm
2756                                             ptr addrspace(1) %src2,
2757                                             ptr addrspace(1) nocapture %dst) {
2758entry:
  ; Each work-item reads the <8 x i4> element at index workitem.id.x from both
  ; %src1 and %src2.
2759  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2760  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
2761  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
2762  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
2763  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
2764
  ; Widen every 4-bit lane to 8 bits with zero extension (unsigned operands).
2765  %cvec1 = zext <8 x i4> %vec1 to <8 x i8>
2766  %cvec2 = zext <8 x i4> %vec2 to <8 x i8>
2767
  ; The lane-wise products come from a single vector multiply and are then
  ; split into eight scalars.
2768  %mul = mul <8 x i8> %cvec1, %cvec2
2769  %mul0 = extractelement <8 x i8> %mul, i64 0
2770  %mul1 = extractelement <8 x i8> %mul, i64 1
2771  %mul2 = extractelement <8 x i8> %mul, i64 2
2772  %mul3 = extractelement <8 x i8> %mul, i64 3
2773  %mul4 = extractelement <8 x i8> %mul, i64 4
2774  %mul5 = extractelement <8 x i8> %mul, i64 5
2775  %mul6 = extractelement <8 x i8> %mul, i64 6
2776  %mul7 = extractelement <8 x i8> %mul, i64 7
2777
  ; Accumulate in i8: the value loaded from %dst is added to product 0 first,
  ; then products 1 through 7 are added in order.
2778  %acc = load i8, ptr addrspace(1) %dst, align 4
2779  %add1 = add i8 %mul0, %acc
2780  %add2 = add i8 %add1, %mul1
2781  %add3 = add i8 %add2, %mul2
2782  %add4 = add i8 %add3, %mul3
2783  %add5 = add i8 %add4, %mul4
2784  %add6 = add i8 %add5, %mul5
2785  %add7 = add i8 %add6, %mul6
2786  %add8 = add i8 %add7, %mul7
2787
  ; The final sum overwrites the accumulator at %dst.
2788  store i8 %add8, ptr addrspace(1) %dst, align 4
2789  ret void
2790}
2791
2792; TODO: Once the additional "and+add" are removed, the pattern will be recognized.
; udot8_acc4_vecMul: 8-lane unsigned 4-bit dot product expressed as a single
; <8 x i4> vector multiply whose lanes are then summed, one by one, into an i4
; accumulator loaded from %dst. In the expected output below, every target
; expands this into per-nibble extracts plus multiply/add sequences and masks
; the final sum with 15 before a byte store; none of them selects
; v_dot8_u32_u4 for this form.
2793define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
2794; GFX7-LABEL: udot8_acc4_vecMul:
2795; GFX7:       ; %bb.0: ; %entry
2796; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2797; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2798; GFX7-NEXT:    s_mov_b32 s14, -1
2799; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
2800; GFX7-NEXT:    s_add_u32 s12, s12, s11
2801; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
2802; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
2803; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2804; GFX7-NEXT:    s_mov_b32 s6, 0
2805; GFX7-NEXT:    s_mov_b32 s7, s3
2806; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2807; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
2808; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2809; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2810; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2811; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
2812; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2813; GFX7-NEXT:    s_mov_b32 s2, -1
2814; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
2815; GFX7-NEXT:    s_addc_u32 s13, s13, 0
2816; GFX7-NEXT:    s_waitcnt vmcnt(2)
2817; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 28, v2
2818; GFX7-NEXT:    v_bfe_u32 v4, v2, 24, 4
2819; GFX7-NEXT:    v_bfe_u32 v5, v2, 20, 4
2820; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 4
2821; GFX7-NEXT:    v_bfe_u32 v7, v2, 12, 4
2822; GFX7-NEXT:    v_bfe_u32 v8, v2, 8, 4
2823; GFX7-NEXT:    v_bfe_u32 v9, v2, 4, 4
2824; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
2825; GFX7-NEXT:    s_waitcnt vmcnt(1)
2826; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 28, v0
2827; GFX7-NEXT:    v_bfe_u32 v11, v0, 24, 4
2828; GFX7-NEXT:    v_bfe_u32 v12, v0, 20, 4
2829; GFX7-NEXT:    v_bfe_u32 v13, v0, 16, 4
2830; GFX7-NEXT:    v_bfe_u32 v14, v0, 12, 4
2831; GFX7-NEXT:    v_bfe_u32 v15, v0, 8, 4
2832; GFX7-NEXT:    v_bfe_u32 v16, v0, 4, 4
2833; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
2834; GFX7-NEXT:    s_waitcnt vmcnt(0)
2835; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2836; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
2837; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
2838; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
2839; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
2840; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
2841; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
2842; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
2843; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
2844; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2845; GFX7-NEXT:    s_endpgm
2846;
2847; GFX8-LABEL: udot8_acc4_vecMul:
2848; GFX8:       ; %bb.0: ; %entry
2849; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2850; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
2851; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2852; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2853; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2854; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2855; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2856; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2857; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2858; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2859; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2860; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2861; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2862; GFX8-NEXT:    flat_load_dword v2, v[0:1]
2863; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2864; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2865; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
2866; GFX8-NEXT:    s_mov_b32 s14, -1
2867; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
2868; GFX8-NEXT:    s_add_u32 s12, s12, s11
2869; GFX8-NEXT:    s_addc_u32 s13, s13, 0
2870; GFX8-NEXT:    s_waitcnt vmcnt(2)
2871; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
2872; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
2873; GFX8-NEXT:    v_bfe_u32 v7, v3, 20, 4
2874; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 4
2875; GFX8-NEXT:    v_bfe_u32 v9, v3, 12, 4
2876; GFX8-NEXT:    v_bfe_u32 v10, v3, 8, 4
2877; GFX8-NEXT:    v_bfe_u32 v11, v3, 4, 4
2878; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
2879; GFX8-NEXT:    s_waitcnt vmcnt(1)
2880; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
2881; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
2882; GFX8-NEXT:    v_bfe_u32 v14, v2, 20, 4
2883; GFX8-NEXT:    v_bfe_u32 v15, v2, 16, 4
2884; GFX8-NEXT:    v_bfe_u32 v16, v2, 12, 4
2885; GFX8-NEXT:    v_bfe_u32 v17, v2, 8, 4
2886; GFX8-NEXT:    v_bfe_u32 v18, v2, 4, 4
2887; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
2888; GFX8-NEXT:    s_waitcnt vmcnt(0)
2889; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
2890; GFX8-NEXT:    v_mad_u16 v2, v11, v18, v2
2891; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
2892; GFX8-NEXT:    v_mad_u16 v2, v9, v16, v2
2893; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
2894; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
2895; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
2896; GFX8-NEXT:    v_mad_u16 v2, v5, v12, v2
2897; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
2898; GFX8-NEXT:    flat_store_byte v[0:1], v2
2899; GFX8-NEXT:    s_endpgm
2900;
2901; GFX9-LABEL: udot8_acc4_vecMul:
2902; GFX9:       ; %bb.0: ; %entry
2903; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2904; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2905; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2906; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2907; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2908; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2909; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
2910; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
2911; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2912; GFX9-NEXT:    global_load_ubyte v3, v0, s[6:7]
2913; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
2914; GFX9-NEXT:    s_mov_b32 s14, -1
2915; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
2916; GFX9-NEXT:    s_add_u32 s12, s12, s11
2917; GFX9-NEXT:    s_addc_u32 s13, s13, 0
2918; GFX9-NEXT:    s_waitcnt vmcnt(2)
2919; GFX9-NEXT:    v_and_b32_e32 v4, 15, v1
2920; GFX9-NEXT:    v_bfe_u32 v5, v1, 4, 4
2921; GFX9-NEXT:    v_bfe_u32 v6, v1, 8, 4
2922; GFX9-NEXT:    v_bfe_u32 v7, v1, 12, 4
2923; GFX9-NEXT:    s_waitcnt vmcnt(1)
2924; GFX9-NEXT:    v_and_b32_e32 v11, 15, v2
2925; GFX9-NEXT:    v_bfe_u32 v12, v2, 4, 4
2926; GFX9-NEXT:    v_perm_b32 v6, v7, v6, s0
2927; GFX9-NEXT:    v_perm_b32 v7, v12, v11, s0
2928; GFX9-NEXT:    v_perm_b32 v4, v5, v4, s0
2929; GFX9-NEXT:    v_bfe_u32 v8, v1, 16, 4
2930; GFX9-NEXT:    v_bfe_u32 v9, v1, 20, 4
2931; GFX9-NEXT:    v_bfe_u32 v13, v2, 8, 4
2932; GFX9-NEXT:    v_bfe_u32 v14, v2, 12, 4
2933; GFX9-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
2934; GFX9-NEXT:    v_perm_b32 v8, v9, v8, s0
2935; GFX9-NEXT:    v_perm_b32 v9, v14, v13, s0
2936; GFX9-NEXT:    s_waitcnt vmcnt(0)
2937; GFX9-NEXT:    v_add_u16_e32 v3, v4, v3
2938; GFX9-NEXT:    v_bfe_u32 v10, v1, 24, 4
2939; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
2940; GFX9-NEXT:    v_bfe_u32 v15, v2, 16, 4
2941; GFX9-NEXT:    v_bfe_u32 v16, v2, 20, 4
2942; GFX9-NEXT:    v_bfe_u32 v17, v2, 24, 4
2943; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
2944; GFX9-NEXT:    v_pk_mul_lo_u16 v5, v6, v9
2945; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2946; GFX9-NEXT:    v_perm_b32 v2, v2, v17, s0
2947; GFX9-NEXT:    v_perm_b32 v1, v1, v10, s0
2948; GFX9-NEXT:    v_perm_b32 v10, v16, v15, s0
2949; GFX9-NEXT:    v_add_u16_e32 v3, v3, v5
2950; GFX9-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2951; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v8, v10
2952; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2953; GFX9-NEXT:    v_add_u16_e32 v3, v3, v2
2954; GFX9-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2955; GFX9-NEXT:    v_add_u16_e32 v2, v2, v1
2956; GFX9-NEXT:    v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2957; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
2958; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
2959; GFX9-NEXT:    s_endpgm
2960;
2961; GFX9-DL-LABEL: udot8_acc4_vecMul:
2962; GFX9-DL:       ; %bb.0: ; %entry
2963; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2964; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2965; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2966; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2967; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2968; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2969; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2970; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2971; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2972; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[6:7]
2973; GFX9-DL-NEXT:    s_mov_b32 s0, 0x5040100
2974; GFX9-DL-NEXT:    s_mov_b32 s14, -1
2975; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
2976; GFX9-DL-NEXT:    s_add_u32 s12, s12, s11
2977; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
2978; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
2979; GFX9-DL-NEXT:    v_and_b32_e32 v4, 15, v1
2980; GFX9-DL-NEXT:    v_bfe_u32 v5, v1, 4, 4
2981; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 8, 4
2982; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 12, 4
2983; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2984; GFX9-DL-NEXT:    v_and_b32_e32 v11, 15, v2
2985; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 4, 4
2986; GFX9-DL-NEXT:    v_perm_b32 v6, v7, v6, s0
2987; GFX9-DL-NEXT:    v_perm_b32 v7, v12, v11, s0
2988; GFX9-DL-NEXT:    v_perm_b32 v4, v5, v4, s0
2989; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 16, 4
2990; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 20, 4
2991; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 8, 4
2992; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 12, 4
2993; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
2994; GFX9-DL-NEXT:    v_perm_b32 v8, v9, v8, s0
2995; GFX9-DL-NEXT:    v_perm_b32 v9, v14, v13, s0
2996; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2997; GFX9-DL-NEXT:    v_add_u16_e32 v3, v4, v3
2998; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 24, 4
2999; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
3000; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 16, 4
3001; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 20, 4
3002; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 24, 4
3003; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
3004; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, v6, v9
3005; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3006; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v17, s0
3007; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v10, s0
3008; GFX9-DL-NEXT:    v_perm_b32 v10, v16, v15, s0
3009; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v5
3010; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
3011; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v8, v10
3012; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3013; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v2
3014; GFX9-DL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3015; GFX9-DL-NEXT:    v_add_u16_e32 v2, v2, v1
3016; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3017; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
3018; GFX9-DL-NEXT:    global_store_byte v0, v1, s[6:7]
3019; GFX9-DL-NEXT:    s_endpgm
3020;
3021; GFX10-DL-LABEL: udot8_acc4_vecMul:
3022; GFX10-DL:       ; %bb.0: ; %entry
3023; GFX10-DL-NEXT:    s_clause 0x1
3024; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3025; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
3026; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3027; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
3028; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
3029; GFX10-DL-NEXT:    s_mov_b32 s14, -1
3030; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
3031; GFX10-DL-NEXT:    s_add_u32 s12, s12, s11
3032; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
3033; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3034; GFX10-DL-NEXT:    s_clause 0x1
3035; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
3036; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
3037; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
3038; GFX10-DL-NEXT:    global_load_ubyte v3, v0, s[6:7]
3039; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
3040; GFX10-DL-NEXT:    v_and_b32_e32 v4, 15, v1
3041; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
3042; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v2
3043; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
3044; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 4, 4
3045; GFX10-DL-NEXT:    v_bfe_u32 v8, v2, 12, 4
3046; GFX10-DL-NEXT:    v_bfe_u32 v9, v1, 12, 4
3047; GFX10-DL-NEXT:    v_bfe_u32 v10, v1, 20, 4
3048; GFX10-DL-NEXT:    v_perm_b32 v5, v6, v5, 0x5040100
3049; GFX10-DL-NEXT:    v_perm_b32 v4, v7, v4, 0x5040100
3050; GFX10-DL-NEXT:    v_bfe_u32 v6, v1, 8, 4
3051; GFX10-DL-NEXT:    v_bfe_u32 v7, v2, 8, 4
3052; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
3053; GFX10-DL-NEXT:    v_perm_b32 v6, v9, v6, 0x5040100
3054; GFX10-DL-NEXT:    v_perm_b32 v7, v8, v7, 0x5040100
3055; GFX10-DL-NEXT:    v_bfe_u32 v5, v1, 16, 4
3056; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 20, 4
3057; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
3058; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
3059; GFX10-DL-NEXT:    v_add_nc_u16 v3, v4, v3
3060; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
3061; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v6, v6, v7
3062; GFX10-DL-NEXT:    v_perm_b32 v5, v10, v5, 0x5040100
3063; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 24, 4
3064; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v8
3065; GFX10-DL-NEXT:    v_perm_b32 v4, v9, v4, 0x5040100
3066; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
3067; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
3068; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v6
3069; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 24, 4
3070; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
3071; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
3072; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v7, 0x5040100
3073; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v8
3074; GFX10-DL-NEXT:    v_perm_b32 v2, v2, v6, 0x5040100
3075; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
3076; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v4
3077; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
3078; GFX10-DL-NEXT:    v_add_nc_u16 v2, v3, v5
3079; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
3080; GFX10-DL-NEXT:    v_add_nc_u16 v1, v2, v1
3081; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
3082; GFX10-DL-NEXT:    v_and_b32_e32 v1, 15, v1
3083; GFX10-DL-NEXT:    global_store_byte v0, v1, s[6:7]
3084; GFX10-DL-NEXT:    s_endpgm
3085                                             ptr addrspace(1) %src2,
3086                                             ptr addrspace(1) nocapture %dst) {
3087entry:
  ; Each invocation selects one <8 x i4> element from %src1 and from %src2,
  ; indexed by the value returned from the workitem.id.x intrinsic.
3088  %idx = call i32 @llvm.amdgcn.workitem.id.x()
3089  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
3090  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
3091  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
3092  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
3093
  ; Lane-wise multiply performed directly in i4 (no nuw/nsw flags, so each
  ; product wraps to 4 bits); the eight lanes are then pulled out as scalars.
3094  %mul = mul <8 x i4> %vec1, %vec2
3095  %mul0 = extractelement <8 x i4> %mul, i64 0
3096  %mul1 = extractelement <8 x i4> %mul, i64 1
3097  %mul2 = extractelement <8 x i4> %mul, i64 2
3098  %mul3 = extractelement <8 x i4> %mul, i64 3
3099  %mul4 = extractelement <8 x i4> %mul, i64 4
3100  %mul5 = extractelement <8 x i4> %mul, i64 5
3101  %mul6 = extractelement <8 x i4> %mul, i64 6
3102  %mul7 = extractelement <8 x i4> %mul, i64 7
3103
  ; Accumulate in i4: start from the value currently stored at %dst, then add
  ; the products in lane order 0..7.
3104  %acc = load i4, ptr addrspace(1) %dst, align 4
3105  %add1 = add i4 %mul0, %acc
3106  %add2 = add i4 %add1, %mul1
3107  %add3 = add i4 %add2, %mul2
3108  %add4 = add i4 %add3, %mul3
3109  %add5 = add i4 %add4, %mul4
3110  %add6 = add i4 %add5, %mul5
3111  %add7 = add i4 %add6, %mul6
3112  %add8 = add i4 %add7, %mul7
3113
  ; Write the i4 sum back to the same %dst location it was read from.
3114  store i4 %add8, ptr addrspace(1) %dst, align 4
3115  ret void
3116}
3117
; udot8_variant1: unsigned 4-bit dot product written with scalar i32
; shift/mask/multiply operations, where the eight products are NOT summed in
; field order (the accumulator takes mul1, then mul8, then mul2..mul7). In the
; expected output below, the gfx906 and gfx1011/gfx1012 runs still fold the
; whole computation into a single v_dot8_u32_u4, while the other runs expand
; it into per-field extracts and multiply/add instructions.
3118define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
3119; GFX7-LABEL: udot8_variant1:
3120; GFX7:       ; %bb.0: ; %entry
3121; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
3122; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
3123; GFX7-NEXT:    s_mov_b32 s3, 0xf000
3124; GFX7-NEXT:    s_mov_b32 s6, 0
3125; GFX7-NEXT:    s_mov_b32 s7, s3
3126; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3127; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3128; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3129; GFX7-NEXT:    v_mov_b32_e32 v1, 0
3130; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3131; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
3132; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
3133; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
3134; GFX7-NEXT:    s_mov_b32 s2, -1
3135; GFX7-NEXT:    s_waitcnt vmcnt(1)
3136; GFX7-NEXT:    v_and_b32_e32 v1, 15, v2
3137; GFX7-NEXT:    v_bfe_u32 v3, v2, 4, 4
3138; GFX7-NEXT:    s_waitcnt vmcnt(0)
3139; GFX7-NEXT:    v_and_b32_e32 v9, 15, v0
3140; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 4
3141; GFX7-NEXT:    v_bfe_u32 v5, v2, 12, 4
3142; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 4
3143; GFX7-NEXT:    v_bfe_u32 v7, v2, 20, 4
3144; GFX7-NEXT:    v_bfe_u32 v8, v2, 24, 4
3145; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
3146; GFX7-NEXT:    v_bfe_u32 v10, v0, 4, 4
3147; GFX7-NEXT:    v_bfe_u32 v11, v0, 8, 4
3148; GFX7-NEXT:    v_bfe_u32 v12, v0, 12, 4
3149; GFX7-NEXT:    v_bfe_u32 v13, v0, 16, 4
3150; GFX7-NEXT:    v_bfe_u32 v14, v0, 20, 4
3151; GFX7-NEXT:    v_bfe_u32 v15, v0, 24, 4
3152; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
3153; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3154; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v1, s4
3155; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
3156; GFX7-NEXT:    v_mad_u32_u24 v0, v10, v3, v0
3157; GFX7-NEXT:    v_mad_u32_u24 v0, v11, v4, v0
3158; GFX7-NEXT:    v_mad_u32_u24 v0, v12, v5, v0
3159; GFX7-NEXT:    v_mad_u32_u24 v0, v13, v6, v0
3160; GFX7-NEXT:    v_mad_u32_u24 v0, v14, v7, v0
3161; GFX7-NEXT:    v_mad_u32_u24 v0, v15, v8, v0
3162; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3163; GFX7-NEXT:    s_endpgm
3164;
3165; GFX8-LABEL: udot8_variant1:
3166; GFX8:       ; %bb.0: ; %entry
3167; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3168; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
3169; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3170; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3171; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3172; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3173; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3174; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3175; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3176; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3177; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3178; GFX8-NEXT:    flat_load_dword v0, v[0:1]
3179; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
3180; GFX8-NEXT:    s_waitcnt vmcnt(1)
3181; GFX8-NEXT:    v_and_b32_e32 v1, 15, v3
3182; GFX8-NEXT:    v_bfe_u32 v4, v3, 4, 4
3183; GFX8-NEXT:    v_bfe_u32 v6, v3, 8, 4
3184; GFX8-NEXT:    v_bfe_u32 v8, v3, 12, 4
3185; GFX8-NEXT:    v_bfe_u32 v10, v3, 16, 4
3186; GFX8-NEXT:    v_bfe_u32 v12, v3, 20, 4
3187; GFX8-NEXT:    s_waitcnt vmcnt(0)
3188; GFX8-NEXT:    v_and_b32_e32 v2, 15, v0
3189; GFX8-NEXT:    v_bfe_u32 v5, v0, 4, 4
3190; GFX8-NEXT:    v_bfe_u32 v7, v0, 8, 4
3191; GFX8-NEXT:    v_bfe_u32 v9, v0, 12, 4
3192; GFX8-NEXT:    v_bfe_u32 v11, v0, 16, 4
3193; GFX8-NEXT:    v_bfe_u32 v13, v0, 20, 4
3194; GFX8-NEXT:    v_bfe_u32 v14, v3, 24, 4
3195; GFX8-NEXT:    v_bfe_u32 v15, v0, 24, 4
3196; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
3197; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
3198; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3199; GFX8-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
3200; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, v1
3201; GFX8-NEXT:    v_mad_u32_u24 v0, v5, v4, v0
3202; GFX8-NEXT:    v_mad_u32_u24 v0, v7, v6, v0
3203; GFX8-NEXT:    v_mad_u32_u24 v0, v9, v8, v0
3204; GFX8-NEXT:    v_mad_u32_u24 v0, v11, v10, v0
3205; GFX8-NEXT:    v_mad_u32_u24 v0, v13, v12, v0
3206; GFX8-NEXT:    v_mad_u32_u24 v2, v15, v14, v0
3207; GFX8-NEXT:    v_mov_b32_e32 v0, s4
3208; GFX8-NEXT:    v_mov_b32_e32 v1, s5
3209; GFX8-NEXT:    flat_store_dword v[0:1], v2
3210; GFX8-NEXT:    s_endpgm
3211;
3212; GFX9-LABEL: udot8_variant1:
3213; GFX9:       ; %bb.0: ; %entry
3214; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3215; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
3216; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3217; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3218; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
3219; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
3220; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
3221; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3222; GFX9-NEXT:    s_waitcnt vmcnt(1)
3223; GFX9-NEXT:    v_and_b32_e32 v3, 15, v1
3224; GFX9-NEXT:    s_waitcnt vmcnt(0)
3225; GFX9-NEXT:    v_and_b32_e32 v4, 15, v2
3226; GFX9-NEXT:    v_bfe_u32 v5, v1, 4, 4
3227; GFX9-NEXT:    v_bfe_u32 v6, v2, 4, 4
3228; GFX9-NEXT:    v_bfe_u32 v7, v1, 8, 4
3229; GFX9-NEXT:    v_bfe_u32 v8, v2, 8, 4
3230; GFX9-NEXT:    v_bfe_u32 v9, v1, 12, 4
3231; GFX9-NEXT:    v_bfe_u32 v10, v2, 12, 4
3232; GFX9-NEXT:    v_bfe_u32 v11, v1, 16, 4
3233; GFX9-NEXT:    v_bfe_u32 v12, v2, 16, 4
3234; GFX9-NEXT:    v_bfe_u32 v13, v1, 20, 4
3235; GFX9-NEXT:    v_bfe_u32 v14, v2, 20, 4
3236; GFX9-NEXT:    v_bfe_u32 v15, v1, 24, 4
3237; GFX9-NEXT:    v_bfe_u32 v16, v2, 24, 4
3238; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
3239; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
3240; GFX9-NEXT:    v_mul_u32_u24_e32 v3, v4, v3
3241; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v2, v1
3242; GFX9-NEXT:    v_mul_u32_u24_e32 v4, v6, v5
3243; GFX9-NEXT:    v_mul_u32_u24_e32 v5, v8, v7
3244; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3245; GFX9-NEXT:    v_add3_u32 v1, v3, s0, v1
3246; GFX9-NEXT:    v_mul_u32_u24_e32 v6, v10, v9
3247; GFX9-NEXT:    v_mul_u32_u24_e32 v7, v12, v11
3248; GFX9-NEXT:    v_add3_u32 v1, v1, v4, v5
3249; GFX9-NEXT:    v_mul_u32_u24_e32 v8, v14, v13
3250; GFX9-NEXT:    v_mul_u32_u24_e32 v9, v16, v15
3251; GFX9-NEXT:    v_add3_u32 v1, v1, v6, v7
3252; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v9
3253; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
3254; GFX9-NEXT:    s_endpgm
3255;
3256; GFX9-DL-LABEL: udot8_variant1:
3257; GFX9-DL:       ; %bb.0: ; %entry
3258; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3259; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
3260; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3261; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
3262; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
3263; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
3264; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
3265; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
3266; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3267; GFX9-DL-NEXT:    v_dot8_u32_u4 v1, v2, v1, s0
3268; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
3269; GFX9-DL-NEXT:    s_endpgm
3270;
3271; GFX10-DL-LABEL: udot8_variant1:
3272; GFX10-DL:       ; %bb.0: ; %entry
3273; GFX10-DL-NEXT:    s_clause 0x1
3274; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3275; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
3276; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3277; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3278; GFX10-DL-NEXT:    s_clause 0x1
3279; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
3280; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
3281; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
3282; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
3283; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
3284; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3285; GFX10-DL-NEXT:    v_dot8_u32_u4 v1, v2, v1, s0
3286; GFX10-DL-NEXT:    global_store_dword v0, v1, s[6:7]
3287; GFX10-DL-NEXT:    s_endpgm
3288                                          ptr addrspace(1) %v2addr,
3289                                          ptr addrspace(1) %dst) {
3290entry:
  ; Each invocation loads one i32 from %v1addr and one from %v2addr, indexed by
  ; the value returned from the workitem.id.x intrinsic.
3291  %idx = call i32 @llvm.amdgcn.workitem.id.x()
3292  %gep1 = getelementptr i32, ptr addrspace(1) %v1addr, i32 %idx
3293  %v1 = load i32, ptr addrspace(1) %gep1, align 4
3294  %gep2 = getelementptr i32, ptr addrspace(1) %v2addr, i32 %idx
3295  %v2 = load i32, ptr addrspace(1) %gep2, align 4
  ; mul1..mul7 each multiply one 4-bit field of %v2 by the same field of %v1.
  ; The field is isolated by a logical shift right (by 0, 4, ..., 24) followed
  ; by a mask with 15; mul1 uses bits 3:0 and needs no shift.
3296  %and = and i32 %v1, 15
3297  %and1 = and i32 %v2, 15
3298  %mul1 = mul nuw nsw i32 %and1, %and
3299
3300  %shr = lshr i32 %v1, 4
3301  %and2 = and i32 %shr, 15
3302  %shr3 = lshr i32 %v2, 4
3303  %and4 = and i32 %shr3, 15
3304  %mul2 = mul nuw nsw i32 %and4, %and2
3305
3306  %shr6 = lshr i32 %v1, 8
3307  %and7 = and i32 %shr6, 15
3308  %shr8 = lshr i32 %v2, 8
3309  %and9 = and i32 %shr8, 15
3310  %mul3 = mul nuw nsw i32 %and9, %and7
3311
3312  %shr12 = lshr i32 %v1, 12
3313  %and13 = and i32 %shr12, 15
3314  %shr14 = lshr i32 %v2, 12
3315  %and15 = and i32 %shr14, 15
3316  %mul4 = mul nuw nsw i32 %and15, %and13
3317
3318  %shr18 = lshr i32 %v1, 16
3319  %and19 = and i32 %shr18, 15
3320  %shr20 = lshr i32 %v2, 16
3321  %and21 = and i32 %shr20, 15
3322  %mul5 = mul nuw nsw i32 %and21, %and19
3323
3324  %shr24 = lshr i32 %v1, 20
3325  %and25 = and i32 %shr24, 15
3326  %shr26 = lshr i32 %v2, 20
3327  %and27 = and i32 %shr26, 15
3328  %mul6 = mul nuw nsw i32 %and27, %and25
3329
3330  %shr30 = lshr i32 %v1, 24
3331  %and31 = and i32 %shr30, 15
3332  %shr32 = lshr i32 %v2, 24
3333  %and33 = and i32 %shr32, 15
3334  %mul7 = mul nuw nsw i32 %and33, %and31
3335
  ; Top field (bits 31:28): the logical shift by 28 already leaves only four
  ; bits, so no mask is applied before the multiply.
3336  %shr36 = lshr i32 %v1, 28
3337  %shr37 = lshr i32 %v2, 28
3338  %mul8 = mul nuw nsw i32 %shr37, %shr36
3339  %acc = load i32, ptr addrspace(1) %dst, align 4
3340
  ; Accumulation order is deliberately not the field order: the i32 loaded
  ; from %dst is combined with mul1, then mul8, and only then mul2..mul7.
3341  %add1 = add i32 %mul1, %acc
3342  %add2 = add i32 %add1, %mul8
3343  %add3 = add i32 %add2, %mul2
3344  %add4 = add i32 %add3, %mul3
3345  %add5 = add i32 %add4, %mul4
3346  %add6 = add i32 %add5, %mul5
3347  %add7 = add i32 %add6, %mul6
3348  %add8 = add i32 %add7, %mul7
3349  store i32 %add8, ptr addrspace(1) %dst, align 4
3350  ret void
3351}
3352
; Intrinsic returning an i32; the kernels above call it to obtain %idx, the
; element index used in their source-pointer getelementptr instructions.
3353declare i32 @llvm.amdgcn.workitem.id.x()
3354