xref: /llvm-project/llvm/test/CodeGen/AMDGPU/idot8s.ll (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
8; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
9; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
10
; Signed 4-bit dot product with a 32-bit accumulator.
; Each work-item loads one <8 x i4> vector from %src1 and one from %src2,
; sign-extends every lane to i32, multiplies matching lanes, and adds the
; eight products to the i32 read from %dst before storing the sum back there.
; The autogenerated checks interleaved with the signature record the expected
; code per target: the gfx700 and gfx803 output extracts lanes with v_bfe_i32
; and chains v_mad_i32_i24, the gfx900 output uses v_mul_i32_i24 followed by
; v_add3_u32, and the gfx906 and gfx10 outputs contain a single v_dot8_i32_i4.
11define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
12; GFX7-LABEL: idot8_acc32:
13; GFX7:       ; %bb.0: ; %entry
14; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
15; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
16; GFX7-NEXT:    s_mov_b32 s14, -1
17; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
18; GFX7-NEXT:    s_add_u32 s12, s12, s11
19; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
20; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
21; GFX7-NEXT:    s_mov_b32 s3, 0xf000
22; GFX7-NEXT:    s_mov_b32 s6, 0
23; GFX7-NEXT:    s_mov_b32 s7, s3
24; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
25; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
26; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
27; GFX7-NEXT:    v_mov_b32_e32 v1, 0
28; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
29; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
30; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
31; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
32; GFX7-NEXT:    s_mov_b32 s2, -1
33; GFX7-NEXT:    s_addc_u32 s13, s13, 0
34; GFX7-NEXT:    s_waitcnt vmcnt(1)
35; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 4
36; GFX7-NEXT:    v_bfe_i32 v3, v2, 4, 4
37; GFX7-NEXT:    s_waitcnt vmcnt(0)
38; GFX7-NEXT:    v_bfe_i32 v9, v0, 0, 4
39; GFX7-NEXT:    v_bfe_i32 v10, v0, 4, 4
40; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
41; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v9, s4
42; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 4
43; GFX7-NEXT:    v_bfe_i32 v11, v0, 8, 4
44; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v10, v1
45; GFX7-NEXT:    v_bfe_i32 v5, v2, 12, 4
46; GFX7-NEXT:    v_bfe_i32 v12, v0, 12, 4
47; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v11, v1
48; GFX7-NEXT:    v_bfe_i32 v6, v2, 16, 4
49; GFX7-NEXT:    v_bfe_i32 v13, v0, 16, 4
50; GFX7-NEXT:    v_mad_i32_i24 v1, v5, v12, v1
51; GFX7-NEXT:    v_bfe_i32 v7, v2, 20, 4
52; GFX7-NEXT:    v_bfe_i32 v14, v0, 20, 4
53; GFX7-NEXT:    v_mad_i32_i24 v1, v6, v13, v1
54; GFX7-NEXT:    v_bfe_i32 v8, v2, 24, 4
55; GFX7-NEXT:    v_bfe_i32 v15, v0, 24, 4
56; GFX7-NEXT:    v_mad_i32_i24 v1, v7, v14, v1
57; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
58; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
59; GFX7-NEXT:    v_mad_i32_i24 v1, v8, v15, v1
60; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
61; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
62; GFX7-NEXT:    s_endpgm
63;
64; GFX8-LABEL: idot8_acc32:
65; GFX8:       ; %bb.0: ; %entry
66; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
67; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
68; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
69; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
70; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
71; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
72; GFX8-NEXT:    v_mov_b32_e32 v1, s1
73; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
74; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
75; GFX8-NEXT:    flat_load_dword v3, v[0:1]
76; GFX8-NEXT:    v_mov_b32_e32 v1, s3
77; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
78; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
79; GFX8-NEXT:    flat_load_dword v0, v[0:1]
80; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
81; GFX8-NEXT:    s_mov_b32 s14, -1
82; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
83; GFX8-NEXT:    s_add_u32 s12, s12, s11
84; GFX8-NEXT:    s_addc_u32 s13, s13, 0
85; GFX8-NEXT:    s_waitcnt vmcnt(1)
86; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 4
87; GFX8-NEXT:    v_bfe_i32 v4, v3, 4, 4
88; GFX8-NEXT:    v_bfe_i32 v6, v3, 8, 4
89; GFX8-NEXT:    v_bfe_i32 v8, v3, 12, 4
90; GFX8-NEXT:    v_bfe_i32 v10, v3, 16, 4
91; GFX8-NEXT:    v_bfe_i32 v12, v3, 20, 4
92; GFX8-NEXT:    s_waitcnt vmcnt(0)
93; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 4
94; GFX8-NEXT:    v_bfe_i32 v5, v0, 4, 4
95; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
96; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s0
97; GFX8-NEXT:    v_bfe_i32 v7, v0, 8, 4
98; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
99; GFX8-NEXT:    v_bfe_i32 v9, v0, 12, 4
100; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v7, v1
101; GFX8-NEXT:    v_bfe_i32 v11, v0, 16, 4
102; GFX8-NEXT:    v_mad_i32_i24 v1, v8, v9, v1
103; GFX8-NEXT:    v_bfe_i32 v13, v0, 20, 4
104; GFX8-NEXT:    v_mad_i32_i24 v1, v10, v11, v1
105; GFX8-NEXT:    v_bfe_i32 v14, v3, 24, 4
106; GFX8-NEXT:    v_bfe_i32 v15, v0, 24, 4
107; GFX8-NEXT:    v_mad_i32_i24 v1, v12, v13, v1
108; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 28, v3
109; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
110; GFX8-NEXT:    v_mad_i32_i24 v1, v14, v15, v1
111; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
112; GFX8-NEXT:    v_mov_b32_e32 v0, s4
113; GFX8-NEXT:    v_mov_b32_e32 v1, s5
114; GFX8-NEXT:    flat_store_dword v[0:1], v2
115; GFX8-NEXT:    s_endpgm
116;
117; GFX9-LABEL: idot8_acc32:
118; GFX9:       ; %bb.0: ; %entry
119; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
120; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
121; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
122; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
123; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
124; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
125; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
126; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
127; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
128; GFX9-NEXT:    s_mov_b32 s14, -1
129; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
130; GFX9-NEXT:    s_add_u32 s12, s12, s11
131; GFX9-NEXT:    v_mov_b32_e32 v0, 0
132; GFX9-NEXT:    s_addc_u32 s13, s13, 0
133; GFX9-NEXT:    s_waitcnt vmcnt(1)
134; GFX9-NEXT:    v_bfe_i32 v3, v1, 0, 4
135; GFX9-NEXT:    s_waitcnt vmcnt(0)
136; GFX9-NEXT:    v_bfe_i32 v4, v2, 0, 4
137; GFX9-NEXT:    v_bfe_i32 v5, v1, 4, 4
138; GFX9-NEXT:    v_bfe_i32 v6, v2, 4, 4
139; GFX9-NEXT:    v_bfe_i32 v7, v1, 8, 4
140; GFX9-NEXT:    v_bfe_i32 v8, v2, 8, 4
141; GFX9-NEXT:    v_bfe_i32 v9, v1, 12, 4
142; GFX9-NEXT:    v_bfe_i32 v10, v2, 12, 4
143; GFX9-NEXT:    v_bfe_i32 v11, v1, 16, 4
144; GFX9-NEXT:    v_bfe_i32 v12, v2, 16, 4
145; GFX9-NEXT:    v_bfe_i32 v13, v1, 20, 4
146; GFX9-NEXT:    v_bfe_i32 v14, v2, 20, 4
147; GFX9-NEXT:    v_bfe_i32 v15, v1, 24, 4
148; GFX9-NEXT:    v_bfe_i32 v16, v2, 24, 4
149; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
150; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
151; GFX9-NEXT:    v_mul_i32_i24_e32 v3, v3, v4
152; GFX9-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
153; GFX9-NEXT:    v_mul_i32_i24_e32 v5, v7, v8
154; GFX9-NEXT:    v_mul_i32_i24_e32 v6, v9, v10
155; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
156; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
157; GFX9-NEXT:    v_add3_u32 v2, v3, s0, v4
158; GFX9-NEXT:    v_mul_i32_i24_e32 v7, v11, v12
159; GFX9-NEXT:    v_mul_i32_i24_e32 v8, v13, v14
160; GFX9-NEXT:    v_add3_u32 v2, v2, v5, v6
161; GFX9-NEXT:    v_mul_i32_i24_e32 v9, v15, v16
162; GFX9-NEXT:    v_add3_u32 v2, v2, v7, v8
163; GFX9-NEXT:    v_add3_u32 v1, v2, v9, v1
164; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
165; GFX9-NEXT:    s_endpgm
166;
167; GFX9-DL-LABEL: idot8_acc32:
168; GFX9-DL:       ; %bb.0: ; %entry
169; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
170; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
171; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
172; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
173; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
174; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
175; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
176; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
177; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
178; GFX9-DL-NEXT:    s_mov_b32 s14, -1
179; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
180; GFX9-DL-NEXT:    s_add_u32 s12, s12, s11
181; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
182; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
183; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
184; GFX9-DL-NEXT:    v_dot8_i32_i4 v1, v1, v2, s0
185; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
186; GFX9-DL-NEXT:    s_endpgm
187;
188; GFX10-DL-XNACK-LABEL: idot8_acc32:
189; GFX10-DL-XNACK:       ; %bb.0: ; %entry
190; GFX10-DL-XNACK-NEXT:    s_clause 0x1
191; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
192; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
193; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
194; GFX10-DL-XNACK-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
195; GFX10-DL-XNACK-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
196; GFX10-DL-XNACK-NEXT:    s_mov_b32 s14, -1
197; GFX10-DL-XNACK-NEXT:    s_mov_b32 s15, 0x31c16000
198; GFX10-DL-XNACK-NEXT:    s_add_u32 s12, s12, s11
199; GFX10-DL-XNACK-NEXT:    s_addc_u32 s13, s13, 0
200; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
201; GFX10-DL-XNACK-NEXT:    s_clause 0x1
202; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[0:1]
203; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[2:3]
204; GFX10-DL-XNACK-NEXT:    s_waitcnt_depctr 0xffe3
205; GFX10-DL-XNACK-NEXT:    s_load_dword s0, s[6:7], 0x0
206; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
207; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
208; GFX10-DL-XNACK-NEXT:    v_dot8_i32_i4 v1, v1, v2, s0
209; GFX10-DL-XNACK-NEXT:    global_store_dword v0, v1, s[6:7]
210; GFX10-DL-XNACK-NEXT:    s_endpgm
211;
212; GFX10-DL-NOXNACK-LABEL: idot8_acc32:
213; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
214; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
215; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
216; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
217; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
218; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
219; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
220; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s14, -1
221; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s15, 0x31c16000
222; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s12, s12, s11
223; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s13, s13, 0
224; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
225; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
226; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[0:1]
227; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[2:3]
228; GFX10-DL-NOXNACK-NEXT:    s_load_dword s0, s[4:5], 0x0
229; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
230; GFX10-DL-NOXNACK-NEXT:    v_dot8_i32_i4 v0, v1, v0, s0
231; GFX10-DL-NOXNACK-NEXT:    global_store_dword v2, v0, s[4:5]
232; GFX10-DL-NOXNACK-NEXT:    s_endpgm
233                                       ptr addrspace(1) %src2,
234                                       ptr addrspace(1) nocapture %dst) {
235entry:
  ; Index both source buffers by the work-item id, in units of <8 x i4>, and
  ; load one packed vector from each.
236  %idx = call i32 @llvm.amdgcn.workitem.id.x()
237  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
238  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
239  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
240  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
241
  ; Lanes 0 through 7 all follow the same recipe: extract lane k from each
  ; vector, sign-extend i4 -> i32, and multiply the pair into %mulk.
  ; (review) the multiplies carry nuw as well as nsw even though both operands
  ; are sign-extended; presumably the checks were generated with these flags,
  ; so confirm before changing them.
242  %v1e0 = extractelement <8 x i4> %vec1, i64 0
243  %cv1e0 = sext i4 %v1e0 to i32
244  %v2e0 = extractelement <8 x i4> %vec2, i64 0
245  %cv2e0 = sext i4 %v2e0 to i32
246  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
247
248  %v1e1 = extractelement <8 x i4> %vec1, i64 1
249  %cv1e1 = sext i4 %v1e1 to i32
250  %v2e1 = extractelement <8 x i4> %vec2, i64 1
251  %cv2e1 = sext i4 %v2e1 to i32
252  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
253
254  %v1e2 = extractelement <8 x i4> %vec1, i64 2
255  %cv1e2 = sext i4 %v1e2 to i32
256  %v2e2 = extractelement <8 x i4> %vec2, i64 2
257  %cv2e2 = sext i4 %v2e2 to i32
258  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
259
260  %v1e3 = extractelement <8 x i4> %vec1, i64 3
261  %cv1e3 = sext i4 %v1e3 to i32
262  %v2e3 = extractelement <8 x i4> %vec2, i64 3
263  %cv2e3 = sext i4 %v2e3 to i32
264  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
265
266  %v1e4 = extractelement <8 x i4> %vec1, i64 4
267  %cv1e4 = sext i4 %v1e4 to i32
268  %v2e4 = extractelement <8 x i4> %vec2, i64 4
269  %cv2e4 = sext i4 %v2e4 to i32
270  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
271
272  %v1e5 = extractelement <8 x i4> %vec1, i64 5
273  %cv1e5 = sext i4 %v1e5 to i32
274  %v2e5 = extractelement <8 x i4> %vec2, i64 5
275  %cv2e5 = sext i4 %v2e5 to i32
276  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
277
278  %v1e6 = extractelement <8 x i4> %vec1, i64 6
279  %cv1e6 = sext i4 %v1e6 to i32
280  %v2e6 = extractelement <8 x i4> %vec2, i64 6
281  %cv2e6 = sext i4 %v2e6 to i32
282  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
283
284  %v1e7 = extractelement <8 x i4> %vec1, i64 7
285  %cv1e7 = sext i4 %v1e7 to i32
286  %v2e7 = extractelement <8 x i4> %vec2, i64 7
287  %cv2e7 = sext i4 %v2e7 to i32
288  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
289
  ; Seed the sum with the i32 currently stored at %dst, then add the eight
  ; products one at a time in lane order (a linear chain, not a tree).
290  %acc = load i32, ptr addrspace(1) %dst, align 4
291  %add1 = add i32 %mul0, %acc
292  %add2 = add i32 %add1, %mul1
293  %add3 = add i32 %add2, %mul2
294  %add4 = add i32 %add3, %mul3
295  %add5 = add i32 %add4, %mul4
296  %add6 = add i32 %add5, %mul5
297  %add7 = add i32 %add6, %mul6
298  %add8 = add i32 %add7, %mul7
299
  ; Write the accumulated dot product back over the original accumulator.
300  store i32 %add8, ptr addrspace(1) %dst, align 4
301  ret void
302}
303
304; TODO: Once the unnecessary zero extensions of the elements are removed,
305; the pattern recognizer will kick in.
306define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
307; GFX7-LABEL: idot8_acc16:
308; GFX7:       ; %bb.0: ; %entry
309; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
310; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
311; GFX7-NEXT:    s_mov_b32 s14, -1
312; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
313; GFX7-NEXT:    s_add_u32 s12, s12, s11
314; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
315; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
316; GFX7-NEXT:    s_mov_b32 s3, 0xf000
317; GFX7-NEXT:    s_mov_b32 s6, 0
318; GFX7-NEXT:    s_mov_b32 s7, s3
319; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
320; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
321; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
322; GFX7-NEXT:    v_mov_b32_e32 v1, 0
323; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
324; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
325; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
326; GFX7-NEXT:    s_mov_b32 s2, -1
327; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
328; GFX7-NEXT:    s_addc_u32 s13, s13, 0
329; GFX7-NEXT:    s_waitcnt vmcnt(2)
330; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 4
331; GFX7-NEXT:    v_bfe_i32 v4, v2, 4, 4
332; GFX7-NEXT:    s_waitcnt vmcnt(1)
333; GFX7-NEXT:    v_bfe_i32 v10, v0, 0, 4
334; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
335; GFX7-NEXT:    v_bfe_i32 v11, v0, 4, 4
336; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff, v10
337; GFX7-NEXT:    v_bfe_i32 v5, v2, 8, 4
338; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
339; GFX7-NEXT:    v_bfe_i32 v12, v0, 8, 4
340; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff, v11
341; GFX7-NEXT:    s_waitcnt vmcnt(0)
342; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v10, v1
343; GFX7-NEXT:    v_bfe_i32 v6, v2, 12, 4
344; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v5
345; GFX7-NEXT:    v_bfe_i32 v13, v0, 12, 4
346; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff, v12
347; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v11, v1
348; GFX7-NEXT:    v_bfe_i32 v7, v2, 16, 4
349; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
350; GFX7-NEXT:    v_bfe_i32 v14, v0, 16, 4
351; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff, v13
352; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v12, v1
353; GFX7-NEXT:    v_bfe_i32 v8, v2, 20, 4
354; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v7
355; GFX7-NEXT:    v_bfe_i32 v15, v0, 20, 4
356; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff, v14
357; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v13, v1
358; GFX7-NEXT:    v_bfe_i32 v9, v2, 24, 4
359; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v8
360; GFX7-NEXT:    v_bfe_i32 v16, v0, 24, 4
361; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff, v15
362; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v14, v1
363; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
364; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff, v9
365; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
366; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff, v16
367; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v15, v1
368; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
369; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
370; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v16, v1
371; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
372; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
373; GFX7-NEXT:    s_endpgm
374;
375; GFX8-LABEL: idot8_acc16:
376; GFX8:       ; %bb.0: ; %entry
377; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
378; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
379; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
380; GFX8-NEXT:    v_mov_b32_e32 v5, 12
381; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
382; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
383; GFX8-NEXT:    v_mov_b32_e32 v1, s1
384; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
385; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
386; GFX8-NEXT:    flat_load_dword v3, v[0:1]
387; GFX8-NEXT:    v_mov_b32_e32 v1, s3
388; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
389; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
390; GFX8-NEXT:    flat_load_dword v2, v[0:1]
391; GFX8-NEXT:    v_mov_b32_e32 v0, s4
392; GFX8-NEXT:    v_mov_b32_e32 v1, s5
393; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
394; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
395; GFX8-NEXT:    s_mov_b32 s14, -1
396; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
397; GFX8-NEXT:    s_add_u32 s12, s12, s11
398; GFX8-NEXT:    s_addc_u32 s13, s13, 0
399; GFX8-NEXT:    s_waitcnt vmcnt(2)
400; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 4, v3
401; GFX8-NEXT:    v_lshlrev_b16_e32 v16, 12, v3
402; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 28, v3
403; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 20, v3
404; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 12, v3
405; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
406; GFX8-NEXT:    s_waitcnt vmcnt(1)
407; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 4, v2
408; GFX8-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
409; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
410; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 20, v2
411; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 12, v2
412; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
413; GFX8-NEXT:    v_lshlrev_b16_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
414; GFX8-NEXT:    v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
415; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
416; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
417; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v16
418; GFX8-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
419; GFX8-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
420; GFX8-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
421; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
422; GFX8-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
423; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
424; GFX8-NEXT:    v_ashrrev_i16_e32 v15, 12, v15
425; GFX8-NEXT:    s_waitcnt vmcnt(0)
426; GFX8-NEXT:    v_mad_u16 v4, v5, v16, v4
427; GFX8-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
428; GFX8-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
429; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
430; GFX8-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
431; GFX8-NEXT:    v_mad_u16 v4, v10, v15, v4
432; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
433; GFX8-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
434; GFX8-NEXT:    v_mad_u16 v4, v9, v14, v4
435; GFX8-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
436; GFX8-NEXT:    v_ashrrev_i16_e32 v18, 12, v19
437; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
438; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
439; GFX8-NEXT:    v_mad_u16 v4, v8, v13, v4
440; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
441; GFX8-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
442; GFX8-NEXT:    v_mad_u16 v4, v17, v18, v4
443; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
444; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
445; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
446; GFX8-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
447; GFX8-NEXT:    v_mad_u16 v4, v7, v12, v4
448; GFX8-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
449; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
450; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
451; GFX8-NEXT:    v_mad_u16 v2, v6, v11, v2
452; GFX8-NEXT:    flat_store_short v[0:1], v2
453; GFX8-NEXT:    s_endpgm
454;
455; GFX9-LABEL: idot8_acc16:
456; GFX9:       ; %bb.0: ; %entry
457; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
458; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
459; GFX9-NEXT:    s_mov_b32 s14, -1
460; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
461; GFX9-NEXT:    s_add_u32 s12, s12, s11
462; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
463; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
464; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
465; GFX9-NEXT:    v_mov_b32_e32 v4, 12
466; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
467; GFX9-NEXT:    global_load_dword v1, v0, s[8:9]
468; GFX9-NEXT:    global_load_dword v2, v0, s[10:11]
469; GFX9-NEXT:    v_mov_b32_e32 v0, 0
470; GFX9-NEXT:    global_load_ushort v3, v0, s[0:1]
471; GFX9-NEXT:    s_addc_u32 s13, s13, 0
472; GFX9-NEXT:    s_waitcnt vmcnt(2)
473; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
474; GFX9-NEXT:    s_waitcnt vmcnt(1)
475; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
476; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
477; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 12, v2
478; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
479; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
480; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
481; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
482; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
483; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
484; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
485; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
486; GFX9-NEXT:    v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
487; GFX9-NEXT:    v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
488; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
489; GFX9-NEXT:    v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
490; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v15
491; GFX9-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
492; GFX9-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
493; GFX9-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
494; GFX9-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
495; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
496; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
497; GFX9-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
498; GFX9-NEXT:    s_waitcnt vmcnt(0)
499; GFX9-NEXT:    v_mad_legacy_u16 v3, v4, v15, v3
500; GFX9-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
501; GFX9-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
502; GFX9-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
503; GFX9-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
504; GFX9-NEXT:    v_mad_legacy_u16 v3, v9, v14, v3
505; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
506; GFX9-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
507; GFX9-NEXT:    v_mad_legacy_u16 v3, v8, v13, v3
508; GFX9-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
509; GFX9-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
510; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
511; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
512; GFX9-NEXT:    v_mad_legacy_u16 v3, v7, v12, v3
513; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
514; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
515; GFX9-NEXT:    v_mad_legacy_u16 v3, v16, v17, v3
516; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
517; GFX9-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
518; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 12, v5
519; GFX9-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
520; GFX9-NEXT:    v_mad_legacy_u16 v3, v6, v11, v3
521; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
522; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
523; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
524; GFX9-NEXT:    v_mad_legacy_u16 v1, v5, v10, v1
525; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
526; GFX9-NEXT:    s_endpgm
527;
528; GFX9-DL-LABEL: idot8_acc16:
529; GFX9-DL:       ; %bb.0: ; %entry
530; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
531; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
532; GFX9-DL-NEXT:    s_mov_b32 s14, -1
533; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
534; GFX9-DL-NEXT:    s_add_u32 s12, s12, s11
535; GFX9-DL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
536; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
537; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
538; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 12
539; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
540; GFX9-DL-NEXT:    global_load_dword v1, v0, s[8:9]
541; GFX9-DL-NEXT:    global_load_dword v2, v0, s[10:11]
542; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
543; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[0:1]
544; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
545; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
546; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
547; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
548; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
549; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
550; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v16, 12, v2
551; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
552; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
553; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
554; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
555; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
556; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
557; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
558; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
559; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
560; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
561; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
562; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
563; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v15
564; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
565; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
566; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
567; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
568; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
569; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
570; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
571; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
572; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v15, v3
573; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
574; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
575; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
576; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
577; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v9, v14, v3
578; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
579; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
580; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v13, v3
581; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
582; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
583; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
584; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
585; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v7, v12, v3
586; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
587; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
588; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v16, v17, v3
589; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
590; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
591; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v5, 12, v5
592; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
593; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v11, v3
594; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
595; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
596; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
597; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v10, v1
598; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
599; GFX9-DL-NEXT:    s_endpgm
600;
601; GFX10-DL-XNACK-LABEL: idot8_acc16:
602; GFX10-DL-XNACK:       ; %bb.0: ; %entry
603; GFX10-DL-XNACK-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
604; GFX10-DL-XNACK-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
605; GFX10-DL-XNACK-NEXT:    s_mov_b32 s14, -1
606; GFX10-DL-XNACK-NEXT:    s_mov_b32 s15, 0x31c16000
607; GFX10-DL-XNACK-NEXT:    s_add_u32 s12, s12, s11
608; GFX10-DL-XNACK-NEXT:    s_clause 0x1
609; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
610; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
611; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
612; GFX10-DL-XNACK-NEXT:    s_addc_u32 s13, s13, 0
613; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
614; GFX10-DL-XNACK-NEXT:    s_clause 0x1
615; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[8:9]
616; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[10:11]
617; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
618; GFX10-DL-XNACK-NEXT:    global_load_ushort v3, v0, s[0:1]
619; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(2)
620; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
621; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
622; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
623; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
624; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
625; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
626; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
627; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
628; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
629; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v16, 4, v2
630; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v17, 12, v2
631; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
632; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
633; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v13, 20, v2
634; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
635; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
636; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
637; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
638; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
639; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
640; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v17, 12, v17
641; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
642; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 12, v2
643; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
644; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
645; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
646; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v1, v17, v3
647; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v9
648; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v2
649; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
650; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
651; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v10, v16, v1
652; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
653; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v14
654; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
655; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
656; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v3, v2, v1
657; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v7
658; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v10
659; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
660; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
661; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v8, v9, v1
662; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
663; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v12
664; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
665; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
666; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v2, v3, v1
667; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v5
668; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
669; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
670; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
671; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v6, v7, v1
672; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
673; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
674; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v2, v3, v1
675; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v4, v5, v1
676; GFX10-DL-XNACK-NEXT:    global_store_short v0, v1, s[0:1]
677; GFX10-DL-XNACK-NEXT:    s_endpgm
678;
679; GFX10-DL-NOXNACK-LABEL: idot8_acc16:
680; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
681; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
682; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
683; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s14, -1
684; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s15, 0x31c16000
685; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s12, s12, s11
686; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
687; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
688; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
689; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
690; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
691; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s13, s13, 0
692; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
693; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
694; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[8:9]
695; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[10:11]
696; GFX10-DL-NOXNACK-NEXT:    global_load_ushort v3, v2, s[0:1]
697; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(2)
698; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
699; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
700; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
701; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
702; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
703; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
704; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
705; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
706; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
707; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 4, v0
708; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v17, 12, v0
709; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v0
710; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
711; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 20, v0
712; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
713; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
714; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
715; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
716; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
717; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
718; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v17, 12, v17
719; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
720; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
721; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
722; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
723; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
724; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v1, v17, v3
725; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v9
726; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
727; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
728; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
729; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v10, v16, v1
730; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
731; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v14
732; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
733; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
734; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v3, v0, v1
735; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v7
736; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v10
737; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
738; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
739; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v8, v9, v0
740; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
741; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v12
742; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
743; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
744; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v1, v3, v0
745; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v5
746; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
747; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
748; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
749; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v6, v7, v0
750; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
751; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
752; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v1, v3, v0
753; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v4, v5, v0
754; GFX10-DL-NOXNACK-NEXT:    global_store_short v2, v0, s[0:1]
755; GFX10-DL-NOXNACK-NEXT:    s_endpgm
756                                       ptr addrspace(1) %src2,
757                                       ptr addrspace(1) nocapture %dst) {
758entry:
759  %idx = call i32 @llvm.amdgcn.workitem.id.x()
760  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
761  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
762  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
763  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
764
765  %v1e0 = extractelement <8 x i4> %vec1, i64 0
766  %cv1e0 = sext i4 %v1e0 to i16
767  %v2e0 = extractelement <8 x i4> %vec2, i64 0
768  %cv2e0 = sext i4 %v2e0 to i16
769  %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0
770
771  %v1e1 = extractelement <8 x i4> %vec1, i64 1
772  %cv1e1 = sext i4 %v1e1 to i16
773  %v2e1 = extractelement <8 x i4> %vec2, i64 1
774  %cv2e1 = sext i4 %v2e1 to i16
775  %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1
776
777  %v1e2 = extractelement <8 x i4> %vec1, i64 2
778  %cv1e2 = sext i4 %v1e2 to i16
779  %v2e2 = extractelement <8 x i4> %vec2, i64 2
780  %cv2e2 = sext i4 %v2e2 to i16
781  %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2
782
783  %v1e3 = extractelement <8 x i4> %vec1, i64 3
784  %cv1e3 = sext i4 %v1e3 to i16
785  %v2e3 = extractelement <8 x i4> %vec2, i64 3
786  %cv2e3 = sext i4 %v2e3 to i16
787  %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3
788
789  %v1e4 = extractelement <8 x i4> %vec1, i64 4
790  %cv1e4 = sext i4 %v1e4 to i16
791  %v2e4 = extractelement <8 x i4> %vec2, i64 4
792  %cv2e4 = sext i4 %v2e4 to i16
793  %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4
794
795  %v1e5 = extractelement <8 x i4> %vec1, i64 5
796  %cv1e5 = sext i4 %v1e5 to i16
797  %v2e5 = extractelement <8 x i4> %vec2, i64 5
798  %cv2e5 = sext i4 %v2e5 to i16
799  %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5
800
801  %v1e6 = extractelement <8 x i4> %vec1, i64 6
802  %cv1e6 = sext i4 %v1e6 to i16
803  %v2e6 = extractelement <8 x i4> %vec2, i64 6
804  %cv2e6 = sext i4 %v2e6 to i16
805  %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6
806
807  %v1e7 = extractelement <8 x i4> %vec1, i64 7
808  %cv1e7 = sext i4 %v1e7 to i16
809  %v2e7 = extractelement <8 x i4> %vec2, i64 7
810  %cv2e7 = sext i4 %v2e7 to i16
811  %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7
812
813  %acc = load i16, ptr addrspace(1) %dst, align 4
814  %add1 = add i16 %mul0, %acc
815  %add2 = add i16 %add1, %mul1
816  %add3 = add i16 %add2, %mul2
817  %add4 = add i16 %add3, %mul3
818  %add5 = add i16 %add4, %mul4
819  %add6 = add i16 %add5, %mul5
820  %add7 = add i16 %add6, %mul6
821  %add8 = add i16 %add7, %mul7
822
823  store i16 %add8, ptr addrspace(1) %dst, align 4
824  ret void
825}
826
827; TODO: Support this pattern.
; idot8_acc8 - signed 4-bit dot product of two <8 x i4> vectors accumulated
; into an i8 that is loaded from and stored back to %dst.
;
; Each work-item indexes %src1 and %src2 by its workitem id, sign-extends all
; eight i4 lanes of both vectors to i8, multiplies them lane by lane and adds
; the eight products to the accumulator.
;
; The autogenerated assertions below contain no dot-product instruction for
; any of the tested targets. GFX7 is expected to emit a chain of
; v_mad_u32_u24, GFX8 and both GFX10 variants a chain of v_mad_u16, and both
; GFX9 variants a chain of v_mad_legacy_u16.
828define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
829; GFX7-LABEL: idot8_acc8:
830; GFX7:       ; %bb.0: ; %entry
831; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
832; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
833; GFX7-NEXT:    s_mov_b32 s14, -1
834; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
835; GFX7-NEXT:    s_add_u32 s12, s12, s11
836; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
837; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
838; GFX7-NEXT:    s_mov_b32 s3, 0xf000
839; GFX7-NEXT:    s_mov_b32 s6, 0
840; GFX7-NEXT:    s_mov_b32 s7, s3
841; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
842; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
843; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
844; GFX7-NEXT:    v_mov_b32_e32 v1, 0
845; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
846; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
847; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
848; GFX7-NEXT:    s_mov_b32 s2, -1
849; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
850; GFX7-NEXT:    s_addc_u32 s13, s13, 0
851; GFX7-NEXT:    s_waitcnt vmcnt(2)
852; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 4
853; GFX7-NEXT:    v_bfe_i32 v4, v2, 4, 4
854; GFX7-NEXT:    s_waitcnt vmcnt(1)
855; GFX7-NEXT:    v_bfe_i32 v10, v0, 0, 4
856; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v3
857; GFX7-NEXT:    v_bfe_i32 v11, v0, 4, 4
858; GFX7-NEXT:    v_and_b32_e32 v10, 0xff, v10
859; GFX7-NEXT:    v_bfe_i32 v5, v2, 8, 4
860; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v4
861; GFX7-NEXT:    v_bfe_i32 v12, v0, 8, 4
862; GFX7-NEXT:    v_and_b32_e32 v11, 0xff, v11
863; GFX7-NEXT:    s_waitcnt vmcnt(0)
864; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v10, v1
865; GFX7-NEXT:    v_bfe_i32 v6, v2, 12, 4
866; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v5
867; GFX7-NEXT:    v_bfe_i32 v13, v0, 12, 4
868; GFX7-NEXT:    v_and_b32_e32 v12, 0xff, v12
869; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v11, v1
870; GFX7-NEXT:    v_bfe_i32 v7, v2, 16, 4
871; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v6
872; GFX7-NEXT:    v_bfe_i32 v14, v0, 16, 4
873; GFX7-NEXT:    v_and_b32_e32 v13, 0xff, v13
874; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v12, v1
875; GFX7-NEXT:    v_bfe_i32 v8, v2, 20, 4
876; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v7
877; GFX7-NEXT:    v_bfe_i32 v15, v0, 20, 4
878; GFX7-NEXT:    v_and_b32_e32 v14, 0xff, v14
879; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v13, v1
880; GFX7-NEXT:    v_bfe_i32 v9, v2, 24, 4
881; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v8
882; GFX7-NEXT:    v_bfe_i32 v16, v0, 24, 4
883; GFX7-NEXT:    v_and_b32_e32 v15, 0xff, v15
884; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v14, v1
885; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
886; GFX7-NEXT:    v_and_b32_e32 v9, 0xff, v9
887; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
888; GFX7-NEXT:    v_and_b32_e32 v16, 0xff, v16
889; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v15, v1
890; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v2
891; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
892; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v16, v1
893; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
894; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
895; GFX7-NEXT:    s_endpgm
896;
897; GFX8-LABEL: idot8_acc8:
898; GFX8:       ; %bb.0: ; %entry
899; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
900; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
901; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
902; GFX8-NEXT:    v_mov_b32_e32 v5, 12
903; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
904; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
905; GFX8-NEXT:    v_mov_b32_e32 v1, s1
906; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
907; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
908; GFX8-NEXT:    flat_load_dword v3, v[0:1]
909; GFX8-NEXT:    v_mov_b32_e32 v1, s3
910; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
911; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
912; GFX8-NEXT:    flat_load_dword v2, v[0:1]
913; GFX8-NEXT:    v_mov_b32_e32 v0, s4
914; GFX8-NEXT:    v_mov_b32_e32 v1, s5
915; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
916; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
917; GFX8-NEXT:    s_mov_b32 s14, -1
918; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
919; GFX8-NEXT:    s_add_u32 s12, s12, s11
920; GFX8-NEXT:    s_addc_u32 s13, s13, 0
921; GFX8-NEXT:    s_waitcnt vmcnt(2)
922; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 4, v3
923; GFX8-NEXT:    v_lshlrev_b16_e32 v16, 12, v3
924; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 28, v3
925; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 20, v3
926; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 12, v3
927; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
928; GFX8-NEXT:    s_waitcnt vmcnt(1)
929; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 4, v2
930; GFX8-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
931; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
932; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 20, v2
933; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 12, v2
934; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
935; GFX8-NEXT:    v_lshlrev_b16_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
936; GFX8-NEXT:    v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
937; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
938; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
939; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v16
940; GFX8-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
941; GFX8-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
942; GFX8-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
943; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
944; GFX8-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
945; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
946; GFX8-NEXT:    v_ashrrev_i16_e32 v15, 12, v15
947; GFX8-NEXT:    s_waitcnt vmcnt(0)
948; GFX8-NEXT:    v_mad_u16 v4, v5, v16, v4
949; GFX8-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
950; GFX8-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
951; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
952; GFX8-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
953; GFX8-NEXT:    v_mad_u16 v4, v10, v15, v4
954; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
955; GFX8-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
956; GFX8-NEXT:    v_mad_u16 v4, v9, v14, v4
957; GFX8-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
958; GFX8-NEXT:    v_ashrrev_i16_e32 v18, 12, v19
959; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
960; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
961; GFX8-NEXT:    v_mad_u16 v4, v8, v13, v4
962; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
963; GFX8-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
964; GFX8-NEXT:    v_mad_u16 v4, v17, v18, v4
965; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
966; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
967; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
968; GFX8-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
969; GFX8-NEXT:    v_mad_u16 v4, v7, v12, v4
970; GFX8-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
971; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
972; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
973; GFX8-NEXT:    v_mad_u16 v2, v6, v11, v2
974; GFX8-NEXT:    flat_store_byte v[0:1], v2
975; GFX8-NEXT:    s_endpgm
976;
977; GFX9-LABEL: idot8_acc8:
978; GFX9:       ; %bb.0: ; %entry
979; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
980; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
981; GFX9-NEXT:    s_mov_b32 s14, -1
982; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
983; GFX9-NEXT:    s_add_u32 s12, s12, s11
984; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
985; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
986; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
987; GFX9-NEXT:    v_mov_b32_e32 v4, 12
988; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
989; GFX9-NEXT:    global_load_dword v1, v0, s[8:9]
990; GFX9-NEXT:    global_load_dword v2, v0, s[10:11]
991; GFX9-NEXT:    v_mov_b32_e32 v0, 0
992; GFX9-NEXT:    global_load_ubyte v3, v0, s[0:1]
993; GFX9-NEXT:    s_addc_u32 s13, s13, 0
994; GFX9-NEXT:    s_waitcnt vmcnt(2)
995; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
996; GFX9-NEXT:    s_waitcnt vmcnt(1)
997; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
998; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
999; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 12, v2
1000; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
1001; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
1002; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
1003; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
1004; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
1005; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
1006; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
1007; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
1008; GFX9-NEXT:    v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1009; GFX9-NEXT:    v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1010; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1011; GFX9-NEXT:    v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1012; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v15
1013; GFX9-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
1014; GFX9-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
1015; GFX9-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
1016; GFX9-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
1017; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
1018; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
1019; GFX9-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
1020; GFX9-NEXT:    s_waitcnt vmcnt(0)
1021; GFX9-NEXT:    v_mad_legacy_u16 v3, v4, v15, v3
1022; GFX9-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
1023; GFX9-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
1024; GFX9-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
1025; GFX9-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
1026; GFX9-NEXT:    v_mad_legacy_u16 v3, v9, v14, v3
1027; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
1028; GFX9-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
1029; GFX9-NEXT:    v_mad_legacy_u16 v3, v8, v13, v3
1030; GFX9-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
1031; GFX9-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
1032; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
1033; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
1034; GFX9-NEXT:    v_mad_legacy_u16 v3, v7, v12, v3
1035; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
1036; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
1037; GFX9-NEXT:    v_mad_legacy_u16 v3, v16, v17, v3
1038; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
1039; GFX9-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
1040; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 12, v5
1041; GFX9-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
1042; GFX9-NEXT:    v_mad_legacy_u16 v3, v6, v11, v3
1043; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
1044; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
1045; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1046; GFX9-NEXT:    v_mad_legacy_u16 v1, v5, v10, v1
1047; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
1048; GFX9-NEXT:    s_endpgm
1049;
1050; GFX9-DL-LABEL: idot8_acc8:
1051; GFX9-DL:       ; %bb.0: ; %entry
1052; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1053; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1054; GFX9-DL-NEXT:    s_mov_b32 s14, -1
1055; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
1056; GFX9-DL-NEXT:    s_add_u32 s12, s12, s11
1057; GFX9-DL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
1058; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1059; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1060; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 12
1061; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1062; GFX9-DL-NEXT:    global_load_dword v1, v0, s[8:9]
1063; GFX9-DL-NEXT:    global_load_dword v2, v0, s[10:11]
1064; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1065; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[0:1]
1066; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
1067; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
1068; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
1069; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1070; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
1071; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
1072; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v16, 12, v2
1073; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
1074; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
1075; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
1076; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
1077; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
1078; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
1079; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
1080; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
1081; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1082; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1083; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1084; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1085; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v15
1086; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
1087; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
1088; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
1089; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
1090; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
1091; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
1092; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
1093; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1094; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v15, v3
1095; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
1096; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
1097; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
1098; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
1099; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v9, v14, v3
1100; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
1101; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
1102; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v13, v3
1103; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
1104; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
1105; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
1106; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
1107; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v7, v12, v3
1108; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
1109; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
1110; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v16, v17, v3
1111; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
1112; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
1113; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v5, 12, v5
1114; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
1115; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v11, v3
1116; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
1117; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
1118; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1119; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v10, v1
1120; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
1121; GFX9-DL-NEXT:    s_endpgm
1122;
1123; GFX10-DL-XNACK-LABEL: idot8_acc8:
1124; GFX10-DL-XNACK:       ; %bb.0: ; %entry
1125; GFX10-DL-XNACK-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1126; GFX10-DL-XNACK-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1127; GFX10-DL-XNACK-NEXT:    s_mov_b32 s14, -1
1128; GFX10-DL-XNACK-NEXT:    s_mov_b32 s15, 0x31c16000
1129; GFX10-DL-XNACK-NEXT:    s_add_u32 s12, s12, s11
1130; GFX10-DL-XNACK-NEXT:    s_clause 0x1
1131; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
1132; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1133; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1134; GFX10-DL-XNACK-NEXT:    s_addc_u32 s13, s13, 0
1135; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
1136; GFX10-DL-XNACK-NEXT:    s_clause 0x1
1137; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[8:9]
1138; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[10:11]
1139; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
1140; GFX10-DL-XNACK-NEXT:    global_load_ubyte v3, v0, s[0:1]
1141; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(2)
1142; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
1143; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
1144; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
1145; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
1146; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
1147; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
1148; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
1149; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
1150; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
1151; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v16, 4, v2
1152; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v17, 12, v2
1153; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
1154; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
1155; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v13, 20, v2
1156; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
1157; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
1158; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
1159; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
1160; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
1161; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
1162; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v17, 12, v17
1163; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
1164; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 12, v2
1165; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
1166; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
1167; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
1168; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v1, v17, v3
1169; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v9
1170; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v2
1171; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
1172; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
1173; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v10, v16, v1
1174; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
1175; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v14
1176; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
1177; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
1178; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v3, v2, v1
1179; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v7
1180; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v10
1181; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
1182; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
1183; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v8, v9, v1
1184; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
1185; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v12
1186; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
1187; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
1188; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v2, v3, v1
1189; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v5
1190; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
1191; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
1192; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
1193; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v6, v7, v1
1194; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
1195; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
1196; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v2, v3, v1
1197; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v4, v5, v1
1198; GFX10-DL-XNACK-NEXT:    global_store_byte v0, v1, s[0:1]
1199; GFX10-DL-XNACK-NEXT:    s_endpgm
1200;
1201; GFX10-DL-NOXNACK-LABEL: idot8_acc8:
1202; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
1203; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1204; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1205; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s14, -1
1206; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s15, 0x31c16000
1207; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s12, s12, s11
1208; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
1209; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
1210; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1211; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1212; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
1213; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s13, s13, 0
1214; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
1215; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
1216; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[8:9]
1217; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[10:11]
1218; GFX10-DL-NOXNACK-NEXT:    global_load_ubyte v3, v2, s[0:1]
1219; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(2)
1220; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
1221; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
1222; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
1223; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
1224; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
1225; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
1226; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
1227; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
1228; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
1229; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 4, v0
1230; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v17, 12, v0
1231; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v0
1232; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
1233; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 20, v0
1234; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
1235; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
1236; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
1237; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
1238; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
1239; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
1240; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v17, 12, v17
1241; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
1242; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
1243; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
1244; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
1245; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
1246; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v1, v17, v3
1247; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v9
1248; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
1249; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
1250; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
1251; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v10, v16, v1
1252; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
1253; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v14
1254; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
1255; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
1256; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v3, v0, v1
1257; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v7
1258; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v10
1259; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
1260; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
1261; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v8, v9, v0
1262; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
1263; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v12
1264; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
1265; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
1266; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v1, v3, v0
1267; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v5
1268; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
1269; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
1270; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
1271; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v6, v7, v0
1272; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
1273; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
1274; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v1, v3, v0
1275; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v4, v5, v0
1276; GFX10-DL-NOXNACK-NEXT:    global_store_byte v2, v0, s[0:1]
1277; GFX10-DL-NOXNACK-NEXT:    s_endpgm
1278                                       ptr addrspace(1) %src2,
1279                                       ptr addrspace(1) nocapture %dst) {
1280entry:
  ; Pick this work-item's <8 x i4> element out of each source buffer, using
  ; the workitem id as the element index.
1281  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1282  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
1283  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
1284  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
1285  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
1286
  ; Lane 0 - extract the i4 from each vector, sign-extend to i8, multiply.
  ; Lanes 1 through 7 below repeat the same extract / sext / mul pattern.
  ; NOTE(review) every multiply is tagged both nuw and nsw even though its
  ; operands are sign-extended; left untouched because the assertions above
  ; were generated from this exact IR - confirm before changing the flags.
1287  %v1e0 = extractelement <8 x i4> %vec1, i64 0
1288  %cv1e0 = sext i4 %v1e0 to i8
1289  %v2e0 = extractelement <8 x i4> %vec2, i64 0
1290  %cv2e0 = sext i4 %v2e0 to i8
1291  %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0
1292
  ; Lane 1
1293  %v1e1 = extractelement <8 x i4> %vec1, i64 1
1294  %cv1e1 = sext i4 %v1e1 to i8
1295  %v2e1 = extractelement <8 x i4> %vec2, i64 1
1296  %cv2e1 = sext i4 %v2e1 to i8
1297  %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1
1298
  ; Lane 2
1299  %v1e2 = extractelement <8 x i4> %vec1, i64 2
1300  %cv1e2 = sext i4 %v1e2 to i8
1301  %v2e2 = extractelement <8 x i4> %vec2, i64 2
1302  %cv2e2 = sext i4 %v2e2 to i8
1303  %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2
1304
  ; Lane 3
1305  %v1e3 = extractelement <8 x i4> %vec1, i64 3
1306  %cv1e3 = sext i4 %v1e3 to i8
1307  %v2e3 = extractelement <8 x i4> %vec2, i64 3
1308  %cv2e3 = sext i4 %v2e3 to i8
1309  %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3
1310
  ; Lane 4
1311  %v1e4 = extractelement <8 x i4> %vec1, i64 4
1312  %cv1e4 = sext i4 %v1e4 to i8
1313  %v2e4 = extractelement <8 x i4> %vec2, i64 4
1314  %cv2e4 = sext i4 %v2e4 to i8
1315  %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4
1316
  ; Lane 5
1317  %v1e5 = extractelement <8 x i4> %vec1, i64 5
1318  %cv1e5 = sext i4 %v1e5 to i8
1319  %v2e5 = extractelement <8 x i4> %vec2, i64 5
1320  %cv2e5 = sext i4 %v2e5 to i8
1321  %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5
1322
  ; Lane 6
1323  %v1e6 = extractelement <8 x i4> %vec1, i64 6
1324  %cv1e6 = sext i4 %v1e6 to i8
1325  %v2e6 = extractelement <8 x i4> %vec2, i64 6
1326  %cv2e6 = sext i4 %v2e6 to i8
1327  %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6
1328
  ; Lane 7
1329  %v1e7 = extractelement <8 x i4> %vec1, i64 7
1330  %cv1e7 = sext i4 %v1e7 to i8
1331  %v2e7 = extractelement <8 x i4> %vec2, i64 7
1332  %cv2e7 = sext i4 %v2e7 to i8
1333  %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7
1334
  ; Read the i8 accumulator from %dst and add the eight products onto it in
  ; lane order, one add per lane.
1335  %acc = load i8, ptr addrspace(1) %dst, align 4
1336  %add1 = add i8 %mul0, %acc
1337  %add2 = add i8 %add1, %mul1
1338  %add3 = add i8 %add2, %mul2
1339  %add4 = add i8 %add3, %mul3
1340  %add5 = add i8 %add4, %mul4
1341  %add6 = add i8 %add5, %mul5
1342  %add7 = add i8 %add6, %mul6
1343  %add8 = add i8 %add7, %mul7
1344
  ; Write the final sum back to the address the accumulator was read from.
1345  store i8 %add8, ptr addrspace(1) %dst, align 4
1346  ret void
1347}
1348
1349; Make sure the pattern is not recognized if there are multiple uses of the
1350; intermediate multiplications.
1351define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
1352; GFX7-LABEL: idot8_multiuses_mul1:
1353; GFX7:       ; %bb.0: ; %entry
1354; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1355; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1356; GFX7-NEXT:    s_mov_b32 s14, -1
1357; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
1358; GFX7-NEXT:    s_add_u32 s12, s12, s11
1359; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
1360; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
1361; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1362; GFX7-NEXT:    s_mov_b32 s6, 0
1363; GFX7-NEXT:    s_mov_b32 s7, s3
1364; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1365; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
1366; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1367; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1368; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1369; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
1370; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1371; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1372; GFX7-NEXT:    s_mov_b32 s2, -1
1373; GFX7-NEXT:    s_addc_u32 s13, s13, 0
1374; GFX7-NEXT:    s_waitcnt vmcnt(1)
1375; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 4
1376; GFX7-NEXT:    v_bfe_i32 v3, v2, 4, 4
1377; GFX7-NEXT:    s_waitcnt vmcnt(0)
1378; GFX7-NEXT:    v_bfe_i32 v9, v0, 0, 4
1379; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1380; GFX7-NEXT:    v_mad_i32_i24 v16, v1, v9, s4
1381; GFX7-NEXT:    v_bfe_i32 v10, v0, 4, 4
1382; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v9, v16
1383; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 4
1384; GFX7-NEXT:    v_bfe_i32 v11, v0, 8, 4
1385; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v10, v1
1386; GFX7-NEXT:    v_bfe_i32 v5, v2, 12, 4
1387; GFX7-NEXT:    v_bfe_i32 v12, v0, 12, 4
1388; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v11, v1
1389; GFX7-NEXT:    v_bfe_i32 v6, v2, 16, 4
1390; GFX7-NEXT:    v_bfe_i32 v13, v0, 16, 4
1391; GFX7-NEXT:    v_mad_i32_i24 v1, v5, v12, v1
1392; GFX7-NEXT:    v_bfe_i32 v7, v2, 20, 4
1393; GFX7-NEXT:    v_bfe_i32 v14, v0, 20, 4
1394; GFX7-NEXT:    v_mad_i32_i24 v1, v6, v13, v1
1395; GFX7-NEXT:    v_bfe_i32 v8, v2, 24, 4
1396; GFX7-NEXT:    v_bfe_i32 v15, v0, 24, 4
1397; GFX7-NEXT:    v_mad_i32_i24 v1, v7, v14, v1
1398; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
1399; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
1400; GFX7-NEXT:    v_mad_i32_i24 v1, v8, v15, v1
1401; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
1402; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
1403; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1404; GFX7-NEXT:    s_endpgm
1405;
1406; GFX8-LABEL: idot8_multiuses_mul1:
1407; GFX8:       ; %bb.0: ; %entry
1408; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1409; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1410; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1411; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1412; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1413; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1414; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1415; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1416; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1417; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1418; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1419; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1420; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1421; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1422; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
1423; GFX8-NEXT:    s_mov_b32 s14, -1
1424; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
1425; GFX8-NEXT:    s_add_u32 s12, s12, s11
1426; GFX8-NEXT:    s_addc_u32 s13, s13, 0
1427; GFX8-NEXT:    s_waitcnt vmcnt(1)
1428; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 4
1429; GFX8-NEXT:    v_bfe_i32 v4, v3, 4, 4
1430; GFX8-NEXT:    v_bfe_i32 v6, v3, 8, 4
1431; GFX8-NEXT:    v_bfe_i32 v8, v3, 12, 4
1432; GFX8-NEXT:    v_bfe_i32 v10, v3, 16, 4
1433; GFX8-NEXT:    v_bfe_i32 v12, v3, 20, 4
1434; GFX8-NEXT:    s_waitcnt vmcnt(0)
1435; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 4
1436; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1437; GFX8-NEXT:    v_mad_i32_i24 v16, v1, v2, s0
1438; GFX8-NEXT:    v_bfe_i32 v5, v0, 4, 4
1439; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, v16
1440; GFX8-NEXT:    v_bfe_i32 v7, v0, 8, 4
1441; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
1442; GFX8-NEXT:    v_bfe_i32 v9, v0, 12, 4
1443; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v7, v1
1444; GFX8-NEXT:    v_bfe_i32 v11, v0, 16, 4
1445; GFX8-NEXT:    v_mad_i32_i24 v1, v8, v9, v1
1446; GFX8-NEXT:    v_bfe_i32 v13, v0, 20, 4
1447; GFX8-NEXT:    v_mad_i32_i24 v1, v10, v11, v1
1448; GFX8-NEXT:    v_bfe_i32 v14, v3, 24, 4
1449; GFX8-NEXT:    v_bfe_i32 v15, v0, 24, 4
1450; GFX8-NEXT:    v_mad_i32_i24 v1, v12, v13, v1
1451; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 28, v3
1452; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
1453; GFX8-NEXT:    v_mad_i32_i24 v1, v14, v15, v1
1454; GFX8-NEXT:    v_mad_i32_i24 v0, v3, v0, v1
1455; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v16, v0
1456; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1457; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1458; GFX8-NEXT:    flat_store_dword v[0:1], v2
1459; GFX8-NEXT:    s_endpgm
1460;
1461; GFX9-LABEL: idot8_multiuses_mul1:
1462; GFX9:       ; %bb.0: ; %entry
1463; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1464; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1465; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1466; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1467; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1468; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1469; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
1470; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
1471; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
1472; GFX9-NEXT:    s_mov_b32 s14, -1
1473; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
1474; GFX9-NEXT:    s_add_u32 s12, s12, s11
1475; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1476; GFX9-NEXT:    s_addc_u32 s13, s13, 0
1477; GFX9-NEXT:    s_waitcnt vmcnt(1)
1478; GFX9-NEXT:    v_bfe_i32 v3, v1, 0, 4
1479; GFX9-NEXT:    s_waitcnt vmcnt(0)
1480; GFX9-NEXT:    v_bfe_i32 v4, v2, 0, 4
1481; GFX9-NEXT:    v_bfe_i32 v5, v1, 4, 4
1482; GFX9-NEXT:    v_bfe_i32 v6, v2, 4, 4
1483; GFX9-NEXT:    v_bfe_i32 v7, v1, 8, 4
1484; GFX9-NEXT:    v_bfe_i32 v8, v2, 8, 4
1485; GFX9-NEXT:    v_bfe_i32 v9, v1, 12, 4
1486; GFX9-NEXT:    v_bfe_i32 v10, v2, 12, 4
1487; GFX9-NEXT:    v_bfe_i32 v11, v1, 16, 4
1488; GFX9-NEXT:    v_bfe_i32 v12, v2, 16, 4
1489; GFX9-NEXT:    v_bfe_i32 v13, v1, 20, 4
1490; GFX9-NEXT:    v_bfe_i32 v14, v2, 20, 4
1491; GFX9-NEXT:    v_bfe_i32 v15, v1, 24, 4
1492; GFX9-NEXT:    v_bfe_i32 v16, v2, 24, 4
1493; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
1494; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
1495; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
1496; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1497; GFX9-NEXT:    v_mad_i32_i24 v2, v3, v4, s0
1498; GFX9-NEXT:    v_mul_i32_i24_e32 v5, v5, v6
1499; GFX9-NEXT:    v_mul_i32_i24_e32 v6, v7, v8
1500; GFX9-NEXT:    v_mad_i32_i24 v3, v3, v4, v2
1501; GFX9-NEXT:    v_mul_i32_i24_e32 v7, v9, v10
1502; GFX9-NEXT:    v_mul_i32_i24_e32 v8, v11, v12
1503; GFX9-NEXT:    v_add3_u32 v3, v3, v5, v6
1504; GFX9-NEXT:    v_mul_i32_i24_e32 v9, v13, v14
1505; GFX9-NEXT:    v_mul_i32_i24_e32 v10, v15, v16
1506; GFX9-NEXT:    v_add3_u32 v3, v3, v7, v8
1507; GFX9-NEXT:    v_add3_u32 v3, v3, v9, v10
1508; GFX9-NEXT:    v_add3_u32 v1, v3, v1, v2
1509; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
1510; GFX9-NEXT:    s_endpgm
1511;
1512; GFX9-DL-LABEL: idot8_multiuses_mul1:
1513; GFX9-DL:       ; %bb.0: ; %entry
1514; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1515; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1516; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1517; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1518; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1519; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1520; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1521; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1522; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1523; GFX9-DL-NEXT:    s_mov_b32 s14, -1
1524; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
1525; GFX9-DL-NEXT:    s_add_u32 s12, s12, s11
1526; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1527; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
1528; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1529; GFX9-DL-NEXT:    v_bfe_i32 v3, v1, 0, 4
1530; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1531; GFX9-DL-NEXT:    v_bfe_i32 v4, v2, 0, 4
1532; GFX9-DL-NEXT:    v_bfe_i32 v5, v1, 4, 4
1533; GFX9-DL-NEXT:    v_bfe_i32 v6, v2, 4, 4
1534; GFX9-DL-NEXT:    v_bfe_i32 v7, v1, 8, 4
1535; GFX9-DL-NEXT:    v_bfe_i32 v8, v2, 8, 4
1536; GFX9-DL-NEXT:    v_bfe_i32 v9, v1, 12, 4
1537; GFX9-DL-NEXT:    v_bfe_i32 v10, v2, 12, 4
1538; GFX9-DL-NEXT:    v_bfe_i32 v11, v1, 16, 4
1539; GFX9-DL-NEXT:    v_bfe_i32 v12, v2, 16, 4
1540; GFX9-DL-NEXT:    v_bfe_i32 v13, v1, 20, 4
1541; GFX9-DL-NEXT:    v_bfe_i32 v14, v2, 20, 4
1542; GFX9-DL-NEXT:    v_bfe_i32 v15, v1, 24, 4
1543; GFX9-DL-NEXT:    v_bfe_i32 v16, v2, 24, 4
1544; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
1545; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
1546; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
1547; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1548; GFX9-DL-NEXT:    v_mad_i32_i24 v2, v3, v4, s0
1549; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v5, v5, v6
1550; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v6, v7, v8
1551; GFX9-DL-NEXT:    v_mad_i32_i24 v3, v3, v4, v2
1552; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v7, v9, v10
1553; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v8, v11, v12
1554; GFX9-DL-NEXT:    v_add3_u32 v3, v3, v5, v6
1555; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v9, v13, v14
1556; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v10, v15, v16
1557; GFX9-DL-NEXT:    v_add3_u32 v3, v3, v7, v8
1558; GFX9-DL-NEXT:    v_add3_u32 v3, v3, v9, v10
1559; GFX9-DL-NEXT:    v_add3_u32 v1, v3, v1, v2
1560; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
1561; GFX9-DL-NEXT:    s_endpgm
1562;
1563; GFX10-DL-XNACK-LABEL: idot8_multiuses_mul1:
1564; GFX10-DL-XNACK:       ; %bb.0: ; %entry
1565; GFX10-DL-XNACK-NEXT:    s_clause 0x1
1566; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1567; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1568; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1569; GFX10-DL-XNACK-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1570; GFX10-DL-XNACK-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1571; GFX10-DL-XNACK-NEXT:    s_mov_b32 s14, -1
1572; GFX10-DL-XNACK-NEXT:    s_mov_b32 s15, 0x31c16000
1573; GFX10-DL-XNACK-NEXT:    s_add_u32 s12, s12, s11
1574; GFX10-DL-XNACK-NEXT:    s_addc_u32 s13, s13, 0
1575; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
1576; GFX10-DL-XNACK-NEXT:    s_clause 0x1
1577; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[0:1]
1578; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[2:3]
1579; GFX10-DL-XNACK-NEXT:    s_waitcnt_depctr 0xffe3
1580; GFX10-DL-XNACK-NEXT:    s_load_dword s0, s[6:7], 0x0
1581; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
1582; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v0, v1, 0, 4
1583; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v3, v1, 4, 4
1584; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
1585; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v4, v2, 4, 4
1586; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v5, v1, 8, 4
1587; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v6, v2, 8, 4
1588; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v7, v2, 0, 4
1589; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v8, v1, 12, 4
1590; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v3, v3, v4
1591; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v9, v2, 12, 4
1592; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
1593; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
1594; GFX10-DL-XNACK-NEXT:    v_mad_i32_i24 v5, v0, v7, s0
1595; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v6, v1, 16, 4
1596; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v10, v2, 16, 4
1597; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v11, v1, 20, 4
1598; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v12, v2, 20, 4
1599; GFX10-DL-XNACK-NEXT:    v_mad_i32_i24 v0, v0, v7, v5
1600; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v7, v1, 24, 4
1601; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v13, v2, 24, 4
1602; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v8, v8, v9
1603; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v6, v6, v10
1604; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v3, v4
1605; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v3, v11, v12
1606; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v4, v7, v13
1607; GFX10-DL-XNACK-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
1608; GFX10-DL-XNACK-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
1609; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v8, v6
1610; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
1611; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v3, v4
1612; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v2, 0
1613; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v1, v5
1614; GFX10-DL-XNACK-NEXT:    global_store_dword v2, v0, s[6:7]
1615; GFX10-DL-XNACK-NEXT:    s_endpgm
1616;
1617; GFX10-DL-NOXNACK-LABEL: idot8_multiuses_mul1:
1618; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
1619; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1620; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1621; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1622; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1623; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1624; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s14, -1
1625; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s15, 0x31c16000
1626; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s12, s12, s11
1627; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s13, s13, 0
1628; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
1629; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
1630; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[0:1]
1631; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[2:3]
1632; GFX10-DL-NOXNACK-NEXT:    s_load_dword s0, s[4:5], 0x0
1633; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
1634; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v2, v1, 0, 4
1635; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v3, v1, 4, 4
1636; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
1637; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v4, v0, 4, 4
1638; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v5, v1, 8, 4
1639; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v6, v0, 8, 4
1640; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v7, v0, 0, 4
1641; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v8, v1, 12, 4
1642; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v3, v3, v4
1643; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v9, v0, 12, 4
1644; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
1645; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
1646; GFX10-DL-NOXNACK-NEXT:    v_mad_i32_i24 v5, v2, v7, s0
1647; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v6, v1, 16, 4
1648; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v10, v0, 16, 4
1649; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v11, v1, 20, 4
1650; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v12, v0, 20, 4
1651; GFX10-DL-NOXNACK-NEXT:    v_mad_i32_i24 v2, v2, v7, v5
1652; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v7, v1, 24, 4
1653; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v13, v0, 24, 4
1654; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v8, v8, v9
1655; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v6, v6, v10
1656; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v2, v2, v3, v4
1657; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v3, v11, v12
1658; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v4, v7, v13
1659; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
1660; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
1661; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v2, v2, v8, v6
1662; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v0, v1, v0
1663; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v1, v2, v3, v4
1664; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
1665; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v0, v1, v0, v5
1666; GFX10-DL-NOXNACK-NEXT:    global_store_dword v2, v0, s[4:5]
1667; GFX10-DL-NOXNACK-NEXT:    s_endpgm
1668                                                ptr addrspace(1) %src2,
1669                                                ptr addrspace(1) nocapture %dst) {
1670entry:
  ; Load one <8 x i4> vector from %src1 and one from %src2, both indexed by
  ; the work-item id.
1671  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1672  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
1673  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
1674  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
1675  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
1676
  ; For each of the 8 lanes: extract the i4 element from both vectors,
  ; sign-extend it to i32 and multiply the pair (%mul0 .. %mul7).
1677  %v1e0 = extractelement <8 x i4> %vec1, i64 0
1678  %cv1e0 = sext i4 %v1e0 to i32
1679  %v2e0 = extractelement <8 x i4> %vec2, i64 0
1680  %cv2e0 = sext i4 %v2e0 to i32
1681  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
1682
1683  %v1e1 = extractelement <8 x i4> %vec1, i64 1
1684  %cv1e1 = sext i4 %v1e1 to i32
1685  %v2e1 = extractelement <8 x i4> %vec2, i64 1
1686  %cv2e1 = sext i4 %v2e1 to i32
1687  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
1688
1689  %v1e2 = extractelement <8 x i4> %vec1, i64 2
1690  %cv1e2 = sext i4 %v1e2 to i32
1691  %v2e2 = extractelement <8 x i4> %vec2, i64 2
1692  %cv2e2 = sext i4 %v2e2 to i32
1693  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
1694
1695  %v1e3 = extractelement <8 x i4> %vec1, i64 3
1696  %cv1e3 = sext i4 %v1e3 to i32
1697  %v2e3 = extractelement <8 x i4> %vec2, i64 3
1698  %cv2e3 = sext i4 %v2e3 to i32
1699  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
1700
1701  %v1e4 = extractelement <8 x i4> %vec1, i64 4
1702  %cv1e4 = sext i4 %v1e4 to i32
1703  %v2e4 = extractelement <8 x i4> %vec2, i64 4
1704  %cv2e4 = sext i4 %v2e4 to i32
1705  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
1706
1707  %v1e5 = extractelement <8 x i4> %vec1, i64 5
1708  %cv1e5 = sext i4 %v1e5 to i32
1709  %v2e5 = extractelement <8 x i4> %vec2, i64 5
1710  %cv2e5 = sext i4 %v2e5 to i32
1711  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
1712
1713  %v1e6 = extractelement <8 x i4> %vec1, i64 6
1714  %cv1e6 = sext i4 %v1e6 to i32
1715  %v2e6 = extractelement <8 x i4> %vec2, i64 6
1716  %cv2e6 = sext i4 %v2e6 to i32
1717  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
1718
1719  %v1e7 = extractelement <8 x i4> %vec1, i64 7
1720  %cv1e7 = sext i4 %v1e7 to i32
1721  %v2e7 = extractelement <8 x i4> %vec2, i64 7
1722  %cv2e7 = sext i4 %v2e7 to i32
1723  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
1724
  ; The extra uses this test is about: %mul0 is an operand of both %add and
  ; %add1, and %add is used again in %res after the accumulation chain.
1725  %acc = load i32, ptr addrspace(1) %dst, align 4
1726  %add =  add i32  %mul0, %acc
1727  %add1 = add i32 %mul0, %add
1728  %add2 = add i32 %add1, %mul1
1729  %add3 = add i32 %add2, %mul2
1730  %add4 = add i32 %add3, %mul3
1731  %add5 = add i32 %add4, %mul4
1732  %add6 = add i32 %add5, %mul5
1733  %add7 = add i32 %add6, %mul6
1734  %add8 = add i32 %add7, %mul7
1735
  ; The stored result is %add plus the full chain %add8.
1736  %res = add i32 %add, %add8
1737  store i32 %res, ptr addrspace(1) %dst, align 4
1738  ret void
1739}
1740
1741; Vector-multiply form of the acc32 dot product; the GFX9-DL and GFX10-DL checks below show it selected as v_dot8_i32_i4.
1742define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
1743; GFX7-LABEL: idot8_acc32_vecMul:
1744; GFX7:       ; %bb.0: ; %entry
1745; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1746; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1747; GFX7-NEXT:    s_mov_b32 s14, -1
1748; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
1749; GFX7-NEXT:    s_add_u32 s12, s12, s11
1750; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
1751; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
1752; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1753; GFX7-NEXT:    s_mov_b32 s6, 0
1754; GFX7-NEXT:    s_mov_b32 s7, s3
1755; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1756; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
1757; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1758; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1759; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1760; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
1761; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1762; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1763; GFX7-NEXT:    s_mov_b32 s2, -1
1764; GFX7-NEXT:    s_addc_u32 s13, s13, 0
1765; GFX7-NEXT:    s_waitcnt vmcnt(1)
1766; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 28, v2
1767; GFX7-NEXT:    v_bfe_i32 v3, v2, 24, 4
1768; GFX7-NEXT:    v_bfe_i32 v4, v2, 20, 4
1769; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 4
1770; GFX7-NEXT:    v_bfe_i32 v6, v2, 12, 4
1771; GFX7-NEXT:    v_bfe_i32 v7, v2, 8, 4
1772; GFX7-NEXT:    v_bfe_i32 v8, v2, 4, 4
1773; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 4
1774; GFX7-NEXT:    s_waitcnt vmcnt(0)
1775; GFX7-NEXT:    v_ashrrev_i32_e32 v9, 28, v0
1776; GFX7-NEXT:    v_bfe_i32 v10, v0, 24, 4
1777; GFX7-NEXT:    v_bfe_i32 v11, v0, 20, 4
1778; GFX7-NEXT:    v_bfe_i32 v12, v0, 16, 4
1779; GFX7-NEXT:    v_bfe_i32 v13, v0, 12, 4
1780; GFX7-NEXT:    v_bfe_i32 v14, v0, 8, 4
1781; GFX7-NEXT:    v_bfe_i32 v15, v0, 4, 4
1782; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 4
1783; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1784; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, s4
1785; GFX7-NEXT:    v_mad_i32_i24 v0, v8, v15, v0
1786; GFX7-NEXT:    v_mad_i32_i24 v0, v7, v14, v0
1787; GFX7-NEXT:    v_mad_i32_i24 v0, v6, v13, v0
1788; GFX7-NEXT:    v_mad_i32_i24 v0, v5, v12, v0
1789; GFX7-NEXT:    v_mad_i32_i24 v0, v4, v11, v0
1790; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v10, v0
1791; GFX7-NEXT:    v_mad_i32_i24 v0, v1, v9, v0
1792; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1793; GFX7-NEXT:    s_endpgm
1794;
1795; GFX8-LABEL: idot8_acc32_vecMul:
1796; GFX8:       ; %bb.0: ; %entry
1797; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1798; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1799; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1800; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1801; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1802; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1803; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1804; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1805; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1806; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1807; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1808; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1809; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1810; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1811; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
1812; GFX8-NEXT:    s_mov_b32 s14, -1
1813; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
1814; GFX8-NEXT:    s_add_u32 s12, s12, s11
1815; GFX8-NEXT:    s_addc_u32 s13, s13, 0
1816; GFX8-NEXT:    s_waitcnt vmcnt(1)
1817; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 28, v3
1818; GFX8-NEXT:    v_bfe_i32 v2, v3, 24, 4
1819; GFX8-NEXT:    v_bfe_i32 v4, v3, 20, 4
1820; GFX8-NEXT:    v_bfe_i32 v5, v3, 16, 4
1821; GFX8-NEXT:    v_bfe_i32 v6, v3, 12, 4
1822; GFX8-NEXT:    v_bfe_i32 v7, v3, 8, 4
1823; GFX8-NEXT:    v_bfe_i32 v8, v3, 4, 4
1824; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 4
1825; GFX8-NEXT:    s_waitcnt vmcnt(0)
1826; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 28, v0
1827; GFX8-NEXT:    v_bfe_i32 v10, v0, 24, 4
1828; GFX8-NEXT:    v_bfe_i32 v11, v0, 20, 4
1829; GFX8-NEXT:    v_bfe_i32 v12, v0, 16, 4
1830; GFX8-NEXT:    v_bfe_i32 v13, v0, 12, 4
1831; GFX8-NEXT:    v_bfe_i32 v14, v0, 8, 4
1832; GFX8-NEXT:    v_bfe_i32 v15, v0, 4, 4
1833; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 4
1834; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1835; GFX8-NEXT:    v_mad_i32_i24 v0, v3, v0, s0
1836; GFX8-NEXT:    v_mad_i32_i24 v0, v8, v15, v0
1837; GFX8-NEXT:    v_mad_i32_i24 v0, v7, v14, v0
1838; GFX8-NEXT:    v_mad_i32_i24 v0, v6, v13, v0
1839; GFX8-NEXT:    v_mad_i32_i24 v0, v5, v12, v0
1840; GFX8-NEXT:    v_mad_i32_i24 v0, v4, v11, v0
1841; GFX8-NEXT:    v_mad_i32_i24 v0, v2, v10, v0
1842; GFX8-NEXT:    v_mad_i32_i24 v2, v1, v9, v0
1843; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1844; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1845; GFX8-NEXT:    flat_store_dword v[0:1], v2
1846; GFX8-NEXT:    s_endpgm
1847;
1848; GFX9-LABEL: idot8_acc32_vecMul:
1849; GFX9:       ; %bb.0: ; %entry
1850; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1851; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1852; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1853; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1854; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1855; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1856; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
1857; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
1858; GFX9-NEXT:    s_load_dword s0, s[6:7], 0x0
1859; GFX9-NEXT:    s_mov_b32 s14, -1
1860; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
1861; GFX9-NEXT:    s_add_u32 s12, s12, s11
1862; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1863; GFX9-NEXT:    s_addc_u32 s13, s13, 0
1864; GFX9-NEXT:    s_waitcnt vmcnt(1)
1865; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 28, v1
1866; GFX9-NEXT:    v_bfe_i32 v4, v1, 24, 4
1867; GFX9-NEXT:    v_bfe_i32 v5, v1, 20, 4
1868; GFX9-NEXT:    v_bfe_i32 v6, v1, 16, 4
1869; GFX9-NEXT:    v_bfe_i32 v7, v1, 12, 4
1870; GFX9-NEXT:    v_bfe_i32 v8, v1, 8, 4
1871; GFX9-NEXT:    v_bfe_i32 v9, v1, 4, 4
1872; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 4
1873; GFX9-NEXT:    s_waitcnt vmcnt(0)
1874; GFX9-NEXT:    v_ashrrev_i32_e32 v10, 28, v2
1875; GFX9-NEXT:    v_bfe_i32 v11, v2, 24, 4
1876; GFX9-NEXT:    v_bfe_i32 v12, v2, 20, 4
1877; GFX9-NEXT:    v_bfe_i32 v13, v2, 16, 4
1878; GFX9-NEXT:    v_bfe_i32 v14, v2, 12, 4
1879; GFX9-NEXT:    v_bfe_i32 v15, v2, 8, 4
1880; GFX9-NEXT:    v_bfe_i32 v16, v2, 4, 4
1881; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 4
1882; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
1883; GFX9-NEXT:    v_mul_i32_i24_e32 v2, v9, v16
1884; GFX9-NEXT:    v_mul_i32_i24_e32 v8, v8, v15
1885; GFX9-NEXT:    v_mul_i32_i24_e32 v7, v7, v14
1886; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1887; GFX9-NEXT:    v_add3_u32 v1, v1, s0, v2
1888; GFX9-NEXT:    v_mul_i32_i24_e32 v6, v6, v13
1889; GFX9-NEXT:    v_mul_i32_i24_e32 v5, v5, v12
1890; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v7
1891; GFX9-NEXT:    v_mul_i32_i24_e32 v4, v4, v11
1892; GFX9-NEXT:    v_mul_i32_i24_e32 v3, v3, v10
1893; GFX9-NEXT:    v_add3_u32 v1, v1, v6, v5
1894; GFX9-NEXT:    v_add3_u32 v1, v1, v4, v3
1895; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
1896; GFX9-NEXT:    s_endpgm
1897;
1898; GFX9-DL-LABEL: idot8_acc32_vecMul:
1899; GFX9-DL:       ; %bb.0: ; %entry
1900; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1901; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1902; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1903; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1904; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1905; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1906; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1907; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1908; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1909; GFX9-DL-NEXT:    s_mov_b32 s14, -1
1910; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
1911; GFX9-DL-NEXT:    s_add_u32 s12, s12, s11
1912; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1913; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
1914; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1915; GFX9-DL-NEXT:    v_dot8_i32_i4 v1, v1, v2, s0
1916; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
1917; GFX9-DL-NEXT:    s_endpgm
1918;
1919; GFX10-DL-XNACK-LABEL: idot8_acc32_vecMul:
1920; GFX10-DL-XNACK:       ; %bb.0: ; %entry
1921; GFX10-DL-XNACK-NEXT:    s_clause 0x1
1922; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1923; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1924; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1925; GFX10-DL-XNACK-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1926; GFX10-DL-XNACK-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1927; GFX10-DL-XNACK-NEXT:    s_mov_b32 s14, -1
1928; GFX10-DL-XNACK-NEXT:    s_mov_b32 s15, 0x31c16000
1929; GFX10-DL-XNACK-NEXT:    s_add_u32 s12, s12, s11
1930; GFX10-DL-XNACK-NEXT:    s_addc_u32 s13, s13, 0
1931; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
1932; GFX10-DL-XNACK-NEXT:    s_clause 0x1
1933; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[0:1]
1934; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[2:3]
1935; GFX10-DL-XNACK-NEXT:    s_waitcnt_depctr 0xffe3
1936; GFX10-DL-XNACK-NEXT:    s_load_dword s0, s[6:7], 0x0
1937; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
1938; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1939; GFX10-DL-XNACK-NEXT:    v_dot8_i32_i4 v1, v1, v2, s0
1940; GFX10-DL-XNACK-NEXT:    global_store_dword v0, v1, s[6:7]
1941; GFX10-DL-XNACK-NEXT:    s_endpgm
1942;
1943; GFX10-DL-NOXNACK-LABEL: idot8_acc32_vecMul:
1944; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
1945; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1946; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1947; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1948; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1949; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
1950; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1951; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s14, -1
1952; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s15, 0x31c16000
1953; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s12, s12, s11
1954; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s13, s13, 0
1955; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
1956; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
1957; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[0:1]
1958; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[2:3]
1959; GFX10-DL-NOXNACK-NEXT:    s_load_dword s0, s[4:5], 0x0
1960; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1961; GFX10-DL-NOXNACK-NEXT:    v_dot8_i32_i4 v0, v1, v0, s0
1962; GFX10-DL-NOXNACK-NEXT:    global_store_dword v2, v0, s[4:5]
1963; GFX10-DL-NOXNACK-NEXT:    s_endpgm
1964                                              ptr addrspace(1) %src2,
1965                                              ptr addrspace(1) nocapture %dst) {
1966entry:
  ; Load one <8 x i4> vector from %src1 and one from %src2, both indexed by
  ; the work-item id.
1967  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1968  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
1969  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
1970  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
1971  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
1972
  ; Sign-extend both operands as whole vectors to <8 x i32>.
1973  %cvec1 = sext <8 x i4> %vec1 to <8 x i32>
1974  %cvec2 = sext <8 x i4> %vec2 to <8 x i32>
1975
  ; A single vector multiply produces all eight products; the individual
  ; lanes are extracted only afterwards.
1976  %mul = mul <8 x i32> %cvec1, %cvec2
1977  %mul0 = extractelement <8 x i32> %mul, i64 0
1978  %mul1 = extractelement <8 x i32> %mul, i64 1
1979  %mul2 = extractelement <8 x i32> %mul, i64 2
1980  %mul3 = extractelement <8 x i32> %mul, i64 3
1981  %mul4 = extractelement <8 x i32> %mul, i64 4
1982  %mul5 = extractelement <8 x i32> %mul, i64 5
1983  %mul6 = extractelement <8 x i32> %mul, i64 6
1984  %mul7 = extractelement <8 x i32> %mul, i64 7
1985
  ; Accumulate the eight products, in lane order, onto the i32 loaded from
  ; %dst.
1986  %acc = load i32, ptr addrspace(1) %dst, align 4
1987  %add1 = add i32 %mul0, %acc
1988  %add2 = add i32 %add1, %mul1
1989  %add3 = add i32 %add2, %mul2
1990  %add4 = add i32 %add3, %mul3
1991  %add5 = add i32 %add4, %mul4
1992  %add6 = add i32 %add5, %mul5
1993  %add7 = add i32 %add6, %mul6
1994  %add8 = add i32 %add7, %mul7
1995
  ; Write the final sum back to %dst.
1996  store i32 %add8, ptr addrspace(1) %dst, align 4
1997  ret void
1998}
1999
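2000; TODO: Support this pattern: the multiply is done on whole <8 x i16> vectors before the lanes are extracted and summed into an i16 accumulator, and the GFX9-DL/GFX10-DL checks below show no v_dot8 instruction being selected for it.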
; Signed dot product of two <8 x i4> vectors with a 16-bit accumulator.
; Both operands are sign-extended to <8 x i16> and multiplied as whole
; vectors; the eight products are then extracted and added, in lane order,
; to the i16 value loaded from %dst, and the sum is stored back to %dst.
2001define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
2002; GFX7-LABEL: idot8_acc16_vecMul:
2003; GFX7:       ; %bb.0: ; %entry
2004; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2005; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2006; GFX7-NEXT:    s_mov_b32 s14, -1
2007; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
2008; GFX7-NEXT:    s_add_u32 s12, s12, s11
2009; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
2010; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
2011; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2012; GFX7-NEXT:    s_mov_b32 s6, 0
2013; GFX7-NEXT:    s_mov_b32 s7, s3
2014; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2015; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
2016; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2017; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2018; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2019; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
2020; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2021; GFX7-NEXT:    s_mov_b32 s2, -1
2022; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
2023; GFX7-NEXT:    s_addc_u32 s13, s13, 0
2024; GFX7-NEXT:    s_waitcnt vmcnt(2)
2025; GFX7-NEXT:    v_bfe_i32 v6, v2, 0, 4
2026; GFX7-NEXT:    v_bfe_i32 v3, v2, 24, 4
2027; GFX7-NEXT:    s_waitcnt vmcnt(1)
2028; GFX7-NEXT:    v_bfe_i32 v13, v0, 0, 4
2029; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 4
2030; GFX7-NEXT:    v_bfe_i32 v5, v2, 8, 4
2031; GFX7-NEXT:    v_ashrrev_i32_e32 v7, 28, v2
2032; GFX7-NEXT:    v_bfe_i32 v8, v2, 20, 4
2033; GFX7-NEXT:    v_bfe_i32 v9, v2, 12, 4
2034; GFX7-NEXT:    v_bfe_i32 v2, v2, 4, 4
2035; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
2036; GFX7-NEXT:    v_bfe_i32 v10, v0, 24, 4
2037; GFX7-NEXT:    v_bfe_i32 v11, v0, 16, 4
2038; GFX7-NEXT:    v_bfe_i32 v12, v0, 8, 4
2039; GFX7-NEXT:    v_ashrrev_i32_e32 v14, 28, v0
2040; GFX7-NEXT:    v_bfe_i32 v15, v0, 20, 4
2041; GFX7-NEXT:    v_bfe_i32 v16, v0, 12, 4
2042; GFX7-NEXT:    v_bfe_i32 v0, v0, 4, 4
2043; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff, v13
2044; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2045; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2046; GFX7-NEXT:    s_waitcnt vmcnt(0)
2047; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v13, v1
2048; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v5
2049; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff, v12
2050; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2051; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff, v9
2052; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff, v16
2053; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
2054; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
2055; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff, v11
2056; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
2057; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v8
2058; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff, v15
2059; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
2060; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
2061; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff, v10
2062; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
2063; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v7
2064; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff, v14
2065; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
2066; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
2067; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
2068; GFX7-NEXT:    s_endpgm
2069;
2070; GFX8-LABEL: idot8_acc16_vecMul:
2071; GFX8:       ; %bb.0: ; %entry
2072; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2073; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
2074; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2075; GFX8-NEXT:    v_mov_b32_e32 v5, 12
2076; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2077; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2078; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2079; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2080; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2081; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2082; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2083; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2084; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2085; GFX8-NEXT:    flat_load_dword v2, v[0:1]
2086; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2087; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2088; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
2089; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2090; GFX8-NEXT:    s_mov_b32 s14, -1
2091; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
2092; GFX8-NEXT:    s_add_u32 s12, s12, s11
2093; GFX8-NEXT:    s_addc_u32 s13, s13, 0
2094; GFX8-NEXT:    s_waitcnt vmcnt(2)
2095; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 28, v3
2096; GFX8-NEXT:    v_lshlrev_b16_sdwa v7, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2097; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 20, v3
2098; GFX8-NEXT:    v_lshlrev_b16_sdwa v9, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2099; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 12, v3
2100; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
2101; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 4, v3
2102; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 12, v3
2103; GFX8-NEXT:    s_waitcnt vmcnt(1)
2104; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 28, v2
2105; GFX8-NEXT:    v_lshlrev_b16_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2106; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 20, v2
2107; GFX8-NEXT:    v_lshlrev_b16_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2108; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 12, v2
2109; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
2110; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 4, v2
2111; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 12, v2
2112; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
2113; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
2114; GFX8-NEXT:    v_lshlrev_b16_e32 v18, 12, v18
2115; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
2116; GFX8-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
2117; GFX8-NEXT:    v_lshlrev_b16_e32 v17, 12, v17
2118; GFX8-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
2119; GFX8-NEXT:    v_ashrrev_i16_e32 v18, 12, v18
2120; GFX8-NEXT:    s_waitcnt vmcnt(0)
2121; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
2122; GFX8-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
2123; GFX8-NEXT:    v_lshlrev_b16_e32 v16, 12, v16
2124; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
2125; GFX8-NEXT:    v_ashrrev_i16_e32 v17, 12, v17
2126; GFX8-NEXT:    v_mad_u16 v2, v12, v18, v2
2127; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
2128; GFX8-NEXT:    v_ashrrev_i16_e32 v16, 12, v16
2129; GFX8-NEXT:    v_mad_u16 v2, v11, v17, v2
2130; GFX8-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
2131; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
2132; GFX8-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
2133; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
2134; GFX8-NEXT:    v_mad_u16 v2, v10, v16, v2
2135; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
2136; GFX8-NEXT:    v_ashrrev_i16_e32 v15, 12, v15
2137; GFX8-NEXT:    v_mad_u16 v2, v9, v5, v2
2138; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
2139; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
2140; GFX8-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
2141; GFX8-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
2142; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
2143; GFX8-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
2144; GFX8-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
2145; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
2146; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
2147; GFX8-NEXT:    flat_store_short v[0:1], v2
2148; GFX8-NEXT:    s_endpgm
2149;
2150; GFX9-LABEL: idot8_acc16_vecMul:
2151; GFX9:       ; %bb.0: ; %entry
2152; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2153; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2154; GFX9-NEXT:    s_mov_b32 s14, -1
2155; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
2156; GFX9-NEXT:    s_add_u32 s12, s12, s11
2157; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
2158; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2159; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2160; GFX9-NEXT:    v_mov_b32_e32 v4, 12
2161; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2162; GFX9-NEXT:    global_load_dword v1, v0, s[8:9]
2163; GFX9-NEXT:    global_load_dword v2, v0, s[10:11]
2164; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2165; GFX9-NEXT:    global_load_ushort v3, v0, s[0:1]
2166; GFX9-NEXT:    s_mov_b32 s2, 0x5040100
2167; GFX9-NEXT:    s_addc_u32 s13, s13, 0
2168; GFX9-NEXT:    s_waitcnt vmcnt(2)
2169; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 4, v1
2170; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
2171; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
2172; GFX9-NEXT:    s_waitcnt vmcnt(1)
2173; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 4, v2
2174; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 12, v1
2175; GFX9-NEXT:    v_lshlrev_b16_e32 v12, 12, v2
2176; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
2177; GFX9-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
2178; GFX9-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
2179; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
2180; GFX9-NEXT:    v_lshlrev_b16_sdwa v9, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2181; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 20, v1
2182; GFX9-NEXT:    v_lshlrev_b16_sdwa v11, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2183; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
2184; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
2185; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
2186; GFX9-NEXT:    v_lshlrev_b16_sdwa v16, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2187; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 20, v2
2188; GFX9-NEXT:    v_lshlrev_b16_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2189; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
2190; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
2191; GFX9-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
2192; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
2193; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
2194; GFX9-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
2195; GFX9-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
2196; GFX9-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
2197; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 12, v1
2198; GFX9-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
2199; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
2200; GFX9-NEXT:    v_lshlrev_b16_e32 v17, 12, v17
2201; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 12, v2
2202; GFX9-NEXT:    v_perm_b32 v7, v8, v7, s2
2203; GFX9-NEXT:    v_perm_b32 v8, v13, v12, s2
2204; GFX9-NEXT:    v_perm_b32 v5, v6, v5, s2
2205; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
2206; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
2207; GFX9-NEXT:    v_ashrrev_i16_e32 v16, 12, v16
2208; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
2209; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
2210; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
2211; GFX9-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
2212; GFX9-NEXT:    v_ashrrev_i16_e32 v15, 12, v15
2213; GFX9-NEXT:    v_ashrrev_i16_e32 v17, 12, v17
2214; GFX9-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
2215; GFX9-NEXT:    v_pk_mul_lo_u16 v5, v5, v8
2216; GFX9-NEXT:    v_perm_b32 v2, v2, v4, s2
2217; GFX9-NEXT:    v_perm_b32 v1, v1, v11, s2
2218; GFX9-NEXT:    v_perm_b32 v4, v17, v16, s2
2219; GFX9-NEXT:    v_perm_b32 v9, v10, v9, s2
2220; GFX9-NEXT:    v_perm_b32 v10, v15, v14, s2
2221; GFX9-NEXT:    s_waitcnt vmcnt(0)
2222; GFX9-NEXT:    v_add_u16_e32 v3, v5, v3
2223; GFX9-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2224; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v9, v4
2225; GFX9-NEXT:    v_pk_mul_lo_u16 v4, v7, v10
2226; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2227; GFX9-NEXT:    v_add_u16_e32 v3, v3, v4
2228; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2229; GFX9-NEXT:    v_add_u16_e32 v3, v3, v2
2230; GFX9-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2231; GFX9-NEXT:    v_add_u16_e32 v2, v2, v1
2232; GFX9-NEXT:    v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2233; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
2234; GFX9-NEXT:    s_endpgm
2235;
2236; GFX9-DL-LABEL: idot8_acc16_vecMul:
2237; GFX9-DL:       ; %bb.0: ; %entry
2238; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2239; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2240; GFX9-DL-NEXT:    s_mov_b32 s14, -1
2241; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
2242; GFX9-DL-NEXT:    s_add_u32 s12, s12, s11
2243; GFX9-DL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
2244; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2245; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2246; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 12
2247; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2248; GFX9-DL-NEXT:    global_load_dword v1, v0, s[8:9]
2249; GFX9-DL-NEXT:    global_load_dword v2, v0, s[10:11]
2250; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2251; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[0:1]
2252; GFX9-DL-NEXT:    s_mov_b32 s2, 0x5040100
2253; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
2254; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
2255; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 4, v1
2256; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
2257; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
2258; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2259; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 4, v2
2260; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v5, 12, v1
2261; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v12, 12, v2
2262; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
2263; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
2264; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
2265; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
2266; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v9, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2267; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 20, v1
2268; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v11, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2269; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
2270; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
2271; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
2272; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v16, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2273; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v17, 20, v2
2274; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2275; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
2276; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
2277; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
2278; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
2279; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
2280; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
2281; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
2282; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
2283; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v1, 12, v1
2284; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
2285; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
2286; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v17, 12, v17
2287; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v2, 12, v2
2288; GFX9-DL-NEXT:    v_perm_b32 v7, v8, v7, s2
2289; GFX9-DL-NEXT:    v_perm_b32 v8, v13, v12, s2
2290; GFX9-DL-NEXT:    v_perm_b32 v5, v6, v5, s2
2291; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
2292; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
2293; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v16, 12, v16
2294; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
2295; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
2296; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
2297; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
2298; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v15, 12, v15
2299; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v17, 12, v17
2300; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
2301; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, v5, v8
2302; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v4, s2
2303; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v11, s2
2304; GFX9-DL-NEXT:    v_perm_b32 v4, v17, v16, s2
2305; GFX9-DL-NEXT:    v_perm_b32 v9, v10, v9, s2
2306; GFX9-DL-NEXT:    v_perm_b32 v10, v15, v14, s2
2307; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2308; GFX9-DL-NEXT:    v_add_u16_e32 v3, v5, v3
2309; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2310; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v9, v4
2311; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v7, v10
2312; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2313; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v4
2314; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2315; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v2
2316; GFX9-DL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2317; GFX9-DL-NEXT:    v_add_u16_e32 v2, v2, v1
2318; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2319; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
2320; GFX9-DL-NEXT:    s_endpgm
2321;
2322; GFX10-DL-XNACK-LABEL: idot8_acc16_vecMul:
2323; GFX10-DL-XNACK:       ; %bb.0: ; %entry
2324; GFX10-DL-XNACK-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2325; GFX10-DL-XNACK-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2326; GFX10-DL-XNACK-NEXT:    s_mov_b32 s14, -1
2327; GFX10-DL-XNACK-NEXT:    s_mov_b32 s15, 0x31c16000
2328; GFX10-DL-XNACK-NEXT:    s_add_u32 s12, s12, s11
2329; GFX10-DL-XNACK-NEXT:    s_clause 0x1
2330; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
2331; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2332; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2333; GFX10-DL-XNACK-NEXT:    s_addc_u32 s13, s13, 0
2334; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
2335; GFX10-DL-XNACK-NEXT:    s_clause 0x1
2336; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[8:9]
2337; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[10:11]
2338; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
2339; GFX10-DL-XNACK-NEXT:    global_load_ushort v3, v0, s[0:1]
2340; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(2)
2341; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 4, v1
2342; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
2343; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v12, 4, v2
2344; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v4, 12, v1
2345; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v11, 12, v2
2346; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
2347; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
2348; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v12, 12, v12
2349; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
2350; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
2351; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
2352; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 12, v2
2353; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
2354; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v11, 12, v11
2355; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v12, 12, v12
2356; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
2357; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
2358; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
2359; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
2360; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
2361; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v14, 12, v14
2362; GFX10-DL-XNACK-NEXT:    v_perm_b32 v11, v12, v11, 0x5040100
2363; GFX10-DL-XNACK-NEXT:    v_perm_b32 v4, v5, v4, 0x5040100
2364; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v9, 20, v1
2365; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v15, 16, v2
2366; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v16, 20, v2
2367; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
2368; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
2369; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v8
2370; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v8, 12, v13
2371; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v12, 12, v14
2372; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v4, v4, v11
2373; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
2374; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
2375; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
2376; GFX10-DL-XNACK-NEXT:    v_perm_b32 v8, v12, v8, 0x5040100
2377; GFX10-DL-XNACK-NEXT:    v_perm_b32 v6, v7, v6, 0x5040100
2378; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
2379; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
2380; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v4, v3
2381; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 24, v1
2382; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
2383; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v17, 24, v2
2384; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
2385; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v11, 12, v15
2386; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
2387; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v4, 12, v16
2388; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v6, v6, v8
2389; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v3, v7
2390; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
2391; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
2392; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v12, 12, v17
2393; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 12, v2
2394; GFX10-DL-XNACK-NEXT:    v_perm_b32 v4, v4, v11, 0x5040100
2395; GFX10-DL-XNACK-NEXT:    v_perm_b32 v5, v9, v5, 0x5040100
2396; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
2397; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v3, v6
2398; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
2399; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v12
2400; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v2
2401; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
2402; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
2403; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v3, v7
2404; GFX10-DL-XNACK-NEXT:    v_perm_b32 v2, v2, v6, 0x5040100
2405; GFX10-DL-XNACK-NEXT:    v_perm_b32 v1, v1, v10, 0x5040100
2406; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
2407; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v3, v4
2408; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2409; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v2, v3, v5
2410; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2411; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v1, v2, v1
2412; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v1, v1, v3
2413; GFX10-DL-XNACK-NEXT:    global_store_short v0, v1, s[0:1]
2414; GFX10-DL-XNACK-NEXT:    s_endpgm
2415;
2416; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul:
2417; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
2418; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2419; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2420; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s14, -1
2421; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s15, 0x31c16000
2422; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s12, s12, s11
2423; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
2424; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
2425; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2426; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2427; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
2428; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s13, s13, 0
2429; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
2430; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
2431; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[8:9]
2432; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[10:11]
2433; GFX10-DL-NOXNACK-NEXT:    global_load_ushort v3, v2, s[0:1]
2434; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(2)
2435; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 4, v1
2436; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
2437; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 4, v0
2438; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v4, 12, v1
2439; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v11, 12, v0
2440; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
2441; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
2442; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v12, 12, v12
2443; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
2444; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
2445; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 8, v0
2446; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 12, v0
2447; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
2448; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v11, 12, v11
2449; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v12, 12, v12
2450; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
2451; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
2452; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
2453; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
2454; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
2455; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v14, 12, v14
2456; GFX10-DL-NOXNACK-NEXT:    v_perm_b32 v11, v12, v11, 0x5040100
2457; GFX10-DL-NOXNACK-NEXT:    v_perm_b32 v4, v5, v4, 0x5040100
2458; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 20, v1
2459; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v15, 16, v0
2460; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 20, v0
2461; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
2462; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
2463; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v8
2464; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v13
2465; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v12, 12, v14
2466; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v4, v4, v11
2467; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
2468; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
2469; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
2470; GFX10-DL-NOXNACK-NEXT:    v_perm_b32 v8, v12, v8, 0x5040100
2471; GFX10-DL-NOXNACK-NEXT:    v_perm_b32 v6, v7, v6, 0x5040100
2472; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
2473; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
2474; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v3, v4, v3
2475; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 24, v1
2476; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
2477; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v17, 24, v0
2478; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
2479; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v11, 12, v15
2480; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
2481; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v4, 12, v16
2482; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v6, v6, v8
2483; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v3, v3, v7
2484; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
2485; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
2486; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v12, 12, v17
2487; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
2488; GFX10-DL-NOXNACK-NEXT:    v_perm_b32 v4, v4, v11, 0x5040100
2489; GFX10-DL-NOXNACK-NEXT:    v_perm_b32 v5, v9, v5, 0x5040100
2490; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
2491; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v3, v3, v6
2492; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
2493; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v12
2494; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
2495; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
2496; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
2497; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v3, v3, v7
2498; GFX10-DL-NOXNACK-NEXT:    v_perm_b32 v0, v0, v6, 0x5040100
2499; GFX10-DL-NOXNACK-NEXT:    v_perm_b32 v1, v1, v10, 0x5040100
2500; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
2501; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v3, v3, v4
2502; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
2503; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v1, v3, v5
2504; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
2505; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v1, v0
2506; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v3
2507; GFX10-DL-NOXNACK-NEXT:    global_store_short v2, v0, s[0:1]
2508; GFX10-DL-NOXNACK-NEXT:    s_endpgm
2509                                              ptr addrspace(1) %src2,
2510                                              ptr addrspace(1) nocapture %dst) {
2511entry:
  ; Each work-item picks the <8 x i4> element of %src1 and %src2 selected by
  ; its workitem id in x.
2512  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2513  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
2514  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
2515  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
2516  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
2517
  ; Sign-extend every 4-bit lane to 16 bits.
2518  %cvec1 = sext <8 x i4> %vec1 to <8 x i16>
2519  %cvec2 = sext <8 x i4> %vec2 to <8 x i16>
2520
  ; The multiply is a single vector operation; the individual i16 products are
  ; only extracted afterwards.
2521  %mul = mul <8 x i16> %cvec1, %cvec2
2522  %mul0 = extractelement <8 x i16> %mul, i64 0
2523  %mul1 = extractelement <8 x i16> %mul, i64 1
2524  %mul2 = extractelement <8 x i16> %mul, i64 2
2525  %mul3 = extractelement <8 x i16> %mul, i64 3
2526  %mul4 = extractelement <8 x i16> %mul, i64 4
2527  %mul5 = extractelement <8 x i16> %mul, i64 5
2528  %mul6 = extractelement <8 x i16> %mul, i64 6
2529  %mul7 = extractelement <8 x i16> %mul, i64 7
2530
  ; Start from the i16 currently stored at %dst and add the products in lane
  ; order 0..7 as a serial chain of scalar adds.
2531  %acc = load i16, ptr addrspace(1) %dst, align 4
2532  %add1 = add i16 %mul0, %acc
2533  %add2 = add i16 %add1, %mul1
2534  %add3 = add i16 %add2, %mul2
2535  %add4 = add i16 %add3, %mul3
2536  %add5 = add i16 %add4, %mul4
2537  %add6 = add i16 %add5, %mul5
2538  %add7 = add i16 %add6, %mul6
2539  %add8 = add i16 %add7, %mul7
2540
  ; Write the 16-bit sum back over the accumulator it was read from.
2541  store i16 %add8, ptr addrspace(1) %dst, align 4
2542  ret void
2543}
2544
2545; TODO: Support this pattern.
2546define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
2547; GFX7-LABEL: idot8_acc8_vecMul:
2548; GFX7:       ; %bb.0: ; %entry
2549; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2550; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2551; GFX7-NEXT:    s_mov_b32 s14, -1
2552; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
2553; GFX7-NEXT:    s_add_u32 s12, s12, s11
2554; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
2555; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
2556; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2557; GFX7-NEXT:    s_mov_b32 s6, 0
2558; GFX7-NEXT:    s_mov_b32 s7, s3
2559; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2560; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
2561; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2562; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2563; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2564; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
2565; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2566; GFX7-NEXT:    s_mov_b32 s2, -1
2567; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
2568; GFX7-NEXT:    s_addc_u32 s13, s13, 0
2569; GFX7-NEXT:    s_waitcnt vmcnt(2)
2570; GFX7-NEXT:    v_bfe_i32 v7, v2, 0, 4
2571; GFX7-NEXT:    v_bfe_i32 v3, v2, 24, 4
2572; GFX7-NEXT:    s_waitcnt vmcnt(1)
2573; GFX7-NEXT:    v_bfe_i32 v14, v0, 0, 4
2574; GFX7-NEXT:    v_bfe_i32 v4, v2, 20, 4
2575; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 4
2576; GFX7-NEXT:    v_bfe_i32 v6, v2, 8, 4
2577; GFX7-NEXT:    v_ashrrev_i32_e32 v8, 28, v2
2578; GFX7-NEXT:    v_bfe_i32 v9, v2, 12, 4
2579; GFX7-NEXT:    v_bfe_i32 v2, v2, 4, 4
2580; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v7
2581; GFX7-NEXT:    v_bfe_i32 v10, v0, 24, 4
2582; GFX7-NEXT:    v_bfe_i32 v11, v0, 20, 4
2583; GFX7-NEXT:    v_bfe_i32 v12, v0, 16, 4
2584; GFX7-NEXT:    v_bfe_i32 v13, v0, 8, 4
2585; GFX7-NEXT:    v_ashrrev_i32_e32 v15, 28, v0
2586; GFX7-NEXT:    v_bfe_i32 v16, v0, 12, 4
2587; GFX7-NEXT:    v_bfe_i32 v0, v0, 4, 4
2588; GFX7-NEXT:    v_and_b32_e32 v14, 0xff, v14
2589; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v2
2590; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
2591; GFX7-NEXT:    s_waitcnt vmcnt(0)
2592; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v14, v1
2593; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v6
2594; GFX7-NEXT:    v_and_b32_e32 v13, 0xff, v13
2595; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2596; GFX7-NEXT:    v_and_b32_e32 v9, 0xff, v9
2597; GFX7-NEXT:    v_and_b32_e32 v16, 0xff, v16
2598; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
2599; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v5
2600; GFX7-NEXT:    v_and_b32_e32 v12, 0xff, v12
2601; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
2602; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v4
2603; GFX7-NEXT:    v_and_b32_e32 v11, 0xff, v11
2604; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
2605; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v3
2606; GFX7-NEXT:    v_and_b32_e32 v10, 0xff, v10
2607; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
2608; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v8
2609; GFX7-NEXT:    v_and_b32_e32 v15, 0xff, v15
2610; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
2611; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
2612; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2613; GFX7-NEXT:    s_endpgm
2614;
2615; GFX8-LABEL: idot8_acc8_vecMul:
2616; GFX8:       ; %bb.0: ; %entry
2617; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2618; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
2619; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2620; GFX8-NEXT:    v_mov_b32_e32 v5, 12
2621; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2622; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2623; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2624; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2625; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2626; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2627; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2628; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2629; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2630; GFX8-NEXT:    flat_load_dword v2, v[0:1]
2631; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2632; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2633; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
2634; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2635; GFX8-NEXT:    s_mov_b32 s14, -1
2636; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
2637; GFX8-NEXT:    s_add_u32 s12, s12, s11
2638; GFX8-NEXT:    s_addc_u32 s13, s13, 0
2639; GFX8-NEXT:    s_waitcnt vmcnt(2)
2640; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 20, v3
2641; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 28, v3
2642; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 12, v3
2643; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
2644; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 4, v3
2645; GFX8-NEXT:    v_lshlrev_b16_e32 v16, 12, v3
2646; GFX8-NEXT:    s_waitcnt vmcnt(1)
2647; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
2648; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
2649; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 12, v2
2650; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
2651; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 4, v2
2652; GFX8-NEXT:    v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2653; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2654; GFX8-NEXT:    v_lshlrev_b16_e32 v18, 12, v2
2655; GFX8-NEXT:    v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2656; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2657; GFX8-NEXT:    v_lshlrev_b16_e32 v5, 12, v10
2658; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 12, v16
2659; GFX8-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
2660; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
2661; GFX8-NEXT:    v_ashrrev_i16_e32 v17, 12, v3
2662; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 12, v6
2663; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 12, v15
2664; GFX8-NEXT:    v_ashrrev_i16_e32 v15, 12, v18
2665; GFX8-NEXT:    v_ashrrev_i16_e32 v18, 12, v19
2666; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
2667; GFX8-NEXT:    v_ashrrev_i16_e32 v19, 12, v2
2668; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 12, v11
2669; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
2670; GFX8-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
2671; GFX8-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
2672; GFX8-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
2673; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
2674; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
2675; GFX8-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
2676; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
2677; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
2678; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
2679; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v14
2680; GFX8-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
2681; GFX8-NEXT:    v_mul_lo_u16_e32 v10, v10, v15
2682; GFX8-NEXT:    v_mul_lo_u16_e32 v15, v16, v18
2683; GFX8-NEXT:    v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2684; GFX8-NEXT:    v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2685; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
2686; GFX8-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
2687; GFX8-NEXT:    v_mul_lo_u16_e32 v14, v17, v19
2688; GFX8-NEXT:    v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2689; GFX8-NEXT:    v_mul_lo_u16_e32 v8, v9, v11
2690; GFX8-NEXT:    v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2691; GFX8-NEXT:    v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2692; GFX8-NEXT:    v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2693; GFX8-NEXT:    v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2694; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
2695; GFX8-NEXT:    v_or_b32_sdwa v8, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2696; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
2697; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
2698; GFX8-NEXT:    v_or_b32_sdwa v3, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2699; GFX8-NEXT:    v_or_b32_e32 v5, v5, v2
2700; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
2701; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 24, v[2:3]
2702; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
2703; GFX8-NEXT:    s_waitcnt vmcnt(0)
2704; GFX8-NEXT:    v_add_u16_e32 v3, v8, v4
2705; GFX8-NEXT:    v_add_u16_e32 v3, v3, v5
2706; GFX8-NEXT:    v_add_u16_e32 v3, v3, v7
2707; GFX8-NEXT:    v_add_u16_e32 v2, v3, v2
2708; GFX8-NEXT:    v_mad_u16 v2, v17, v19, v2
2709; GFX8-NEXT:    v_add_u16_e32 v2, v2, v6
2710; GFX8-NEXT:    v_mad_u16 v2, v16, v18, v2
2711; GFX8-NEXT:    v_add_u16_e32 v2, v2, v10
2712; GFX8-NEXT:    flat_store_byte v[0:1], v2
2713; GFX8-NEXT:    s_endpgm
2714;
2715; GFX9-LABEL: idot8_acc8_vecMul:
2716; GFX9:       ; %bb.0: ; %entry
2717; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2718; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2719; GFX9-NEXT:    s_mov_b32 s14, -1
2720; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
2721; GFX9-NEXT:    s_add_u32 s12, s12, s11
2722; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
2723; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2724; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2725; GFX9-NEXT:    v_mov_b32_e32 v4, 12
2726; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2727; GFX9-NEXT:    global_load_dword v1, v0, s[8:9]
2728; GFX9-NEXT:    global_load_dword v2, v0, s[10:11]
2729; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2730; GFX9-NEXT:    global_load_ubyte v3, v0, s[0:1]
2731; GFX9-NEXT:    s_addc_u32 s13, s13, 0
2732; GFX9-NEXT:    s_waitcnt vmcnt(2)
2733; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 20, v1
2734; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
2735; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
2736; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
2737; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
2738; GFX9-NEXT:    s_waitcnt vmcnt(1)
2739; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 20, v2
2740; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
2741; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
2742; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
2743; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
2744; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
2745; GFX9-NEXT:    v_lshlrev_b16_sdwa v16, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2746; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2747; GFX9-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
2748; GFX9-NEXT:    v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2749; GFX9-NEXT:    v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2750; GFX9-NEXT:    v_lshlrev_b16_e32 v4, 12, v9
2751; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 12, v15
2752; GFX9-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
2753; GFX9-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
2754; GFX9-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
2755; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
2756; GFX9-NEXT:    v_ashrrev_i16_e32 v16, 12, v1
2757; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 12, v5
2758; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 12, v14
2759; GFX9-NEXT:    v_ashrrev_i16_e32 v14, 12, v17
2760; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
2761; GFX9-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
2762; GFX9-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
2763; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
2764; GFX9-NEXT:    v_ashrrev_i16_e32 v18, 12, v2
2765; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 12, v10
2766; GFX9-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
2767; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
2768; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
2769; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
2770; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v13
2771; GFX9-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
2772; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
2773; GFX9-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
2774; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
2775; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
2776; GFX9-NEXT:    v_mul_lo_u16_e32 v13, v16, v18
2777; GFX9-NEXT:    v_mul_lo_u16_e32 v19, v15, v17
2778; GFX9-NEXT:    v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2779; GFX9-NEXT:    v_mul_lo_u16_sdwa v2, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2780; GFX9-NEXT:    v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2781; GFX9-NEXT:    v_mul_lo_u16_e32 v7, v8, v10
2782; GFX9-NEXT:    v_mul_lo_u16_sdwa v4, v4, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2783; GFX9-NEXT:    v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2784; GFX9-NEXT:    v_or_b32_sdwa v5, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2785; GFX9-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2786; GFX9-NEXT:    v_mul_lo_u16_e32 v9, v9, v14
2787; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
2788; GFX9-NEXT:    v_or_b32_sdwa v2, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2789; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
2790; GFX9-NEXT:    v_or_b32_sdwa v7, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2791; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
2792; GFX9-NEXT:    v_or_b32_e32 v4, v4, v1
2793; GFX9-NEXT:    v_lshrrev_b64 v[1:2], 24, v[1:2]
2794; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v4
2795; GFX9-NEXT:    s_waitcnt vmcnt(0)
2796; GFX9-NEXT:    v_add_u16_e32 v3, v7, v3
2797; GFX9-NEXT:    v_add_u16_e32 v2, v3, v2
2798; GFX9-NEXT:    v_add_u16_e32 v2, v2, v6
2799; GFX9-NEXT:    v_add_u16_e32 v1, v2, v1
2800; GFX9-NEXT:    v_mad_legacy_u16 v1, v16, v18, v1
2801; GFX9-NEXT:    v_add_u16_e32 v1, v1, v5
2802; GFX9-NEXT:    v_mad_legacy_u16 v1, v15, v17, v1
2803; GFX9-NEXT:    v_add_u16_e32 v1, v1, v8
2804; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
2805; GFX9-NEXT:    s_endpgm
2806;
2807; GFX9-DL-LABEL: idot8_acc8_vecMul:
2808; GFX9-DL:       ; %bb.0: ; %entry
2809; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2810; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2811; GFX9-DL-NEXT:    s_mov_b32 s14, -1
2812; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
2813; GFX9-DL-NEXT:    s_add_u32 s12, s12, s11
2814; GFX9-DL-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
2815; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2816; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2817; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 12
2818; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2819; GFX9-DL-NEXT:    global_load_dword v1, v0, s[8:9]
2820; GFX9-DL-NEXT:    global_load_dword v2, v0, s[10:11]
2821; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2822; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[0:1]
2823; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
2824; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
2825; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 20, v1
2826; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
2827; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
2828; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
2829; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
2830; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2831; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 20, v2
2832; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
2833; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
2834; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
2835; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
2836; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
2837; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v16, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2838; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2839; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
2840; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2841; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2842; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v4, 12, v9
2843; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 12, v15
2844; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
2845; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
2846; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
2847; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
2848; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v16, 12, v1
2849; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v1, 12, v5
2850; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v5, 12, v14
2851; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v14, 12, v17
2852; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
2853; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
2854; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
2855; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
2856; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v18, 12, v2
2857; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v2, 12, v10
2858; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
2859; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
2860; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
2861; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
2862; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v13
2863; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
2864; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
2865; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
2866; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
2867; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
2868; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v13, v16, v18
2869; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v19, v15, v17
2870; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2871; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v2, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2872; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2873; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v7, v8, v10
2874; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v4, v4, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2875; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2876; GFX9-DL-NEXT:    v_or_b32_sdwa v5, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2877; GFX9-DL-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2878; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v9, v9, v14
2879; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
2880; GFX9-DL-NEXT:    v_or_b32_sdwa v2, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2881; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
2882; GFX9-DL-NEXT:    v_or_b32_sdwa v7, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2883; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
2884; GFX9-DL-NEXT:    v_or_b32_e32 v4, v4, v1
2885; GFX9-DL-NEXT:    v_lshrrev_b64 v[1:2], 24, v[1:2]
2886; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v4
2887; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2888; GFX9-DL-NEXT:    v_add_u16_e32 v3, v7, v3
2889; GFX9-DL-NEXT:    v_add_u16_e32 v2, v3, v2
2890; GFX9-DL-NEXT:    v_add_u16_e32 v2, v2, v6
2891; GFX9-DL-NEXT:    v_add_u16_e32 v1, v2, v1
2892; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v16, v18, v1
2893; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v5
2894; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v15, v17, v1
2895; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v8
2896; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
2897; GFX9-DL-NEXT:    s_endpgm
2898;
2899; GFX10-DL-XNACK-LABEL: idot8_acc8_vecMul:
2900; GFX10-DL-XNACK:       ; %bb.0: ; %entry
2901; GFX10-DL-XNACK-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2902; GFX10-DL-XNACK-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2903; GFX10-DL-XNACK-NEXT:    s_mov_b32 s14, -1
2904; GFX10-DL-XNACK-NEXT:    s_mov_b32 s15, 0x31c16000
2905; GFX10-DL-XNACK-NEXT:    s_add_u32 s12, s12, s11
2906; GFX10-DL-XNACK-NEXT:    s_clause 0x1
2907; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
2908; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2909; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2910; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v4, 0
2911; GFX10-DL-XNACK-NEXT:    s_addc_u32 s13, s13, 0
2912; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
2913; GFX10-DL-XNACK-NEXT:    s_clause 0x1
2914; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[8:9]
2915; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[10:11]
2916; GFX10-DL-XNACK-NEXT:    global_load_ubyte v3, v4, s[0:1]
2917; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(2)
2918; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
2919; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
2920; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
2921; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
2922; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v16, 8, v2
2923; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
2924; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
2925; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
2926; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v17, 4, v2
2927; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
2928; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
2929; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
2930; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v15, 12, v15
2931; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v0, 20, v1
2932; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
2933; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
2934; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v13, 28, v2
2935; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
2936; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v17, 12, v17
2937; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
2938; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
2939; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v8, v8, v15
2940; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
2941; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
2942; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
2943; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 24, v2
2944; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
2945; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 12, v2
2946; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
2947; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
2948; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
2949; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v11, 12, v11
2950; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
2951; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v15, 12, v17
2952; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v9, v9, v16
2953; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 8, v8
2954; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
2955; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
2956; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
2957; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v2
2958; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v14, 12, v14
2959; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v12, 12, v12
2960; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
2961; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
2962; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v13, 12, v13
2963; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v11, 12, v11
2964; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v10, v10, v15
2965; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2966; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
2967; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
2968; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
2969; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v12, 12, v12
2970; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v1, v1, v2
2971; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v2, v0, v11
2972; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v6, v6, v13
2973; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 8, v10
2974; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
2975; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v10, v5, v12
2976; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v11, v7, v14
2977; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 8, v2
2978; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 8, v6
2979; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2980; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2981; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2982; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v9, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2983; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 8, v13
2984; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
2985; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v1, v3
2986; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2987; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v9, v3, v10
2988; GFX10-DL-XNACK-NEXT:    v_lshrrev_b64 v[2:3], 24, v[0:1]
2989; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
2990; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v9, v8
2991; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v0, v2
2992; GFX10-DL-XNACK-NEXT:    v_mad_u16 v0, v5, v12, v0
2993; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v0, v1
2994; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v6
2995; GFX10-DL-XNACK-NEXT:    v_mad_u16 v0, v7, v14, v0
2996; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v0, v1
2997; GFX10-DL-XNACK-NEXT:    global_store_byte v4, v0, s[0:1]
2998; GFX10-DL-XNACK-NEXT:    s_endpgm
2999;
3000; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul:
3001; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
3002; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
3003; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
3004; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s14, -1
3005; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s15, 0x31c16000
3006; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s12, s12, s11
3007; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
3008; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
3009; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
3010; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3011; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v4, 0
3012; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s13, s13, 0
3013; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
3014; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
3015; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[8:9]
3016; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[10:11]
3017; GFX10-DL-NOXNACK-NEXT:    global_load_ubyte v2, v4, s[0:1]
3018; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(2)
3019; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
3020; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
3021; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
3022; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
3023; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
3024; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
3025; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
3026; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
3027; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v17, 4, v0
3028; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
3029; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
3030; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
3031; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v15, 12, v15
3032; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v3, 20, v1
3033; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
3034; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v11, 20, v0
3035; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 28, v0
3036; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
3037; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v17, 12, v17
3038; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
3039; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
3040; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v8, v8, v15
3041; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
3042; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
3043; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
3044; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 24, v0
3045; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
3046; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v3, 12, v3
3047; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
3048; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v11, 12, v11
3049; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
3050; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
3051; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
3052; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v15, 12, v17
3053; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v9, v9, v16
3054; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 8, v8
3055; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
3056; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
3057; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v14, 12, v14
3058; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v12, 12, v12
3059; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
3060; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v3
3061; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v13, 12, v13
3062; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v11, 12, v11
3063; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
3064; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
3065; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v10, v10, v15
3066; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3067; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
3068; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
3069; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
3070; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v12, 12, v12
3071; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v3, v3, v11
3072; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v6, v6, v13
3073; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v1, v1, v0
3074; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 8, v10
3075; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
3076; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v10, v5, v12
3077; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v11, v7, v14
3078; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v3, 8, v3
3079; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 8, v6
3080; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3081; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3082; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3083; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v9, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3084; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 8, v13
3085; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
3086; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v2, v1, v2
3087; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v1, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3088; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v9, v2, v10
3089; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b64 v[2:3], 24, v[0:1]
3090; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
3091; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v9, v8
3092; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v2
3093; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v5, v12, v0
3094; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v1
3095; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v6
3096; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v7, v14, v0
3097; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v1
3098; GFX10-DL-NOXNACK-NEXT:    global_store_byte v4, v0, s[0:1]
3099; GFX10-DL-NOXNACK-NEXT:    s_endpgm
3100                                             ptr addrspace(1) %src2,
3101                                             ptr addrspace(1) nocapture %dst) {
3102entry:
3103  %idx = call i32 @llvm.amdgcn.workitem.id.x()
3104  %gep1 = getelementptr <8 x i4>, ptr addrspace(1) %src1, i32 %idx
3105  %vec1 = load <8 x i4>, ptr addrspace(1) %gep1
3106  %gep2 = getelementptr <8 x i4>, ptr addrspace(1) %src2, i32 %idx
3107  %vec2 = load <8 x i4>, ptr addrspace(1) %gep2
3108
3109  %cvec1 = sext <8 x i4> %vec1 to <8 x i8>
3110  %cvec2 = sext <8 x i4> %vec2 to <8 x i8>
3111
3112  %mul = mul <8 x i8> %cvec1, %cvec2
3113  %mul0 = extractelement <8 x i8> %mul, i64 0
3114  %mul1 = extractelement <8 x i8> %mul, i64 1
3115  %mul2 = extractelement <8 x i8> %mul, i64 2
3116  %mul3 = extractelement <8 x i8> %mul, i64 3
3117  %mul4 = extractelement <8 x i8> %mul, i64 4
3118  %mul5 = extractelement <8 x i8> %mul, i64 5
3119  %mul6 = extractelement <8 x i8> %mul, i64 6
3120  %mul7 = extractelement <8 x i8> %mul, i64 7
3121
3122  %acc = load i8, ptr addrspace(1) %dst, align 4
3123  %add1 = add i8 %mul0, %acc
3124  %add2 = add i8 %add1, %mul1
3125  %add3 = add i8 %add2, %mul2
3126  %add4 = add i8 %add3, %mul3
3127  %add5 = add i8 %add4, %mul4
3128  %add6 = add i8 %add5, %mul5
3129  %add7 = add i8 %add6, %mul6
3130  %add8 = add i8 %add7, %mul7
3131
3132  store i8 %add8, ptr addrspace(1) %dst, align 4
3133  ret void
3134}
3135
; Declaration of the AMDGPU intrinsic called in the entry block of
; @idot8_acc8_vecMul above (and presumably by the other kernels in this file).
; It takes no arguments and returns an i32; the kernel uses that value (%idx)
; as the element index for the <8 x i4> GEPs off %src1 and %src2.
; NOTE(review): going by the intrinsic name this is the work-item's X id;
; confirm against the AMDGPU backend documentation.
3136declare i32 @llvm.amdgcn.workitem.id.x()
3137