xref: /llvm-project/llvm/test/CodeGen/AMDGPU/idot4s.ll (revision 5a3299a684d7d8c40f48d732e5b80a8bd29aa882)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-NODL %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefixes=GFX10-DL %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GFX10-DL %s
8; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-DL %s
9
; idot4_acc32: signed 4 x i8 dot product accumulated into an i32.
; Each work-item loads one <4 x i8> from %src1 and one from %src2 (both indexed
; by workitem.id.x), sign-extends every lane to i32, multiplies lane-wise and
; adds the four products to the i32 loaded from %dst before storing it back.
; Per the expected output below, gfx906 folds the whole chain into a single
; v_dot4_i32_i8, gfx1011/gfx1012 use v_dot4c_i32_i8 and gfx1100 uses
; v_dot4_i32_iu8 with neg_lo:[1,1,0]; gfx700/gfx803 keep a v_mad_i32_i24 chain
; and gfx900 uses SDWA multiplies followed by v_add3_u32.
10define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
11; GFX7-LABEL: idot4_acc32:
12; GFX7:       ; %bb.0: ; %entry
13; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
14; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
15; GFX7-NEXT:    s_mov_b32 s3, 0xf000
16; GFX7-NEXT:    s_mov_b32 s6, 0
17; GFX7-NEXT:    s_mov_b32 s7, s3
18; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
19; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
20; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
21; GFX7-NEXT:    v_mov_b32_e32 v1, 0
22; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
23; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
24; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
25; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
26; GFX7-NEXT:    s_mov_b32 s2, -1
27; GFX7-NEXT:    s_waitcnt vmcnt(1)
28; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
29; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
30; GFX7-NEXT:    s_waitcnt vmcnt(0)
31; GFX7-NEXT:    v_bfe_i32 v5, v0, 0, 8
32; GFX7-NEXT:    v_bfe_i32 v6, v0, 8, 8
33; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
34; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v5, s4
35; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 8
36; GFX7-NEXT:    v_bfe_i32 v7, v0, 16, 8
37; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v6, v1
38; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
39; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
40; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v7, v1
41; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
42; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
43; GFX7-NEXT:    s_endpgm
44;
45; GFX8-LABEL: idot4_acc32:
46; GFX8:       ; %bb.0: ; %entry
47; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
48; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
49; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
50; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
51; GFX8-NEXT:    v_mov_b32_e32 v1, s1
52; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
53; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
54; GFX8-NEXT:    flat_load_dword v3, v[0:1]
55; GFX8-NEXT:    v_mov_b32_e32 v1, s3
56; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
57; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
58; GFX8-NEXT:    flat_load_dword v0, v[0:1]
59; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
60; GFX8-NEXT:    s_waitcnt vmcnt(1)
61; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
62; GFX8-NEXT:    v_bfe_i32 v4, v3, 8, 8
63; GFX8-NEXT:    v_bfe_i32 v6, v3, 16, 8
64; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
65; GFX8-NEXT:    s_waitcnt vmcnt(0)
66; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
67; GFX8-NEXT:    v_bfe_i32 v5, v0, 8, 8
68; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s0
70; GFX8-NEXT:    v_bfe_i32 v7, v0, 16, 8
71; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
72; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
73; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v7, v1
74; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
75; GFX8-NEXT:    v_mov_b32_e32 v0, s4
76; GFX8-NEXT:    v_mov_b32_e32 v1, s5
77; GFX8-NEXT:    flat_store_dword v[0:1], v2
78; GFX8-NEXT:    s_endpgm
79;
80; GFX9-NODL-LABEL: idot4_acc32:
81; GFX9-NODL:       ; %bb.0: ; %entry
82; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
83; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
84; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
85; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
86; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
87; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
88; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
89; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
90; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
91; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
92; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
93; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
94; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
95; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
96; GFX9-NODL-NEXT:    v_add3_u32 v2, v3, s0, v4
97; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
98; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
99; GFX9-NODL-NEXT:    s_endpgm
100;
101; GFX9-DL-LABEL: idot4_acc32:
102; GFX9-DL:       ; %bb.0: ; %entry
103; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
104; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
105; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
106; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
107; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
108; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
109; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
110; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
111; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
112; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, s0
113; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
114; GFX9-DL-NEXT:    s_endpgm
115;
116; GFX10-DL-LABEL: idot4_acc32:
117; GFX10-DL:       ; %bb.0: ; %entry
118; GFX10-DL-NEXT:    s_clause 0x1
119; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
120; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
121; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
122; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
123; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
124; GFX10-DL-NEXT:    s_clause 0x1
125; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
126; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
127; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
128; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
129; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
130; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
131; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
132; GFX10-DL-NEXT:    v_dot4c_i32_i8 v0, v1, v2
133; GFX10-DL-NEXT:    global_store_dword v3, v0, s[6:7]
134; GFX10-DL-NEXT:    s_endpgm
135;
136; GFX11-DL-LABEL: idot4_acc32:
137; GFX11-DL:       ; %bb.0: ; %entry
138; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
139; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
140; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
141; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
142; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
143; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
144; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
145; GFX11-DL-NEXT:    s_clause 0x1
146; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
147; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
148; GFX11-DL-NEXT:    s_load_b32 s0, s[4:5], 0x0
149; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
150; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, s0 neg_lo:[1,1,0]
151; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[4:5]
152; GFX11-DL-NEXT:    s_endpgm
153                                       ptr addrspace(1) %src2,
154                                       ptr addrspace(1) nocapture %dst) {
155entry:
; Per-work-item operands: element %idx of each <4 x i8> source array.
156  %idx = call i32 @llvm.amdgcn.workitem.id.x()
157  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
158  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
159  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
160  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
161
; Lane 0: sign-extend both bytes to i32 and multiply.
162  %v1e0 = extractelement <4 x i8> %vec1, i64 0
163  %cv1e0 = sext i8 %v1e0 to i32
164  %v2e0 = extractelement <4 x i8> %vec2, i64 0
165  %cv2e0 = sext i8 %v2e0 to i32
166  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
167
; Lane 1: same pattern.
168  %v1e1 = extractelement <4 x i8> %vec1, i64 1
169  %cv1e1 = sext i8 %v1e1 to i32
170  %v2e1 = extractelement <4 x i8> %vec2, i64 1
171  %cv2e1 = sext i8 %v2e1 to i32
172  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
173
; Lane 2: same pattern.
174  %v1e2 = extractelement <4 x i8> %vec1, i64 2
175  %cv1e2 = sext i8 %v1e2 to i32
176  %v2e2 = extractelement <4 x i8> %vec2, i64 2
177  %cv2e2 = sext i8 %v2e2 to i32
178  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
179
; Lane 3: same pattern.
180  %v1e3 = extractelement <4 x i8> %vec1, i64 3
181  %cv1e3 = sext i8 %v1e3 to i32
182  %v2e3 = extractelement <4 x i8> %vec2, i64 3
183  %cv2e3 = sext i8 %v2e3 to i32
184  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
185
; Read the i32 accumulator from %dst, add the four products in lane order and
; write the sum back to the same location.
186  %acc = load i32, ptr addrspace(1) %dst, align 4
187  %add1 = add i32 %mul1, %acc
188  %add2 = add i32 %add1, %mul2
189  %add3 = add i32 %add2, %mul3
190  %add4 = add i32 %add3, %mul4
191  store i32 %add4, ptr addrspace(1) %dst, align 4
192  ret void
193}
194
; idot4_acc16: signed 4 x i8 dot product accumulated into an i16.
; Same shape as the i32 variant, but every lane is sign-extended to i16, the
; multiplies and adds are i16, and the accumulator is an i16 load/store on %dst.
; Per the expected output below, the dot-product targets load the accumulator
; with a sign-extending 16-bit load (global_load_sshort / global_load_i16) and
; still select v_dot4_i32_i8 (gfx906), v_dot4c_i32_i8 (gfx1011/gfx1012) or
; v_dot4_i32_iu8 with neg_lo:[1,1,0] (gfx1100), then store only the low 16 bits.
; gfx700 uses a v_mad_u32_u24 chain on operands masked with 0xffff, gfx803 uses
; v_mad_u16 and gfx900 uses v_mad_legacy_u16.
195define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
196; GFX7-LABEL: idot4_acc16:
197; GFX7:       ; %bb.0: ; %entry
198; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
199; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
200; GFX7-NEXT:    s_mov_b32 s3, 0xf000
201; GFX7-NEXT:    s_mov_b32 s6, 0
202; GFX7-NEXT:    s_mov_b32 s7, s3
203; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
204; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
205; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
206; GFX7-NEXT:    v_mov_b32_e32 v1, 0
207; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
208; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
209; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
210; GFX7-NEXT:    s_mov_b32 s2, -1
211; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
212; GFX7-NEXT:    s_waitcnt vmcnt(2)
213; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
214; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 8
215; GFX7-NEXT:    s_waitcnt vmcnt(1)
216; GFX7-NEXT:    v_bfe_i32 v6, v0, 0, 8
217; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
218; GFX7-NEXT:    v_bfe_i32 v7, v0, 8, 8
219; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
220; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 8
221; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
222; GFX7-NEXT:    v_bfe_i32 v8, v0, 16, 8
223; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v7
224; GFX7-NEXT:    s_waitcnt vmcnt(0)
225; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
226; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
227; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v5
228; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
229; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v8
230; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
231; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
232; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
233; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
234; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
235; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
236; GFX7-NEXT:    s_endpgm
237;
238; GFX8-LABEL: idot4_acc16:
239; GFX8:       ; %bb.0: ; %entry
240; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
241; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
242; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
243; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
244; GFX8-NEXT:    v_mov_b32_e32 v1, s1
245; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
246; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
247; GFX8-NEXT:    flat_load_dword v3, v[0:1]
248; GFX8-NEXT:    v_mov_b32_e32 v1, s3
249; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
250; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
251; GFX8-NEXT:    flat_load_dword v2, v[0:1]
252; GFX8-NEXT:    v_mov_b32_e32 v0, s4
253; GFX8-NEXT:    v_mov_b32_e32 v1, s5
254; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
255; GFX8-NEXT:    s_waitcnt vmcnt(2)
256; GFX8-NEXT:    v_bfe_i32 v7, v3, 0, 8
257; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
258; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
259; GFX8-NEXT:    v_bfe_i32 v9, v9, 0, 8
260; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
261; GFX8-NEXT:    v_bfe_i32 v5, v5, 0, 8
262; GFX8-NEXT:    s_waitcnt vmcnt(1)
263; GFX8-NEXT:    v_bfe_i32 v8, v2, 0, 8
264; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v2
265; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
266; GFX8-NEXT:    v_bfe_i32 v10, v10, 0, 8
267; GFX8-NEXT:    s_waitcnt vmcnt(0)
268; GFX8-NEXT:    v_mad_u16 v4, v7, v8, v4
269; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
270; GFX8-NEXT:    v_bfe_i32 v6, v6, 0, 8
271; GFX8-NEXT:    v_mad_u16 v4, v9, v10, v4
272; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
273; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 8
274; GFX8-NEXT:    v_mad_u16 v4, v5, v6, v4
275; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
276; GFX8-NEXT:    flat_store_short v[0:1], v2
277; GFX8-NEXT:    s_endpgm
278;
279; GFX9-NODL-LABEL: idot4_acc16:
280; GFX9-NODL:       ; %bb.0: ; %entry
281; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
282; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
283; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
284; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
285; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
286; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
287; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
288; GFX9-NODL-NEXT:    global_load_ushort v3, v0, s[6:7]
289; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
290; GFX9-NODL-NEXT:    v_bfe_i32 v6, v1, 0, 8
291; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
292; GFX9-NODL-NEXT:    v_bfe_i32 v7, v2, 0, 8
293; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
294; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
295; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
296; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
297; GFX9-NODL-NEXT:    v_bfe_i32 v8, v8, 0, 8
298; GFX9-NODL-NEXT:    v_bfe_i32 v9, v9, 0, 8
299; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
300; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
301; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
302; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
303; GFX9-NODL-NEXT:    v_bfe_i32 v4, v4, 0, 8
304; GFX9-NODL-NEXT:    v_bfe_i32 v5, v5, 0, 8
305; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
306; GFX9-NODL-NEXT:    v_bfe_i32 v1, v1, 0, 8
307; GFX9-NODL-NEXT:    v_bfe_i32 v2, v2, 0, 8
308; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
309; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
310; GFX9-NODL-NEXT:    global_store_short v0, v1, s[6:7]
311; GFX9-NODL-NEXT:    s_endpgm
312;
313; GFX9-DL-LABEL: idot4_acc16:
314; GFX9-DL:       ; %bb.0: ; %entry
315; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
316; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
317; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
318; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
319; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
320; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
321; GFX9-DL-NEXT:    global_load_dword v3, v0, s[2:3]
322; GFX9-DL-NEXT:    global_load_sshort v4, v1, s[6:7]
323; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
324; GFX9-DL-NEXT:    v_dot4_i32_i8 v0, v2, v3, v4
325; GFX9-DL-NEXT:    global_store_short v1, v0, s[6:7]
326; GFX9-DL-NEXT:    s_endpgm
327;
328; GFX10-DL-LABEL: idot4_acc16:
329; GFX10-DL:       ; %bb.0: ; %entry
330; GFX10-DL-NEXT:    s_clause 0x1
331; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
332; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
333; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
334; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
335; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
336; GFX10-DL-NEXT:    s_clause 0x1
337; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
338; GFX10-DL-NEXT:    global_load_dword v3, v0, s[2:3]
339; GFX10-DL-NEXT:    global_load_sshort v4, v1, s[6:7]
340; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
341; GFX10-DL-NEXT:    v_dot4c_i32_i8 v4, v2, v3
342; GFX10-DL-NEXT:    global_store_short v1, v4, s[6:7]
343; GFX10-DL-NEXT:    s_endpgm
344;
345; GFX11-DL-LABEL: idot4_acc16:
346; GFX11-DL:       ; %bb.0: ; %entry
347; GFX11-DL-NEXT:    s_clause 0x1
348; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
349; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
350; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
351; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
352; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
353; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
354; GFX11-DL-NEXT:    s_clause 0x1
355; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[0:1]
356; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
357; GFX11-DL-NEXT:    global_load_i16 v3, v1, s[4:5]
358; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
359; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v2, v0, v3 neg_lo:[1,1,0]
360; GFX11-DL-NEXT:    global_store_b16 v1, v0, s[4:5]
361; GFX11-DL-NEXT:    s_endpgm
362                                       ptr addrspace(1) %src2,
363                                       ptr addrspace(1) nocapture %dst) {
364entry:
; Per-work-item operands: element %idx of each <4 x i8> source array.
365  %idx = call i32 @llvm.amdgcn.workitem.id.x()
366  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
367  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
368  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
369  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
370
; Lane 0: sign-extend both bytes to i16 and multiply (nsw only, no nuw).
371  %v1e0 = extractelement <4 x i8> %vec1, i64 0
372  %cv1e0 = sext i8 %v1e0 to i16
373  %v2e0 = extractelement <4 x i8> %vec2, i64 0
374  %cv2e0 = sext i8 %v2e0 to i16
375  %mul1 = mul nsw i16 %cv1e0, %cv2e0
376
; Lane 1: same pattern.
377  %v1e1 = extractelement <4 x i8> %vec1, i64 1
378  %cv1e1 = sext i8 %v1e1 to i16
379  %v2e1 = extractelement <4 x i8> %vec2, i64 1
380  %cv2e1 = sext i8 %v2e1 to i16
381  %mul2 = mul nsw i16 %cv1e1, %cv2e1
382
; Lane 2: same pattern.
383  %v1e2 = extractelement <4 x i8> %vec1, i64 2
384  %cv1e2 = sext i8 %v1e2 to i16
385  %v2e2 = extractelement <4 x i8> %vec2, i64 2
386  %cv2e2 = sext i8 %v2e2 to i16
387  %mul3 = mul nsw i16 %cv1e2, %cv2e2
388
; Lane 3: same pattern.
389  %v1e3 = extractelement <4 x i8> %vec1, i64 3
390  %cv1e3 = sext i8 %v1e3 to i16
391  %v2e3 = extractelement <4 x i8> %vec2, i64 3
392  %cv2e3 = sext i8 %v2e3 to i16
393  %mul4 = mul nsw i16 %cv1e3, %cv2e3
394
; Read the i16 accumulator from %dst, add the four products in lane order and
; write the i16 sum back to the same location.
395  %acc = load i16, ptr addrspace(1) %dst, align 2
396  %add1 = add i16 %mul1, %acc
397  %add2 = add i16 %add1, %mul2
398  %add3 = add i16 %add2, %mul3
399  %add4 = add i16 %add3, %mul4
400  store i16 %add4, ptr addrspace(1) %dst, align 2
401  ret void
402}
403
; idot4_acc8: 4 x i8 dot product accumulated into an i8.
; Unlike the i32 and i16 variants there are no sext instructions here: the
; extracted i8 lanes are multiplied directly as i8, and the accumulator is an
; i8 load/store on %dst. Only the last add carries the nsw flag.
; Per the expected output below, all dot-product targets (gfx906,
; gfx1011/gfx1012, gfx1100) load the accumulator with a zero-extending byte
; load and select the unsigned v_dot4_u32_u8, storing only the low byte.
; gfx700 uses a v_mad_u32_u24 chain, gfx803 uses v_mad_u16 and gfx900 uses
; v_mad_legacy_u16.
404define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
405; GFX7-LABEL: idot4_acc8:
406; GFX7:       ; %bb.0: ; %entry
407; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
408; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
409; GFX7-NEXT:    s_mov_b32 s3, 0xf000
410; GFX7-NEXT:    s_mov_b32 s6, 0
411; GFX7-NEXT:    s_mov_b32 s7, s3
412; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
413; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
414; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
415; GFX7-NEXT:    v_mov_b32_e32 v1, 0
416; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
417; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
418; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
419; GFX7-NEXT:    s_mov_b32 s2, -1
420; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
421; GFX7-NEXT:    s_waitcnt vmcnt(2)
422; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
423; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
424; GFX7-NEXT:    s_waitcnt vmcnt(1)
425; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
426; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
427; GFX7-NEXT:    s_waitcnt vmcnt(0)
428; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
429; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
430; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
431; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
432; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
433; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
434; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
435; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
436; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
437; GFX7-NEXT:    s_endpgm
438;
439; GFX8-LABEL: idot4_acc8:
440; GFX8:       ; %bb.0: ; %entry
441; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
442; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
443; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
444; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
445; GFX8-NEXT:    v_mov_b32_e32 v1, s1
446; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
447; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
448; GFX8-NEXT:    flat_load_dword v3, v[0:1]
449; GFX8-NEXT:    v_mov_b32_e32 v1, s3
450; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
451; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
452; GFX8-NEXT:    flat_load_dword v2, v[0:1]
453; GFX8-NEXT:    v_mov_b32_e32 v0, s4
454; GFX8-NEXT:    v_mov_b32_e32 v1, s5
455; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
456; GFX8-NEXT:    s_waitcnt vmcnt(2)
457; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
458; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
459; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
460; GFX8-NEXT:    s_waitcnt vmcnt(1)
461; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
462; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
463; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
464; GFX8-NEXT:    s_waitcnt vmcnt(0)
465; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
466; GFX8-NEXT:    v_mad_u16 v2, v7, v8, v2
467; GFX8-NEXT:    v_mad_u16 v2, v5, v6, v2
468; GFX8-NEXT:    v_mad_u16 v2, v9, v10, v2
469; GFX8-NEXT:    flat_store_byte v[0:1], v2
470; GFX8-NEXT:    s_endpgm
471;
472; GFX9-NODL-LABEL: idot4_acc8:
473; GFX9-NODL:       ; %bb.0: ; %entry
474; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
475; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
476; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
477; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
478; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
479; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
480; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
481; GFX9-NODL-NEXT:    global_load_ubyte v3, v0, s[6:7]
482; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
483; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
484; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
485; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
486; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
487; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
488; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
489; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
490; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
491; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v6, v7, v1
492; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
493; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
494; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v8, v9, v1
495; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[6:7]
496; GFX9-NODL-NEXT:    s_endpgm
497;
498; GFX9-DL-LABEL: idot4_acc8:
499; GFX9-DL:       ; %bb.0: ; %entry
500; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
501; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
502; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
503; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
504; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
505; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
506; GFX9-DL-NEXT:    global_load_dword v3, v0, s[2:3]
507; GFX9-DL-NEXT:    global_load_ubyte v4, v1, s[6:7]
508; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
509; GFX9-DL-NEXT:    v_dot4_u32_u8 v0, v2, v3, v4
510; GFX9-DL-NEXT:    global_store_byte v1, v0, s[6:7]
511; GFX9-DL-NEXT:    s_endpgm
512;
513; GFX10-DL-LABEL: idot4_acc8:
514; GFX10-DL:       ; %bb.0: ; %entry
515; GFX10-DL-NEXT:    s_clause 0x1
516; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
517; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
518; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
519; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
520; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
521; GFX10-DL-NEXT:    s_clause 0x1
522; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
523; GFX10-DL-NEXT:    global_load_dword v3, v0, s[2:3]
524; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[6:7]
525; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
526; GFX10-DL-NEXT:    v_dot4_u32_u8 v0, v2, v3, v4
527; GFX10-DL-NEXT:    global_store_byte v1, v0, s[6:7]
528; GFX10-DL-NEXT:    s_endpgm
529;
530; GFX11-DL-LABEL: idot4_acc8:
531; GFX11-DL:       ; %bb.0: ; %entry
532; GFX11-DL-NEXT:    s_clause 0x1
533; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
534; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
535; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
536; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
537; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
538; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
539; GFX11-DL-NEXT:    s_clause 0x1
540; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[0:1]
541; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
542; GFX11-DL-NEXT:    global_load_u8 v3, v1, s[4:5]
543; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
544; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v2, v0, v3
545; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[4:5]
546; GFX11-DL-NEXT:    s_endpgm
547                                      ptr addrspace(1) %src2,
548                                      ptr addrspace(1) nocapture %dst) {
549entry:
; Per-work-item operands: element %idx of each <4 x i8> source array.
550  %idx = call i32 @llvm.amdgcn.workitem.id.x()
551  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
552  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
553  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
554  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
555
; Lane 0: multiply the raw i8 elements (no extension, no wrap flags).
556  %v1e0 = extractelement <4 x i8> %vec1, i64 0
557  %v2e0 = extractelement <4 x i8> %vec2, i64 0
558  %mul1 = mul i8 %v1e0, %v2e0
559
; Lane 1: same pattern.
560  %v1e1 = extractelement <4 x i8> %vec1, i64 1
561  %v2e1 = extractelement <4 x i8> %vec2, i64 1
562  %mul2 = mul i8 %v1e1, %v2e1
563
; Lane 2: same pattern.
564  %v1e2 = extractelement <4 x i8> %vec1, i64 2
565  %v2e2 = extractelement <4 x i8> %vec2, i64 2
566  %mul3 = mul i8 %v1e2, %v2e2
567
; Lane 3: same pattern.
568  %v1e3 = extractelement <4 x i8> %vec1, i64 3
569  %v2e3 = extractelement <4 x i8> %vec2, i64 3
570  %mul4 = mul i8 %v1e3, %v2e3
571
; Read the i8 accumulator from %dst (declared align 2), add the four products
; in lane order and write the i8 sum back; only the final add is marked nsw.
572  %acc = load i8, ptr addrspace(1) %dst, align 2
573  %add1 = add i8 %mul1, %acc
574  %add2 = add i8 %add1, %mul2
575  %add3 = add i8 %add2, %mul3
576  %add4 = add nsw i8 %add3, %mul4
577  store i8 %add4, ptr addrspace(1) %dst, align 2
578  ret void
579}
580
581define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
582; GFX7-LABEL: idot4_multiuse_mul1:
583; GFX7:       ; %bb.0: ; %entry
584; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
585; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
586; GFX7-NEXT:    s_mov_b32 s3, 0xf000
587; GFX7-NEXT:    s_mov_b32 s6, 0
588; GFX7-NEXT:    s_mov_b32 s7, s3
589; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
590; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
591; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
592; GFX7-NEXT:    v_mov_b32_e32 v1, 0
593; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
594; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
595; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
596; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
597; GFX7-NEXT:    s_mov_b32 s2, -1
598; GFX7-NEXT:    s_waitcnt vmcnt(1)
599; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
600; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
601; GFX7-NEXT:    s_waitcnt vmcnt(0)
602; GFX7-NEXT:    v_bfe_i32 v5, v0, 0, 8
603; GFX7-NEXT:    v_bfe_i32 v6, v0, 8, 8
604; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
605; GFX7-NEXT:    v_mad_i32_i24 v8, v1, v5, s4
606; GFX7-NEXT:    v_mad_i32_i24 v3, v3, v6, v8
607; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 8
608; GFX7-NEXT:    v_bfe_i32 v7, v0, 16, 8
609; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v5, v3
610; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
611; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
612; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v7, v1
613; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
614; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
615; GFX7-NEXT:    s_endpgm
616;
617; GFX8-LABEL: idot4_multiuse_mul1:
618; GFX8:       ; %bb.0: ; %entry
619; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
620; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
621; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
622; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
623; GFX8-NEXT:    v_mov_b32_e32 v1, s1
624; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
625; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
626; GFX8-NEXT:    flat_load_dword v3, v[0:1]
627; GFX8-NEXT:    v_mov_b32_e32 v1, s3
628; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
629; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
630; GFX8-NEXT:    flat_load_dword v0, v[0:1]
631; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
632; GFX8-NEXT:    s_waitcnt vmcnt(1)
633; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
634; GFX8-NEXT:    v_bfe_i32 v4, v3, 8, 8
635; GFX8-NEXT:    v_bfe_i32 v6, v3, 16, 8
636; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
637; GFX8-NEXT:    s_waitcnt vmcnt(0)
638; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
639; GFX8-NEXT:    v_bfe_i32 v5, v0, 8, 8
640; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
641; GFX8-NEXT:    v_mad_i32_i24 v8, v1, v2, s0
642; GFX8-NEXT:    v_mad_i32_i24 v4, v4, v5, v8
643; GFX8-NEXT:    v_bfe_i32 v7, v0, 16, 8
644; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, v4
645; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
646; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v7, v1
647; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
648; GFX8-NEXT:    v_mov_b32_e32 v0, s4
649; GFX8-NEXT:    v_mov_b32_e32 v1, s5
650; GFX8-NEXT:    flat_store_dword v[0:1], v2
651; GFX8-NEXT:    s_endpgm
652;
653; GFX9-NODL-LABEL: idot4_multiuse_mul1:
654; GFX9-NODL:       ; %bb.0: ; %entry
655; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
656; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
657; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
658; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
659; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
660; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
661; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
662; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
663; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
664; GFX9-NODL-NEXT:    v_bfe_i32 v3, v1, 0, 8
665; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
666; GFX9-NODL-NEXT:    v_bfe_i32 v4, v2, 0, 8
667; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
668; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
669; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
670; GFX9-NODL-NEXT:    v_mul_i32_i24_e32 v2, v3, v4
671; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
672; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, v3, v4, s0
673; GFX9-NODL-NEXT:    v_add3_u32 v2, v5, v3, v2
674; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
675; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
676; GFX9-NODL-NEXT:    s_endpgm
677;
678; GFX9-DL-LABEL: idot4_multiuse_mul1:
679; GFX9-DL:       ; %bb.0: ; %entry
680; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
681; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
682; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
683; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
684; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
685; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
686; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
687; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
688; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
689; GFX9-DL-NEXT:    v_bfe_i32 v3, v1, 0, 8
690; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
691; GFX9-DL-NEXT:    v_bfe_i32 v4, v2, 0, 8
692; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
693; GFX9-DL-NEXT:    v_mad_i32_i24 v3, v3, v4, s0
694; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, v3
695; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
696; GFX9-DL-NEXT:    s_endpgm
697;
698; GFX10-DL-LABEL: idot4_multiuse_mul1:
699; GFX10-DL:       ; %bb.0: ; %entry
700; GFX10-DL-NEXT:    s_clause 0x1
701; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
702; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
703; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
704; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
705; GFX10-DL-NEXT:    s_clause 0x1
706; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
707; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
708; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
709; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
710; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
711; GFX10-DL-NEXT:    v_bfe_i32 v0, v1, 0, 8
712; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
713; GFX10-DL-NEXT:    v_bfe_i32 v3, v2, 0, 8
714; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
715; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v0, v3, s0
716; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
717; GFX10-DL-NEXT:    v_dot4c_i32_i8 v0, v1, v2
718; GFX10-DL-NEXT:    global_store_dword v3, v0, s[6:7]
719; GFX10-DL-NEXT:    s_endpgm
720;
721; GFX11-DL-LABEL: idot4_multiuse_mul1:
722; GFX11-DL:       ; %bb.0: ; %entry
723; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
724; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
725; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
726; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
727; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
728; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
729; GFX11-DL-NEXT:    s_clause 0x1
730; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
731; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
732; GFX11-DL-NEXT:    s_load_b32 s0, s[4:5], 0x0
733; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
734; GFX11-DL-NEXT:    v_bfe_i32 v2, v1, 0, 8
735; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
736; GFX11-DL-NEXT:    v_bfe_i32 v3, v0, 0, 8
737; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
738; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
739; GFX11-DL-NEXT:    v_mad_i32_i24 v2, v2, v3, s0
740; GFX11-DL-NEXT:    v_mov_b32_e32 v3, 0
741; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0]
742; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[4:5]
743; GFX11-DL-NEXT:    s_endpgm
744                                               ptr addrspace(1) %src2,
745                                               ptr addrspace(1) nocapture %dst) {
746entry:
747  %idx = call i32 @llvm.amdgcn.workitem.id.x()
748  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
749  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
750  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
751  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
752
753  %v1e0 = extractelement <4 x i8> %vec1, i64 0
754  %cv1e0 = sext i8 %v1e0 to i32
755  %v2e0 = extractelement <4 x i8> %vec2, i64 0
756  %cv2e0 = sext i8 %v2e0 to i32
757  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
758
759  %v1e1 = extractelement <4 x i8> %vec1, i64 1
760  %cv1e1 = sext i8 %v1e1 to i32
761  %v2e1 = extractelement <4 x i8> %vec2, i64 1
762  %cv2e1 = sext i8 %v2e1 to i32
763  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
764
765  %v1e2 = extractelement <4 x i8> %vec1, i64 2
766  %cv1e2 = sext i8 %v1e2 to i32
767  %v2e2 = extractelement <4 x i8> %vec2, i64 2
768  %cv2e2 = sext i8 %v2e2 to i32
769  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
770
771  %v1e3 = extractelement <4 x i8> %vec1, i64 3
772  %cv1e3 = sext i8 %v1e3 to i32
773  %v2e3 = extractelement <4 x i8> %vec2, i64 3
774  %cv2e3 = sext i8 %v2e3 to i32
775  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
776
777  %acc = load i32, ptr addrspace(1) %dst, align 4
778  %add = add i32 %mul1, %acc
779  %add1 = add i32 %mul2, %add
780  %add2 = add i32 %add1, %mul1
781  %add3 = add i32 %add2, %mul3
782  %add4 = add i32 %add3, %mul4
783
784  store i32 %add4, ptr addrspace(1) %dst, align 4
785  ret void
786}
787
; idot4_acc32_vecMul: signed 4 x i8 dot product written with vector IR.
; Each work-item loads one <4 x i8> from %src1 and one from %src2 (both indexed
; by workitem.id.x), sign-extends both to <4 x i32>, multiplies them with a
; single vector mul, and adds the four extracted lanes to the i32 loaded from
; %dst; the sum is stored back to %dst.
; Per the assertions below: gfx906 emits one v_dot4_i32_i8, gfx1011/gfx1012
; emit v_dot4c_i32_i8, and gfx1100 emits v_dot4_i32_iu8 with neg_lo:[1,1,0].
; gfx700 and gfx803 use a chain of four v_mad_i32_i24, and gfx900 uses
; v_mul_i32_i24_sdwa plus v_add3_u32.
788define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
789; GFX7-LABEL: idot4_acc32_vecMul:
790; GFX7:       ; %bb.0: ; %entry
791; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
792; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
793; GFX7-NEXT:    s_mov_b32 s3, 0xf000
794; GFX7-NEXT:    s_mov_b32 s6, 0
795; GFX7-NEXT:    s_mov_b32 s7, s3
796; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
797; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
798; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
799; GFX7-NEXT:    v_mov_b32_e32 v1, 0
800; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
801; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
802; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
803; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
804; GFX7-NEXT:    s_mov_b32 s2, -1
805; GFX7-NEXT:    s_waitcnt vmcnt(1)
806; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 24, v2
807; GFX7-NEXT:    v_bfe_i32 v3, v2, 16, 8
808; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 8
809; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 8
810; GFX7-NEXT:    s_waitcnt vmcnt(0)
811; GFX7-NEXT:    v_ashrrev_i32_e32 v5, 24, v0
812; GFX7-NEXT:    v_bfe_i32 v6, v0, 16, 8
813; GFX7-NEXT:    v_bfe_i32 v7, v0, 8, 8
814; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 8
815; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
816; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, s4
817; GFX7-NEXT:    v_mad_i32_i24 v0, v4, v7, v0
818; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v6, v0
819; GFX7-NEXT:    v_mad_i32_i24 v0, v1, v5, v0
820; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
821; GFX7-NEXT:    s_endpgm
822;
823; GFX8-LABEL: idot4_acc32_vecMul:
824; GFX8:       ; %bb.0: ; %entry
825; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
826; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
827; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
828; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
829; GFX8-NEXT:    v_mov_b32_e32 v1, s1
830; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
831; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
832; GFX8-NEXT:    flat_load_dword v3, v[0:1]
833; GFX8-NEXT:    v_mov_b32_e32 v1, s3
834; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
835; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
836; GFX8-NEXT:    flat_load_dword v0, v[0:1]
837; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
838; GFX8-NEXT:    s_waitcnt vmcnt(1)
839; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 8, v3
840; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 24, v3
841; GFX8-NEXT:    v_bfe_i32 v5, v3, 16, 8
842; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
843; GFX8-NEXT:    v_bfe_i32 v1, v1, 0, 8
844; GFX8-NEXT:    s_waitcnt vmcnt(0)
845; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 8, v0
846; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 24, v0
847; GFX8-NEXT:    v_bfe_i32 v7, v0, 16, 8
848; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
849; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 8
850; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
851; GFX8-NEXT:    v_mad_i32_i24 v0, v3, v0, s0
852; GFX8-NEXT:    v_mad_i32_i24 v0, v1, v2, v0
853; GFX8-NEXT:    v_mad_i32_i24 v0, v5, v7, v0
854; GFX8-NEXT:    v_mad_i32_i24 v2, v4, v6, v0
855; GFX8-NEXT:    v_mov_b32_e32 v0, s4
856; GFX8-NEXT:    v_mov_b32_e32 v1, s5
857; GFX8-NEXT:    flat_store_dword v[0:1], v2
858; GFX8-NEXT:    s_endpgm
859;
860; GFX9-NODL-LABEL: idot4_acc32_vecMul:
861; GFX9-NODL:       ; %bb.0: ; %entry
862; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
863; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
864; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
865; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
866; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
867; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
868; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
869; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
870; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
871; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v3, 8, v1
872; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
873; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v4, 8, v2
874; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
875; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v3), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
876; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
877; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
878; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
879; GFX9-NODL-NEXT:    v_add3_u32 v2, v5, s0, v3
880; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
881; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
882; GFX9-NODL-NEXT:    s_endpgm
883;
884; GFX9-DL-LABEL: idot4_acc32_vecMul:
885; GFX9-DL:       ; %bb.0: ; %entry
886; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
887; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
888; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
889; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
890; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
891; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
892; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
893; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
894; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
895; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, s0
896; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
897; GFX9-DL-NEXT:    s_endpgm
898;
899; GFX10-DL-LABEL: idot4_acc32_vecMul:
900; GFX10-DL:       ; %bb.0: ; %entry
901; GFX10-DL-NEXT:    s_clause 0x1
902; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
903; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
904; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
905; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
906; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
907; GFX10-DL-NEXT:    s_clause 0x1
908; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
909; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
910; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
911; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
912; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
913; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s0
914; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
915; GFX10-DL-NEXT:    v_dot4c_i32_i8 v0, v1, v2
916; GFX10-DL-NEXT:    global_store_dword v3, v0, s[6:7]
917; GFX10-DL-NEXT:    s_endpgm
918;
919; GFX11-DL-LABEL: idot4_acc32_vecMul:
920; GFX11-DL:       ; %bb.0: ; %entry
921; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
922; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
923; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
924; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
925; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
926; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
927; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
928; GFX11-DL-NEXT:    s_clause 0x1
929; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
930; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
931; GFX11-DL-NEXT:    s_load_b32 s0, s[4:5], 0x0
932; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
933; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, s0 neg_lo:[1,1,0]
934; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[4:5]
935; GFX11-DL-NEXT:    s_endpgm
936                                              ptr addrspace(1) %src2,
937                                              ptr addrspace(1) nocapture %dst) {
938entry:
939  %idx = call i32 @llvm.amdgcn.workitem.id.x()
940  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
941  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
942  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
943  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
944
  ; Widen every byte lane as signed so the products are signed i32 values.
945  %cvec1 = sext <4 x i8> %vec1 to <4 x i32>
946  %cvec2 = sext <4 x i8> %vec2 to <4 x i32>
947
  ; One vector multiply; the lanes are extracted and summed as scalars below.
948  %mul = mul <4 x i32> %cvec1, %cvec2
949  %mul0 = extractelement <4 x i32> %mul, i64 0
950  %mul1 = extractelement <4 x i32> %mul, i64 1
951  %mul2 = extractelement <4 x i32> %mul, i64 2
952  %mul3 = extractelement <4 x i32> %mul, i64 3
953
  ; The accumulator is read from %dst and the final sum is written back to it.
954  %acc = load i32, ptr addrspace(1) %dst, align 4
955  %add1 = add i32 %mul0, %acc
956  %add2 = add i32 %add1, %mul1
957  %add3 = add i32 %add2, %mul2
958  %add4 = add i32 %add3, %mul3
959
960  store i32 %add4, ptr addrspace(1) %dst, align 4
961  ret void
962}
963
; idot4_acc16_vecMul: the same vector-IR shape as a signed 4 x i8 dot product,
; but widened only to i16. Both <4 x i8> inputs are sign-extended to <4 x i16>,
; multiplied as <4 x i16>, and the four extracted lanes are added to an i16
; accumulator that is loaded from and stored back to %dst.
; Per the assertions below, no target emits a dot4 instruction for this input:
; gfx700 uses v_mad_u32_u24, gfx803 uses v_mad_u16, and gfx900, gfx906,
; gfx1011/gfx1012 and gfx1100 use v_pk_mul_lo_u16 followed by 16-bit adds.
964define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
965; GFX7-LABEL: idot4_acc16_vecMul:
966; GFX7:       ; %bb.0: ; %entry
967; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
968; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
969; GFX7-NEXT:    s_mov_b32 s3, 0xf000
970; GFX7-NEXT:    s_mov_b32 s6, 0
971; GFX7-NEXT:    s_mov_b32 s7, s3
972; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
973; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
974; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
975; GFX7-NEXT:    v_mov_b32_e32 v1, 0
976; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
977; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
978; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
979; GFX7-NEXT:    s_mov_b32 s2, -1
980; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
981; GFX7-NEXT:    s_waitcnt vmcnt(2)
982; GFX7-NEXT:    v_bfe_i32 v4, v2, 0, 8
983; GFX7-NEXT:    v_bfe_i32 v3, v2, 16, 8
984; GFX7-NEXT:    s_waitcnt vmcnt(1)
985; GFX7-NEXT:    v_bfe_i32 v7, v0, 0, 8
986; GFX7-NEXT:    v_ashrrev_i32_e32 v5, 24, v2
987; GFX7-NEXT:    v_bfe_i32 v2, v2, 8, 8
988; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
989; GFX7-NEXT:    v_bfe_i32 v6, v0, 16, 8
990; GFX7-NEXT:    v_ashrrev_i32_e32 v8, 24, v0
991; GFX7-NEXT:    v_bfe_i32 v0, v0, 8, 8
992; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v7
993; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
994; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
995; GFX7-NEXT:    s_waitcnt vmcnt(0)
996; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
997; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
998; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
999; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
1000; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v5
1001; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v8
1002; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v6, v0
1003; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v8, v0
1004; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
1005; GFX7-NEXT:    s_endpgm
1006;
1007; GFX8-LABEL: idot4_acc16_vecMul:
1008; GFX8:       ; %bb.0: ; %entry
1009; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1010; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1011; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1012; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1013; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1014; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1015; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1016; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1017; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1018; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1019; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1020; GFX8-NEXT:    flat_load_dword v2, v[0:1]
1021; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1022; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1023; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
1024; GFX8-NEXT:    s_waitcnt vmcnt(2)
1025; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
1026; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 8, v3
1027; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
1028; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 8, v5
1029; GFX8-NEXT:    v_bfe_i32 v5, v5, 0, 8
1030; GFX8-NEXT:    s_waitcnt vmcnt(1)
1031; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
1032; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 8, v2
1033; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 8
1034; GFX8-NEXT:    s_waitcnt vmcnt(0)
1035; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
1036; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 8, v6
1037; GFX8-NEXT:    v_bfe_i32 v6, v6, 0, 8
1038; GFX8-NEXT:    v_mad_u16 v2, v7, v8, v2
1039; GFX8-NEXT:    v_mad_u16 v2, v5, v6, v2
1040; GFX8-NEXT:    v_mad_u16 v2, v9, v10, v2
1041; GFX8-NEXT:    flat_store_short v[0:1], v2
1042; GFX8-NEXT:    s_endpgm
1043;
1044; GFX9-NODL-LABEL: idot4_acc16_vecMul:
1045; GFX9-NODL:       ; %bb.0: ; %entry
1046; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1047; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1048; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1049; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1050; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
1051; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
1052; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1053; GFX9-NODL-NEXT:    global_load_ushort v3, v0, s[6:7]
1054; GFX9-NODL-NEXT:    s_mov_b32 s0, 0x5040100
1055; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
1056; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
1057; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
1058; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
1059; GFX9-NODL-NEXT:    v_ashrrev_i16_e32 v6, 8, v1
1060; GFX9-NODL-NEXT:    v_bfe_i32 v1, v1, 0, 8
1061; GFX9-NODL-NEXT:    v_ashrrev_i16_e32 v7, 8, v2
1062; GFX9-NODL-NEXT:    v_bfe_i32 v2, v2, 0, 8
1063; GFX9-NODL-NEXT:    v_perm_b32 v2, v7, v2, s0
1064; GFX9-NODL-NEXT:    v_perm_b32 v1, v6, v1, s0
1065; GFX9-NODL-NEXT:    v_ashrrev_i16_e32 v8, 8, v4
1066; GFX9-NODL-NEXT:    v_bfe_i32 v4, v4, 0, 8
1067; GFX9-NODL-NEXT:    v_ashrrev_i16_e32 v9, 8, v5
1068; GFX9-NODL-NEXT:    v_bfe_i32 v5, v5, 0, 8
1069; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
1070; GFX9-NODL-NEXT:    v_perm_b32 v5, v9, v5, s0
1071; GFX9-NODL-NEXT:    v_perm_b32 v4, v8, v4, s0
1072; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1073; GFX9-NODL-NEXT:    v_add_u16_e32 v3, v1, v3
1074; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v2, v4, v5
1075; GFX9-NODL-NEXT:    v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1076; GFX9-NODL-NEXT:    v_add_u16_e32 v1, v1, v2
1077; GFX9-NODL-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1078; GFX9-NODL-NEXT:    global_store_short v0, v1, s[6:7]
1079; GFX9-NODL-NEXT:    s_endpgm
1080;
1081; GFX9-DL-LABEL: idot4_acc16_vecMul:
1082; GFX9-DL:       ; %bb.0: ; %entry
1083; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1084; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1085; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1086; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1087; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1088; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1089; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1090; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[6:7]
1091; GFX9-DL-NEXT:    s_mov_b32 s0, 0x5040100
1092; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
1093; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
1094; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1095; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
1096; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 8, v1
1097; GFX9-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
1098; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 8, v2
1099; GFX9-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
1100; GFX9-DL-NEXT:    v_perm_b32 v2, v7, v2, s0
1101; GFX9-DL-NEXT:    v_perm_b32 v1, v6, v1, s0
1102; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 8, v4
1103; GFX9-DL-NEXT:    v_bfe_i32 v4, v4, 0, 8
1104; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 8, v5
1105; GFX9-DL-NEXT:    v_bfe_i32 v5, v5, 0, 8
1106; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
1107; GFX9-DL-NEXT:    v_perm_b32 v5, v9, v5, s0
1108; GFX9-DL-NEXT:    v_perm_b32 v4, v8, v4, s0
1109; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1110; GFX9-DL-NEXT:    v_add_u16_e32 v3, v1, v3
1111; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v4, v5
1112; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1113; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v2
1114; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1115; GFX9-DL-NEXT:    global_store_short v0, v1, s[6:7]
1116; GFX9-DL-NEXT:    s_endpgm
1117;
1118; GFX10-DL-LABEL: idot4_acc16_vecMul:
1119; GFX10-DL:       ; %bb.0: ; %entry
1120; GFX10-DL-NEXT:    s_clause 0x1
1121; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1122; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1123; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1124; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1125; GFX10-DL-NEXT:    s_clause 0x1
1126; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1127; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1128; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1129; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[6:7]
1130; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
1131; GFX10-DL-NEXT:    v_ashrrev_i16 v4, 8, v1
1132; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1133; GFX10-DL-NEXT:    v_ashrrev_i16 v5, 8, v2
1134; GFX10-DL-NEXT:    v_bfe_i32 v6, v2, 0, 8
1135; GFX10-DL-NEXT:    v_bfe_i32 v7, v1, 0, 8
1136; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1137; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1138; GFX10-DL-NEXT:    v_perm_b32 v5, v5, v6, 0x5040100
1139; GFX10-DL-NEXT:    v_perm_b32 v4, v4, v7, 0x5040100
1140; GFX10-DL-NEXT:    v_ashrrev_i16 v6, 8, v1
1141; GFX10-DL-NEXT:    v_ashrrev_i16 v7, 8, v2
1142; GFX10-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
1143; GFX10-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
1144; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
1145; GFX10-DL-NEXT:    v_perm_b32 v2, v7, v2, 0x5040100
1146; GFX10-DL-NEXT:    v_perm_b32 v1, v6, v1, 0x5040100
1147; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
1148; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1149; GFX10-DL-NEXT:    v_add_nc_u16 v3, v4, v3
1150; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
1151; GFX10-DL-NEXT:    v_add_nc_u16 v2, v3, v5
1152; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
1153; GFX10-DL-NEXT:    v_add_nc_u16 v1, v2, v1
1154; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
1155; GFX10-DL-NEXT:    global_store_short v0, v1, s[6:7]
1156; GFX10-DL-NEXT:    s_endpgm
1157;
1158; GFX11-DL-LABEL: idot4_acc16_vecMul:
1159; GFX11-DL:       ; %bb.0: ; %entry
1160; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1161; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1162; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1163; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
1164; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1165; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1166; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
1167; GFX11-DL-NEXT:    s_clause 0x1
1168; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
1169; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
1170; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[4:5]
1171; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
1172; GFX11-DL-NEXT:    v_ashrrev_i16 v4, 8, v1
1173; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
1174; GFX11-DL-NEXT:    v_ashrrev_i16 v5, 8, v0
1175; GFX11-DL-NEXT:    v_bfe_i32 v6, v0, 0, 8
1176; GFX11-DL-NEXT:    v_bfe_i32 v7, v1, 0, 8
1177; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1178; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1179; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1180; GFX11-DL-NEXT:    v_perm_b32 v5, v5, v6, 0x5040100
1181; GFX11-DL-NEXT:    v_perm_b32 v4, v4, v7, 0x5040100
1182; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1183; GFX11-DL-NEXT:    v_ashrrev_i16 v6, 8, v1
1184; GFX11-DL-NEXT:    v_ashrrev_i16 v7, 8, v0
1185; GFX11-DL-NEXT:    v_bfe_i32 v0, v0, 0, 8
1186; GFX11-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
1187; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
1188; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1189; GFX11-DL-NEXT:    v_perm_b32 v0, v7, v0, 0x5040100
1190; GFX11-DL-NEXT:    v_perm_b32 v1, v6, v1, 0x5040100
1191; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
1192; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
1193; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
1194; GFX11-DL-NEXT:    v_add_nc_u16 v3, v4, v3
1195; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
1196; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1197; GFX11-DL-NEXT:    v_add_nc_u16 v1, v3, v5
1198; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
1199; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1200; GFX11-DL-NEXT:    v_add_nc_u16 v0, v1, v0
1201; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v3
1202; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[4:5]
1203; GFX11-DL-NEXT:    s_endpgm
1204                                              ptr addrspace(1) %src2,
1205                                              ptr addrspace(1) nocapture %dst) {
1206entry:
1207  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1208  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
1209  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
1210  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
1211  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
1212
  ; Widen only to i16 here; products and the running sum stay 16 bits wide.
1213  %cvec1 = sext <4 x i8> %vec1 to <4 x i16>
1214  %cvec2 = sext <4 x i8> %vec2 to <4 x i16>
1215
1216  %mul = mul <4 x i16> %cvec1, %cvec2
1217  %mul0 = extractelement <4 x i16> %mul, i64 0
1218  %mul1 = extractelement <4 x i16> %mul, i64 1
1219  %mul2 = extractelement <4 x i16> %mul, i64 2
1220  %mul3 = extractelement <4 x i16> %mul, i64 3
1221
  ; The i16 accumulator load and the final store both carry align 4.
1222  %acc = load i16, ptr addrspace(1) %dst, align 4
1223  %add1 = add i16 %mul0, %acc
1224  %add2 = add i16 %add1, %mul1
1225  %add3 = add i16 %add2, %mul2
1226  %add4 = add i16 %add3, %mul3
1227
1228  store i16 %add4, ptr addrspace(1) %dst, align 4
1229  ret void
1230}
1231
; idot4_acc32_2ele: only byte lanes 0 and 1 of each <4 x i8> input take part.
; Each of those lanes is extracted, sign-extended to i32 and multiplied
; (mul nuw nsw); the two products are added to the i32 loaded from %dst and
; the sum is stored back to %dst.
; Per the assertions below: gfx906, gfx1011/gfx1012 and gfx1100 first apply
; v_perm_b32 with selector 0xc0c0100 to each operand and then emit a dot4
; instruction (v_dot4_i32_i8, v_dot4c_i32_i8 and v_dot4_i32_iu8 respectively).
; gfx700 and gfx803 use two v_mad_i32_i24; gfx900 uses two v_mul_i32_i24_sdwa
; and one v_add3_u32.
; Assumption to confirm against the ISA documentation: the 0xc0c0100 selector
; presumably keeps bytes 0-1 and zeroes bytes 2-3, so the unused lanes
; contribute nothing to the dot product.
1232define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
1233; GFX7-LABEL: idot4_acc32_2ele:
1234; GFX7:       ; %bb.0: ; %entry
1235; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1236; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1237; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1238; GFX7-NEXT:    s_mov_b32 s10, 0
1239; GFX7-NEXT:    s_mov_b32 s11, s7
1240; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1241; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
1242; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1243; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1244; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1245; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
1246; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1247; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
1248; GFX7-NEXT:    s_mov_b32 s6, -1
1249; GFX7-NEXT:    s_waitcnt vmcnt(1)
1250; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
1251; GFX7-NEXT:    v_bfe_i32 v2, v2, 8, 8
1252; GFX7-NEXT:    s_waitcnt vmcnt(0)
1253; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 8
1254; GFX7-NEXT:    v_bfe_i32 v0, v0, 8, 8
1255; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1256; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v3, s0
1257; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
1258; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1259; GFX7-NEXT:    s_endpgm
1260;
1261; GFX8-LABEL: idot4_acc32_2ele:
1262; GFX8:       ; %bb.0: ; %entry
1263; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1264; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1265; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1266; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1267; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1268; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1269; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1270; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1271; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1272; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1273; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1274; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1275; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
1276; GFX8-NEXT:    s_waitcnt vmcnt(1)
1277; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
1278; GFX8-NEXT:    v_bfe_i32 v3, v3, 8, 8
1279; GFX8-NEXT:    s_waitcnt vmcnt(0)
1280; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
1281; GFX8-NEXT:    v_bfe_i32 v0, v0, 8, 8
1282; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1283; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s0
1284; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
1285; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1286; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1287; GFX8-NEXT:    flat_store_dword v[0:1], v2
1288; GFX8-NEXT:    s_endpgm
1289;
1290; GFX9-NODL-LABEL: idot4_acc32_2ele:
1291; GFX9-NODL:       ; %bb.0: ; %entry
1292; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1293; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1294; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1295; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1296; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
1297; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
1298; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
1299; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1300; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1301; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1302; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1303; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1304; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, s0, v1
1305; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
1306; GFX9-NODL-NEXT:    s_endpgm
1307;
1308; GFX9-DL-LABEL: idot4_acc32_2ele:
1309; GFX9-DL:       ; %bb.0: ; %entry
1310; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1311; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1312; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1313; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1314; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
1315; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
1316; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1317; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0100
1318; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1319; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1320; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
1321; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1322; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
1323; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1324; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s0
1325; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
1326; GFX9-DL-NEXT:    s_endpgm
1327;
1328; GFX10-DL-LABEL: idot4_acc32_2ele:
1329; GFX10-DL:       ; %bb.0: ; %entry
1330; GFX10-DL-NEXT:    s_clause 0x1
1331; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1332; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1333; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1334; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
1335; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1336; GFX10-DL-NEXT:    s_clause 0x1
1337; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
1338; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
1339; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
1340; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1341; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1342; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc0c0100
1343; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1344; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc0c0100
1345; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1346; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s0
1347; GFX10-DL-NEXT:    v_dot4c_i32_i8 v2, v1, v0
1348; GFX10-DL-NEXT:    global_store_dword v3, v2, s[6:7]
1349; GFX10-DL-NEXT:    s_endpgm
1350;
1351; GFX11-DL-LABEL: idot4_acc32_2ele:
1352; GFX11-DL:       ; %bb.0: ; %entry
1353; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1354; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1355; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1356; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
1357; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1358; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1359; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
1360; GFX11-DL-NEXT:    s_clause 0x1
1361; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3]
1362; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
1363; GFX11-DL-NEXT:    s_load_b32 s0, s[4:5], 0x0
1364; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
1365; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0100
1366; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
1367; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc0c0100
1368; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
1369; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1370; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
1371; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[4:5]
1372; GFX11-DL-NEXT:    s_endpgm
1373                                       ptr addrspace(1) %src2,
1374                                       ptr addrspace(1) nocapture %dst) {
1375entry:
1376  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1377  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
1378  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
1379  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
1380  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
1381
  ; Lane 0 product.
1382  %v1e0 = extractelement <4 x i8> %vec1, i64 0
1383  %cv1e0 = sext i8 %v1e0 to i32
1384  %v2e0 = extractelement <4 x i8> %vec2, i64 0
1385  %cv2e0 = sext i8 %v2e0 to i32
1386  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
1387
  ; Lane 1 product; lanes 2 and 3 are deliberately never read.
1388  %v1e1 = extractelement <4 x i8> %vec1, i64 1
1389  %cv1e1 = sext i8 %v1e1 to i32
1390  %v2e1 = extractelement <4 x i8> %vec2, i64 1
1391  %cv2e1 = sext i8 %v2e1 to i32
1392  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
1393
  ; The accumulator is read from %dst and the final sum is written back to it.
1394  %acc = load i32, ptr addrspace(1) %dst, align 4
1395  %add1 = add i32 %mul1, %acc
1396  %add2 = add i32 %add1, %mul2
1397  store i32 %add2, ptr addrspace(1) %dst, align 4
1398  ret void
1399}
1400
1401
; Signed 8-bit dot product that uses only three of the four vector lanes.
; Elements 0, 1 and 2 of two <4 x i8> vectors (each loaded at the
; workitem.id.x index) are sign-extended to i32, multiplied pairwise, and
; summed together with the i32 value already stored at %dst; the sum is
; written back to %dst.
; Expected output: the GFX9-DL, GFX10-DL and GFX11-DL checks show a
; v_perm_b32 with selector 0xc020100 on each operand feeding one dot4
; instruction; the remaining prefixes show per-byte bfe/sdwa extraction
; combined with mad/add.
; NOTE(review): selector byte 0x0c presumably zeroes the unused fourth lane -
; confirm against the ISA documentation.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing by hand.
1402define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
1403; GFX7-LABEL: idot4_acc32_3ele:
1404; GFX7:       ; %bb.0: ; %entry
1405; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
1406; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
1407; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1408; GFX7-NEXT:    s_mov_b32 s6, 0
1409; GFX7-NEXT:    s_mov_b32 s7, s3
1410; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1411; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
1412; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1413; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1414; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1415; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
1416; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1417; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1418; GFX7-NEXT:    s_mov_b32 s2, -1
1419; GFX7-NEXT:    s_waitcnt vmcnt(1)
1420; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
1421; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
1422; GFX7-NEXT:    s_waitcnt vmcnt(0)
1423; GFX7-NEXT:    v_bfe_i32 v4, v0, 0, 8
1424; GFX7-NEXT:    v_bfe_i32 v5, v0, 8, 8
1425; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1426; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v4, s4
1427; GFX7-NEXT:    v_bfe_i32 v2, v2, 16, 8
1428; GFX7-NEXT:    v_bfe_i32 v0, v0, 16, 8
1429; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v5, v1
1430; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
1431; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1432; GFX7-NEXT:    s_endpgm
1433;
1434; GFX8-LABEL: idot4_acc32_3ele:
1435; GFX8:       ; %bb.0: ; %entry
1436; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1437; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1438; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1439; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1440; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1441; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1442; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1443; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1444; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1445; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1446; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1447; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1448; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
1449; GFX8-NEXT:    s_waitcnt vmcnt(1)
1450; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
1451; GFX8-NEXT:    v_bfe_i32 v4, v3, 8, 8
1452; GFX8-NEXT:    v_bfe_i32 v3, v3, 16, 8
1453; GFX8-NEXT:    s_waitcnt vmcnt(0)
1454; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
1455; GFX8-NEXT:    v_bfe_i32 v5, v0, 8, 8
1456; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1457; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s0
1458; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 8
1459; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
1460; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
1461; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1462; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1463; GFX8-NEXT:    flat_store_dword v[0:1], v2
1464; GFX8-NEXT:    s_endpgm
1465;
1466; GFX9-NODL-LABEL: idot4_acc32_3ele:
1467; GFX9-NODL:       ; %bb.0: ; %entry
1468; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1469; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1470; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1471; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1472; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
1473; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
1474; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
1475; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1476; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
1477; GFX9-NODL-NEXT:    v_bfe_i32 v3, v1, 0, 8
1478; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1479; GFX9-NODL-NEXT:    v_bfe_i32 v4, v2, 0, 8
1480; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1481; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1482; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1483; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v3, v4, s0
1484; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
1485; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
1486; GFX9-NODL-NEXT:    s_endpgm
1487;
1488; GFX9-DL-LABEL: idot4_acc32_3ele:
1489; GFX9-DL:       ; %bb.0: ; %entry
1490; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1491; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1492; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1493; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1494; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
1495; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
1496; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1497; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc020100
1498; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1499; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1500; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
1501; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1502; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
1503; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1504; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s0
1505; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
1506; GFX9-DL-NEXT:    s_endpgm
1507;
1508; GFX10-DL-LABEL: idot4_acc32_3ele:
1509; GFX10-DL:       ; %bb.0: ; %entry
1510; GFX10-DL-NEXT:    s_clause 0x1
1511; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1512; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1513; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1514; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
1515; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1516; GFX10-DL-NEXT:    s_clause 0x1
1517; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
1518; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
1519; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
1520; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1521; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1522; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc020100
1523; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1524; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc020100
1525; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1526; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s0
1527; GFX10-DL-NEXT:    v_dot4c_i32_i8 v2, v1, v0
1528; GFX10-DL-NEXT:    global_store_dword v3, v2, s[6:7]
1529; GFX10-DL-NEXT:    s_endpgm
1530;
1531; GFX11-DL-LABEL: idot4_acc32_3ele:
1532; GFX11-DL:       ; %bb.0: ; %entry
1533; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1534; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1535; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1536; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
1537; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1538; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1539; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
1540; GFX11-DL-NEXT:    s_clause 0x1
1541; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3]
1542; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
1543; GFX11-DL-NEXT:    s_load_b32 s0, s[4:5], 0x0
1544; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
1545; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc020100
1546; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
1547; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020100
1548; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
1549; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1550; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
1551; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[4:5]
1552; GFX11-DL-NEXT:    s_endpgm
1553                                       ptr addrspace(1) %src2,
1554                                       ptr addrspace(1) nocapture %dst) {
1555entry:
; Each work item loads one <4 x i8> from %src1 and %src2 at index workitem.id.x.
1556  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1557  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
1558  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
1559  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
1560  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
1561
; Product of lane 0: sext(vec1[0]) * sext(vec2[0]).
; NOTE(review): the multiplies carry nuw as well as nsw although both operands
; are sign-extended - confirm the nuw flag is intended.
1562  %v1e0 = extractelement <4 x i8> %vec1, i64 0
1563  %cv1e0 = sext i8 %v1e0 to i32
1564  %v2e0 = extractelement <4 x i8> %vec2, i64 0
1565  %cv2e0 = sext i8 %v2e0 to i32
1566  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
1567
; Product of lane 1.
1568  %v1e1 = extractelement <4 x i8> %vec1, i64 1
1569  %cv1e1 = sext i8 %v1e1 to i32
1570  %v2e1 = extractelement <4 x i8> %vec2, i64 1
1571  %cv2e1 = sext i8 %v2e1 to i32
1572  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
1573
; Product of lane 2; lane 3 is never read.
1574  %v1e2 = extractelement <4 x i8> %vec1, i64 2
1575  %cv1e2 = sext i8 %v1e2 to i32
1576  %v2e2 = extractelement <4 x i8> %vec2, i64 2
1577  %cv2e2 = sext i8 %v2e2 to i32
1578  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
1579
; Accumulate the three products onto the i32 loaded from %dst and store it back.
1580  %acc = load i32, ptr addrspace(1) %dst, align 4
1581  %add1 = add i32 %mul1, %acc
1582  %add2 = add i32 %add1, %mul2
1583  %add3 = add i32 %add2, %mul3
1584  store i32 %add3, ptr addrspace(1) %dst, align 4
1585  ret void
1586}
1587
1588
; Three-lane signed 8-bit dot product whose lanes are taken out of order.
; Elements 3, 0 and 2 (in that order) of two <4 x i8> vectors, each loaded at
; the workitem.id.x index, are sign-extended to i32, multiplied pairwise, and
; summed together with the i32 value already stored at %dst; the sum is
; written back to %dst. Element 1 is never read.
; Expected output: the GFX9-DL, GFX10-DL and GFX11-DL checks show a
; v_perm_b32 with selector 0xc020003 on each operand feeding one dot4
; instruction; the remaining prefixes show per-byte ashr/bfe/sdwa extraction
; combined with mad/add.
; NOTE(review): selector byte 0x0c presumably zeroes the unused lane - confirm
; against the ISA documentation.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing by hand.
1589define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
1590; GFX7-LABEL: idot4_acc32_3ele_permuted:
1591; GFX7:       ; %bb.0: ; %entry
1592; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
1593; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
1594; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1595; GFX7-NEXT:    s_mov_b32 s6, 0
1596; GFX7-NEXT:    s_mov_b32 s7, s3
1597; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1598; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
1599; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1600; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1601; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1602; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
1603; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1604; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1605; GFX7-NEXT:    s_mov_b32 s2, -1
1606; GFX7-NEXT:    s_waitcnt vmcnt(1)
1607; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 24, v2
1608; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
1609; GFX7-NEXT:    s_waitcnt vmcnt(0)
1610; GFX7-NEXT:    v_ashrrev_i32_e32 v4, 24, v0
1611; GFX7-NEXT:    v_bfe_i32 v5, v0, 0, 8
1612; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1613; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v4, s4
1614; GFX7-NEXT:    v_bfe_i32 v2, v2, 16, 8
1615; GFX7-NEXT:    v_bfe_i32 v0, v0, 16, 8
1616; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v5, v1
1617; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
1618; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1619; GFX7-NEXT:    s_endpgm
1620;
1621; GFX8-LABEL: idot4_acc32_3ele_permuted:
1622; GFX8:       ; %bb.0: ; %entry
1623; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1624; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1625; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1626; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1627; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1628; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1629; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1630; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1631; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1632; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1633; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1634; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1635; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
1636; GFX8-NEXT:    s_waitcnt vmcnt(1)
1637; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 24, v3
1638; GFX8-NEXT:    v_bfe_i32 v4, v3, 0, 8
1639; GFX8-NEXT:    v_bfe_i32 v3, v3, 16, 8
1640; GFX8-NEXT:    s_waitcnt vmcnt(0)
1641; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 24, v0
1642; GFX8-NEXT:    v_bfe_i32 v5, v0, 0, 8
1643; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1644; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s0
1645; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 8
1646; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
1647; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
1648; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1649; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1650; GFX8-NEXT:    flat_store_dword v[0:1], v2
1651; GFX8-NEXT:    s_endpgm
1652;
1653; GFX9-NODL-LABEL: idot4_acc32_3ele_permuted:
1654; GFX9-NODL:       ; %bb.0: ; %entry
1655; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1656; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1657; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1658; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1659; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
1660; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
1661; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
1662; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1663; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
1664; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v3, 24, v1
1665; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1666; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v4, 24, v2
1667; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1668; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1669; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1670; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v3, v4, s0
1671; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
1672; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
1673; GFX9-NODL-NEXT:    s_endpgm
1674;
1675; GFX9-DL-LABEL: idot4_acc32_3ele_permuted:
1676; GFX9-DL:       ; %bb.0: ; %entry
1677; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1678; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1679; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1680; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1681; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
1682; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
1683; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1684; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc020003
1685; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1686; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1687; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
1688; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1689; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
1690; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1691; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s0
1692; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
1693; GFX9-DL-NEXT:    s_endpgm
1694;
1695; GFX10-DL-LABEL: idot4_acc32_3ele_permuted:
1696; GFX10-DL:       ; %bb.0: ; %entry
1697; GFX10-DL-NEXT:    s_clause 0x1
1698; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1699; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1700; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1701; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
1702; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1703; GFX10-DL-NEXT:    s_clause 0x1
1704; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
1705; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
1706; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
1707; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
1708; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1709; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc020003
1710; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1711; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc020003
1712; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1713; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s0
1714; GFX10-DL-NEXT:    v_dot4c_i32_i8 v2, v1, v0
1715; GFX10-DL-NEXT:    global_store_dword v3, v2, s[6:7]
1716; GFX10-DL-NEXT:    s_endpgm
1717;
1718; GFX11-DL-LABEL: idot4_acc32_3ele_permuted:
1719; GFX11-DL:       ; %bb.0: ; %entry
1720; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1721; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1722; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1723; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
1724; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1725; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1726; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
1727; GFX11-DL-NEXT:    s_clause 0x1
1728; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3]
1729; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
1730; GFX11-DL-NEXT:    s_load_b32 s0, s[4:5], 0x0
1731; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
1732; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc020003
1733; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
1734; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020003
1735; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
1736; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1737; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
1738; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[4:5]
1739; GFX11-DL-NEXT:    s_endpgm
1740                                       ptr addrspace(1) %src2,
1741                                       ptr addrspace(1) nocapture %dst) {
1742entry:
; Each work item loads one <4 x i8> from %src1 and %src2 at index workitem.id.x.
1743  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1744  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
1745  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
1746  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
1747  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
1748
; First product uses element 3 of both vectors; note the value names end in
; "e0" even though the extracted index is 3.
; NOTE(review): the multiplies carry nuw as well as nsw although both operands
; are sign-extended - confirm the nuw flag is intended.
1749  %v1e0 = extractelement <4 x i8> %vec1, i64 3
1750  %cv1e0 = sext i8 %v1e0 to i32
1751  %v2e0 = extractelement <4 x i8> %vec2, i64 3
1752  %cv2e0 = sext i8 %v2e0 to i32
1753  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
1754
; Second product uses element 0 (value names end in "e1").
1755  %v1e1 = extractelement <4 x i8> %vec1, i64 0
1756  %cv1e1 = sext i8 %v1e1 to i32
1757  %v2e1 = extractelement <4 x i8> %vec2, i64 0
1758  %cv2e1 = sext i8 %v2e1 to i32
1759  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
1760
; Third product uses element 2.
1761  %v1e2 = extractelement <4 x i8> %vec1, i64 2
1762  %cv1e2 = sext i8 %v1e2 to i32
1763  %v2e2 = extractelement <4 x i8> %vec2, i64 2
1764  %cv2e2 = sext i8 %v2e2 to i32
1765  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
1766
; Accumulate the three products onto the i32 loaded from %dst and store it back.
1767  %acc = load i32, ptr addrspace(1) %dst, align 4
1768  %add1 = add i32 %mul1, %acc
1769  %add2 = add i32 %add1, %mul2
1770  %add3 = add i32 %add2, %mul3
1771  store i32 %add3, ptr addrspace(1) %dst, align 4
1772  ret void
1773}
1774
; Full four-lane signed 8-bit dot product with no incoming accumulator.
; All four elements of two <4 x i8> vectors, each loaded at the workitem.id.x
; index, are sign-extended to i32 and multiplied pairwise; the sum of the four
; products is stored to %dst. Unlike the accumulating variants, %dst is only
; written, never loaded.
; Expected output: the GFX9-DL, GFX10-DL and GFX11-DL checks show the loaded
; dwords passed straight to one dot4 instruction with a 0 accumulator and no
; v_perm_b32; the remaining prefixes show per-byte bfe/ashr/sdwa extraction
; combined with mul/mad/add.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing by hand.
1775define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
1776; GFX7-LABEL: idot4_acc32_opt:
1777; GFX7:       ; %bb.0: ; %entry
1778; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
1779; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
1780; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1781; GFX7-NEXT:    s_mov_b32 s6, 0
1782; GFX7-NEXT:    s_mov_b32 s7, s3
1783; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1784; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
1785; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1786; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1787; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1788; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
1789; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1790; GFX7-NEXT:    s_mov_b32 s2, -1
1791; GFX7-NEXT:    s_waitcnt vmcnt(1)
1792; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
1793; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
1794; GFX7-NEXT:    s_waitcnt vmcnt(0)
1795; GFX7-NEXT:    v_bfe_i32 v6, v0, 8, 8
1796; GFX7-NEXT:    v_bfe_i32 v5, v0, 0, 8
1797; GFX7-NEXT:    v_mul_i32_i24_e32 v3, v3, v6
1798; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 8
1799; GFX7-NEXT:    v_bfe_i32 v7, v0, 16, 8
1800; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v5, v3
1801; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
1802; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
1803; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v7, v1
1804; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
1805; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1806; GFX7-NEXT:    s_endpgm
1807;
1808; GFX8-LABEL: idot4_acc32_opt:
1809; GFX8:       ; %bb.0: ; %entry
1810; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1811; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1812; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1813; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1814; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1815; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1816; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1817; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1818; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1819; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1820; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1821; GFX8-NEXT:    flat_load_dword v2, v[0:1]
1822; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1823; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1824; GFX8-NEXT:    s_waitcnt vmcnt(1)
1825; GFX8-NEXT:    v_bfe_i32 v4, v3, 0, 8
1826; GFX8-NEXT:    v_bfe_i32 v7, v3, 16, 8
1827; GFX8-NEXT:    s_waitcnt vmcnt(0)
1828; GFX8-NEXT:    v_bfe_i32 v5, v2, 0, 8
1829; GFX8-NEXT:    v_mul_i32_i24_sdwa v6, sext(v3), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1830; GFX8-NEXT:    v_bfe_i32 v8, v2, 16, 8
1831; GFX8-NEXT:    v_mad_i32_i24 v4, v4, v5, v6
1832; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
1833; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
1834; GFX8-NEXT:    v_mad_i32_i24 v4, v7, v8, v4
1835; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v2, v4
1836; GFX8-NEXT:    flat_store_dword v[0:1], v2
1837; GFX8-NEXT:    s_endpgm
1838;
1839; GFX9-NODL-LABEL: idot4_acc32_opt:
1840; GFX9-NODL:       ; %bb.0: ; %entry
1841; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1842; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1843; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1844; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1845; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
1846; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
1847; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1848; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
1849; GFX9-NODL-NEXT:    v_bfe_i32 v3, v1, 0, 8
1850; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1851; GFX9-NODL-NEXT:    v_bfe_i32 v4, v2, 0, 8
1852; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1853; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1854; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1855; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v3, v4, v5
1856; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
1857; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
1858; GFX9-NODL-NEXT:    s_endpgm
1859;
1860; GFX9-DL-LABEL: idot4_acc32_opt:
1861; GFX9-DL:       ; %bb.0: ; %entry
1862; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1863; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1864; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1865; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1866; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1867; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1868; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1869; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1870; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, 0
1871; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
1872; GFX9-DL-NEXT:    s_endpgm
1873;
1874; GFX10-DL-LABEL: idot4_acc32_opt:
1875; GFX10-DL:       ; %bb.0: ; %entry
1876; GFX10-DL-NEXT:    s_clause 0x1
1877; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1878; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1879; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1880; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
1881; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1882; GFX10-DL-NEXT:    s_clause 0x1
1883; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
1884; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
1885; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1886; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1887; GFX10-DL-NEXT:    v_dot4c_i32_i8 v0, v1, v2
1888; GFX10-DL-NEXT:    global_store_dword v3, v0, s[6:7]
1889; GFX10-DL-NEXT:    s_endpgm
1890;
1891; GFX11-DL-LABEL: idot4_acc32_opt:
1892; GFX11-DL:       ; %bb.0: ; %entry
1893; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1894; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1895; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1896; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
1897; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1898; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1899; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
1900; GFX11-DL-NEXT:    s_clause 0x1
1901; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
1902; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
1903; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
1904; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, 0 neg_lo:[1,1,0]
1905; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[4:5]
1906; GFX11-DL-NEXT:    s_endpgm
1907                                       ptr addrspace(1) %src2,
1908                                       ptr addrspace(1) nocapture %dst) {
1909entry:
; Each work item loads one <4 x i8> from %src1 and %src2 at index workitem.id.x.
1910  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1911  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
1912  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
1913  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
1914  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
1915
; Product of lane 0: sext(vec1[0]) * sext(vec2[0]).
; NOTE(review): the multiplies carry nuw as well as nsw although both operands
; are sign-extended - confirm the nuw flag is intended.
1916  %v1e0 = extractelement <4 x i8> %vec1, i64 0
1917  %cv1e0 = sext i8 %v1e0 to i32
1918  %v2e0 = extractelement <4 x i8> %vec2, i64 0
1919  %cv2e0 = sext i8 %v2e0 to i32
1920  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
1921
; Product of lane 1.
1922  %v1e1 = extractelement <4 x i8> %vec1, i64 1
1923  %cv1e1 = sext i8 %v1e1 to i32
1924  %v2e1 = extractelement <4 x i8> %vec2, i64 1
1925  %cv2e1 = sext i8 %v2e1 to i32
1926  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
1927
; Product of lane 2.
1928  %v1e2 = extractelement <4 x i8> %vec1, i64 2
1929  %cv1e2 = sext i8 %v1e2 to i32
1930  %v2e2 = extractelement <4 x i8> %vec2, i64 2
1931  %cv2e2 = sext i8 %v2e2 to i32
1932  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
1933
; Product of lane 3.
1934  %v1e3 = extractelement <4 x i8> %vec1, i64 3
1935  %cv1e3 = sext i8 %v1e3 to i32
1936  %v2e3 = extractelement <4 x i8> %vec2, i64 3
1937  %cv2e3 = sext i8 %v2e3 to i32
1938  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
1939
; Sum the four products (no prior accumulator value) and store the result.
1940  %add2 = add i32 %mul1, %mul2
1941  %add3 = add i32 %add2, %mul3
1942  %add4 = add i32 %add3, %mul4
1943  store i32 %add4, ptr addrspace(1) %dst, align 4
1944  ret void
1945}
1946
1947define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
1948; GFX7-LABEL: idot4_acc32_3src:
1949; GFX7:       ; %bb.0: ; %entry
1950; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
1951; GFX7-NEXT:    s_mov_b32 s11, 0xf000
1952; GFX7-NEXT:    s_mov_b32 s14, 0
1953; GFX7-NEXT:    s_mov_b32 s15, s11
1954; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1955; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1956; GFX7-NEXT:    s_mov_b64 s[12:13], s[0:1]
1957; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1958; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
1959; GFX7-NEXT:    s_mov_b64 s[12:13], s[2:3]
1960; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
1961; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
1962; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
1963; GFX7-NEXT:    s_load_dword s0, s[6:7], 0x0
1964; GFX7-NEXT:    s_mov_b32 s10, -1
1965; GFX7-NEXT:    s_mov_b32 s8, s6
1966; GFX7-NEXT:    s_mov_b32 s9, s7
1967; GFX7-NEXT:    s_waitcnt vmcnt(2)
1968; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
1969; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 8
1970; GFX7-NEXT:    s_waitcnt vmcnt(1)
1971; GFX7-NEXT:    v_bfe_i32 v3, v3, 8, 8
1972; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1973; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v1, s0
1974; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 8
1975; GFX7-NEXT:    s_waitcnt vmcnt(0)
1976; GFX7-NEXT:    v_bfe_i32 v6, v0, 16, 8
1977; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v3, v1
1978; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
1979; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
1980; GFX7-NEXT:    v_mad_i32_i24 v1, v5, v6, v1
1981; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
1982; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0
1983; GFX7-NEXT:    s_endpgm
1984;
1985; GFX8-LABEL: idot4_acc32_3src:
1986; GFX8:       ; %bb.0: ; %entry
1987; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
1988; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1989; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1990; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1991; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1992; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1993; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1994; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1995; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1996; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1997; GFX8-NEXT:    flat_load_dword v4, v[0:1]
1998; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1999; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2000; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2001; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2002; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
2003; GFX8-NEXT:    s_waitcnt vmcnt(2)
2004; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
2005; GFX8-NEXT:    v_bfe_i32 v2, v3, 8, 8
2006; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2007; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v1, s0
2008; GFX8-NEXT:    v_bfe_i32 v5, v3, 16, 8
2009; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
2010; GFX8-NEXT:    s_waitcnt vmcnt(1)
2011; GFX8-NEXT:    v_bfe_i32 v4, v4, 8, 8
2012; GFX8-NEXT:    v_mad_i32_i24 v1, v2, v4, v1
2013; GFX8-NEXT:    s_waitcnt vmcnt(0)
2014; GFX8-NEXT:    v_bfe_i32 v6, v0, 16, 8
2015; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
2016; GFX8-NEXT:    v_mad_i32_i24 v1, v5, v6, v1
2017; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
2018; GFX8-NEXT:    v_mov_b32_e32 v0, s6
2019; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2020; GFX8-NEXT:    flat_store_dword v[0:1], v2
2021; GFX8-NEXT:    s_endpgm
2022;
2023; GFX9-NODL-LABEL: idot4_acc32_3src:
2024; GFX9-NODL:       ; %bb.0: ; %entry
2025; GFX9-NODL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
2026; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2027; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2028; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[8:9]
2029; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[10:11]
2030; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[12:13]
2031; GFX9-NODL-NEXT:    s_load_dword s0, s[14:15], 0x0
2032; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2033; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
2034; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2035; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
2036; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v2, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2037; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2038; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
2039; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
2040; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2041; GFX9-NODL-NEXT:    v_add3_u32 v2, v4, s0, v2
2042; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
2043; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[14:15]
2044; GFX9-NODL-NEXT:    s_endpgm
2045;
2046; GFX9-DL-LABEL: idot4_acc32_3src:
2047; GFX9-DL:       ; %bb.0: ; %entry
2048; GFX9-DL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
2049; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2050; GFX9-DL-NEXT:    s_mov_b32 s0, 0x706010c
2051; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc0c0c00
2052; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2053; GFX9-DL-NEXT:    global_load_dword v1, v0, s[10:11]
2054; GFX9-DL-NEXT:    global_load_dword v2, v0, s[12:13]
2055; GFX9-DL-NEXT:    global_load_dword v3, v0, s[8:9]
2056; GFX9-DL-NEXT:    s_load_dword s1, s[14:15], 0x0
2057; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2058; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2059; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s0
2060; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2061; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
2062; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
2063; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2064; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v3, v1, s1
2065; GFX9-DL-NEXT:    global_store_dword v0, v1, s[14:15]
2066; GFX9-DL-NEXT:    s_endpgm
2067;
2068; GFX10-DL-LABEL: idot4_acc32_3src:
2069; GFX10-DL:       ; %bb.0: ; %entry
2070; GFX10-DL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
2071; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2072; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2073; GFX10-DL-NEXT:    s_clause 0x2
2074; GFX10-DL-NEXT:    global_load_dword v1, v0, s[10:11]
2075; GFX10-DL-NEXT:    global_load_dword v2, v0, s[12:13]
2076; GFX10-DL-NEXT:    global_load_dword v3, v0, s[8:9]
2077; GFX10-DL-NEXT:    s_load_dword s0, s[14:15], 0x0
2078; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2079; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0x706010c
2080; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2081; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc0c0c00
2082; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
2083; GFX10-DL-NEXT:    v_or_b32_e32 v0, v0, v1
2084; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2085; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s0
2086; GFX10-DL-NEXT:    v_dot4c_i32_i8 v1, v3, v0
2087; GFX10-DL-NEXT:    global_store_dword v2, v1, s[14:15]
2088; GFX10-DL-NEXT:    s_endpgm
2089;
2090; GFX11-DL-LABEL: idot4_acc32_3src:
2091; GFX11-DL:       ; %bb.0: ; %entry
2092; GFX11-DL-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
2093; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2094; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2095; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2096; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
2097; GFX11-DL-NEXT:    s_clause 0x2
2098; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3]
2099; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
2100; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
2101; GFX11-DL-NEXT:    s_load_b32 s0, s[6:7], 0x0
2102; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
2103; GFX11-DL-NEXT:    v_perm_b32 v1, v2, v1, 0x706010c
2104; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
2105; GFX11-DL-NEXT:    v_perm_b32 v2, v0, v0, 0xc0c0c00
2106; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
2107; GFX11-DL-NEXT:    v_or_b32_e32 v1, v1, v2
2108; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
2109; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
2110; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
2111; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
2112; GFX11-DL-NEXT:    s_endpgm
2113                                       ptr addrspace(1) %src2,
2114                                       ptr addrspace(1) %src3,
2115                                       ptr addrspace(1) nocapture %dst) {
2116entry:
2117  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2118  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
2119  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
2120  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
2121  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
2122  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
2123  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
2124
2125  %v1e0 = extractelement <4 x i8> %vec1, i64 0
2126  %cv1e0 = sext i8 %v1e0 to i32
2127  %mul1 = mul nuw nsw i32 %cv1e0, %cv1e0
2128
2129  %v1e1 = extractelement <4 x i8> %vec1, i64 1
2130  %cv1e1 = sext i8 %v1e1 to i32
2131  %v2e1 = extractelement <4 x i8> %vec2, i64 1
2132  %cv2e1 = sext i8 %v2e1 to i32
2133  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
2134
2135  %v1e2 = extractelement <4 x i8> %vec1, i64 2
2136  %cv1e2 = sext i8 %v1e2 to i32
2137  %v3e2 = extractelement <4 x i8> %vec3, i64 2
2138  %cv3e2 = sext i8 %v3e2 to i32
2139  %mul3 = mul nuw nsw i32 %cv1e2, %cv3e2
2140
2141  %v1e3 = extractelement <4 x i8> %vec1, i64 3
2142  %cv1e3 = sext i8 %v1e3 to i32
2143  %v3e3 = extractelement <4 x i8> %vec3, i64 3
2144  %cv3e3 = sext i8 %v3e3 to i32
2145  %mul4 = mul nuw nsw i32 %cv1e3, %cv3e3
2146
2147  %acc = load i32, ptr addrspace(1) %dst, align 4
2148  %add1 = add i32 %mul1, %acc
2149  %add2 = add i32 %add1, %mul2
2150  %add3 = add i32 %add2, %mul3
2151  %add4 = add i32 %add3, %mul4
2152  store i32 %add4, ptr addrspace(1) %dst, align 4
2153  ret void
2154}
2155
; Three-term signed byte dot product whose factors come from three different
; vectors. The value stored to %dst is
;   vec1[0]*vec1[0] + vec1[1]*vec2[1] + vec1[2]*vec3[2] + (i32 loaded from %dst)
; Element 3 of every vector is left unused. In the DL-suffixed check blocks
; below the sources are merged with v_perm_b32 / v_or_b32 and fed to one v_dot4
; instruction; the remaining check blocks use v_mad_i32_i24 (plus SDWA
; multiplies and v_add3_u32 in the GFX9-NODL block).
2156define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
2157; GFX7-LABEL: idot4_acc32_3src_3ele:
2158; GFX7:       ; %bb.0: ; %entry
2159; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
2160; GFX7-NEXT:    s_mov_b32 s11, 0xf000
2161; GFX7-NEXT:    s_mov_b32 s14, 0
2162; GFX7-NEXT:    s_mov_b32 s15, s11
2163; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2164; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2165; GFX7-NEXT:    s_mov_b64 s[12:13], s[0:1]
2166; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2167; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
2168; GFX7-NEXT:    s_mov_b64 s[12:13], s[2:3]
2169; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
2170; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
2171; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
2172; GFX7-NEXT:    s_load_dword s0, s[6:7], 0x0
2173; GFX7-NEXT:    s_mov_b32 s10, -1
2174; GFX7-NEXT:    s_mov_b32 s8, s6
2175; GFX7-NEXT:    s_mov_b32 s9, s7
2176; GFX7-NEXT:    s_waitcnt vmcnt(2)
2177; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
2178; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 8
2179; GFX7-NEXT:    s_waitcnt vmcnt(1)
2180; GFX7-NEXT:    v_bfe_i32 v3, v3, 8, 8
2181; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2182; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v1, s0
2183; GFX7-NEXT:    v_bfe_i32 v2, v2, 16, 8
2184; GFX7-NEXT:    s_waitcnt vmcnt(0)
2185; GFX7-NEXT:    v_bfe_i32 v0, v0, 16, 8
2186; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v3, v1
2187; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
2188; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0
2189; GFX7-NEXT:    s_endpgm
2190;
2191; GFX8-LABEL: idot4_acc32_3src_3ele:
2192; GFX8:       ; %bb.0: ; %entry
2193; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
2194; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2195; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2196; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2197; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2198; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2199; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2200; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2201; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2202; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2203; GFX8-NEXT:    flat_load_dword v4, v[0:1]
2204; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2205; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2206; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2207; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2208; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
2209; GFX8-NEXT:    s_waitcnt vmcnt(2)
2210; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
2211; GFX8-NEXT:    v_bfe_i32 v2, v3, 8, 8
2212; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2213; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v1, s0
2214; GFX8-NEXT:    v_bfe_i32 v3, v3, 16, 8
2215; GFX8-NEXT:    s_waitcnt vmcnt(1)
2216; GFX8-NEXT:    v_bfe_i32 v4, v4, 8, 8
2217; GFX8-NEXT:    v_mad_i32_i24 v1, v2, v4, v1
2218; GFX8-NEXT:    s_waitcnt vmcnt(0)
2219; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 8
2220; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
2221; GFX8-NEXT:    v_mov_b32_e32 v0, s6
2222; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2223; GFX8-NEXT:    flat_store_dword v[0:1], v2
2224; GFX8-NEXT:    s_endpgm
2225;
2226; GFX9-NODL-LABEL: idot4_acc32_3src_3ele:
2227; GFX9-NODL:       ; %bb.0: ; %entry
2228; GFX9-NODL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
2229; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2230; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2231; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[8:9]
2232; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[10:11]
2233; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[12:13]
2234; GFX9-NODL-NEXT:    s_load_dword s0, s[14:15], 0x0
2235; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2236; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
2237; GFX9-NODL-NEXT:    v_bfe_i32 v4, v1, 0, 8
2238; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
2239; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v2, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2240; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2241; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
2242; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2243; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, v4, v4, s0
2244; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, v2, v1
2245; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[14:15]
2246; GFX9-NODL-NEXT:    s_endpgm
2247;
2248; GFX9-DL-LABEL: idot4_acc32_3src_3ele:
2249; GFX9-DL:       ; %bb.0: ; %entry
2250; GFX9-DL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
2251; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2252; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc06010c
2253; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0c00
2254; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc020100
2255; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2256; GFX9-DL-NEXT:    global_load_dword v1, v0, s[10:11]
2257; GFX9-DL-NEXT:    global_load_dword v2, v0, s[12:13]
2258; GFX9-DL-NEXT:    global_load_dword v3, v0, s[8:9]
2259; GFX9-DL-NEXT:    s_load_dword s3, s[14:15], 0x0
2260; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2261; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2262; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s0
2263; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2264; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s1
2265; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
2266; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
2267; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2268; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s3
2269; GFX9-DL-NEXT:    global_store_dword v0, v1, s[14:15]
2270; GFX9-DL-NEXT:    s_endpgm
2271;
2272; GFX10-DL-LABEL: idot4_acc32_3src_3ele:
2273; GFX10-DL:       ; %bb.0: ; %entry
2274; GFX10-DL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
2275; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2276; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2277; GFX10-DL-NEXT:    s_clause 0x2
2278; GFX10-DL-NEXT:    global_load_dword v1, v0, s[10:11]
2279; GFX10-DL-NEXT:    global_load_dword v2, v0, s[12:13]
2280; GFX10-DL-NEXT:    global_load_dword v3, v0, s[8:9]
2281; GFX10-DL-NEXT:    s_load_dword s0, s[14:15], 0x0
2282; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2283; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0xc06010c
2284; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2285; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc0c0c00
2286; GFX10-DL-NEXT:    v_perm_b32 v2, v3, v3, 0xc020100
2287; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
2288; GFX10-DL-NEXT:    v_or_b32_e32 v0, v0, v1
2289; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2290; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s0
2291; GFX10-DL-NEXT:    v_dot4c_i32_i8 v1, v2, v0
2292; GFX10-DL-NEXT:    global_store_dword v3, v1, s[14:15]
2293; GFX10-DL-NEXT:    s_endpgm
2294;
2295; GFX11-DL-LABEL: idot4_acc32_3src_3ele:
2296; GFX11-DL:       ; %bb.0: ; %entry
2297; GFX11-DL-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
2298; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2299; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2300; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2301; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
2302; GFX11-DL-NEXT:    s_clause 0x2
2303; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3]
2304; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
2305; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
2306; GFX11-DL-NEXT:    s_load_b32 s0, s[6:7], 0x0
2307; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
2308; GFX11-DL-NEXT:    v_perm_b32 v1, v2, v1, 0xc06010c
2309; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
2310; GFX11-DL-NEXT:    v_perm_b32 v2, v0, v0, 0xc0c0c00
2311; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020100
2312; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
2313; GFX11-DL-NEXT:    v_or_b32_e32 v1, v1, v2
2314; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
2315; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
2316; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
2317; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
2318; GFX11-DL-NEXT:    s_endpgm
2319                                       ptr addrspace(1) %src2,
2320                                       ptr addrspace(1) %src3,
2321                                       ptr addrspace(1) nocapture %dst) {
2322entry:
  ; Each work-item loads one <4 x i8> from every source array, indexed by its
  ; x work-item id.
2323  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2324  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
2325  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
2326  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
2327  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
2328  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
2329  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
2330
  ; Term 1 squares element 0 of %vec1 (both mul operands are %cv1e0).
2331  %v1e0 = extractelement <4 x i8> %vec1, i64 0
2332  %cv1e0 = sext i8 %v1e0 to i32
2333  %mul1 = mul nuw nsw i32 %cv1e0, %cv1e0
2334
  ; Term 2 pairs element 1 of %vec1 with element 1 of %vec2.
2335  %v1e1 = extractelement <4 x i8> %vec1, i64 1
2336  %cv1e1 = sext i8 %v1e1 to i32
2337  %v2e1 = extractelement <4 x i8> %vec2, i64 1
2338  %cv2e1 = sext i8 %v2e1 to i32
2339  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
2340
  ; Term 3 pairs element 2 of %vec1 with element 2 of %vec3.
2341  %v1e2 = extractelement <4 x i8> %vec1, i64 2
2342  %cv1e2 = sext i8 %v1e2 to i32
2343  %v3e2 = extractelement <4 x i8> %vec3, i64 2
2344  %cv3e2 = sext i8 %v3e2 to i32
2345  %mul3 = mul nuw nsw i32 %cv1e2, %cv3e2
2346
  ; Add the three terms to the i32 currently held at %dst and store the sum back.
2347  %acc = load i32, ptr addrspace(1) %dst, align 4
2348  %add1 = add i32 %mul1, %acc
2349  %add2 = add i32 %add1, %mul2
2350  %add3 = add i32 %add2, %mul3
2351  store i32 %add3, ptr addrspace(1) %dst, align 4
2352  ret void
2353}
2354
; Dot-product chain in which one factor is not a vector byte. The first
; product multiplies vec1[0] by the sign-extended i16 kernel argument
; %badsource; the other two products are vec1[1]*vec2[1] and vec1[2]*vec2[2].
; The sum is added to the i32 loaded from %dst and stored back.
; In the DL-suffixed check blocks below the first product stays a separate
; v_mad_i32_i24 whose result feeds the v_dot4 instruction, and both vector
; sources go through v_perm_b32 with the same selector constant.
2355define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1,
2356; GFX7-LABEL: idot4_bad_source:
2357; GFX7:       ; %bb.0: ; %entry
2358; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2359; GFX7-NEXT:    s_load_dword s12, s[4:5], 0xf
2360; GFX7-NEXT:    s_mov_b32 s7, 0xf000
2361; GFX7-NEXT:    s_mov_b32 s10, 0
2362; GFX7-NEXT:    s_mov_b32 s11, s7
2363; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2364; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
2365; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2366; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2367; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2368; GFX7-NEXT:    s_mov_b64 s[8:9], s[2:3]
2369; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2370; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x11
2371; GFX7-NEXT:    s_sext_i32_i16 s1, s12
2372; GFX7-NEXT:    s_mov_b32 s6, -1
2373; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2374; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
2375; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2376; GFX7-NEXT:    v_mov_b32_e32 v1, s0
2377; GFX7-NEXT:    s_waitcnt vmcnt(1)
2378; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
2379; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 8
2380; GFX7-NEXT:    s_waitcnt vmcnt(0)
2381; GFX7-NEXT:    v_bfe_i32 v5, v0, 8, 8
2382; GFX7-NEXT:    v_mad_i32_i24 v1, v3, s1, v1
2383; GFX7-NEXT:    v_bfe_i32 v2, v2, 16, 8
2384; GFX7-NEXT:    v_bfe_i32 v0, v0, 16, 8
2385; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
2386; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
2387; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2388; GFX7-NEXT:    s_endpgm
2389;
2390; GFX8-LABEL: idot4_bad_source:
2391; GFX8:       ; %bb.0: ; %entry
2392; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2393; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x3c
2394; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2395; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2396; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2397; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2398; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2399; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2400; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2401; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2402; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2403; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2404; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
2405; GFX8-NEXT:    s_sext_i32_i16 s3, s6
2406; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2407; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
2408; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2409; GFX8-NEXT:    v_mov_b32_e32 v1, s2
2410; GFX8-NEXT:    s_waitcnt vmcnt(1)
2411; GFX8-NEXT:    v_bfe_i32 v2, v3, 0, 8
2412; GFX8-NEXT:    v_bfe_i32 v4, v3, 8, 8
2413; GFX8-NEXT:    v_mad_i32_i24 v1, v2, s3, v1
2414; GFX8-NEXT:    v_bfe_i32 v3, v3, 16, 8
2415; GFX8-NEXT:    s_waitcnt vmcnt(0)
2416; GFX8-NEXT:    v_bfe_i32 v5, v0, 8, 8
2417; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 8
2418; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
2419; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
2420; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2421; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2422; GFX8-NEXT:    flat_store_dword v[0:1], v2
2423; GFX8-NEXT:    s_endpgm
2424;
2425; GFX9-NODL-LABEL: idot4_bad_source:
2426; GFX9-NODL:       ; %bb.0: ; %entry
2427; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2428; GFX9-NODL-NEXT:    s_load_dword s6, s[4:5], 0x3c
2429; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2430; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2431; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
2432; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
2433; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
2434; GFX9-NODL-NEXT:    s_sext_i32_i16 s3, s6
2435; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2436; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2437; GFX9-NODL-NEXT:    s_load_dword s2, s[0:1], 0x0
2438; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
2439; GFX9-NODL-NEXT:    v_bfe_i32 v3, v1, 0, 8
2440; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2441; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2442; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
2443; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2444; GFX9-NODL-NEXT:    v_mov_b32_e32 v2, s2
2445; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v3, s3, v2
2446; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v4, v1
2447; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
2448; GFX9-NODL-NEXT:    s_endpgm
2449;
2450; GFX9-DL-LABEL: idot4_bad_source:
2451; GFX9-DL:       ; %bb.0: ; %entry
2452; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2453; GFX9-DL-NEXT:    s_load_dword s6, s[4:5], 0x3c
2454; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2455; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2456; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2457; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2458; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
2459; GFX9-DL-NEXT:    s_mov_b32 s3, 0xc0c0201
2460; GFX9-DL-NEXT:    s_sext_i32_i16 s4, s6
2461; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2462; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2463; GFX9-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
2464; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2465; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
2466; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2467; GFX9-DL-NEXT:    v_bfe_i32 v4, v1, 0, 8
2468; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2469; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s3
2470; GFX9-DL-NEXT:    v_mad_i32_i24 v3, v4, s4, v3
2471; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s3
2472; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, v3
2473; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
2474; GFX9-DL-NEXT:    s_endpgm
2475;
2476; GFX10-DL-LABEL: idot4_bad_source:
2477; GFX10-DL:       ; %bb.0: ; %entry
2478; GFX10-DL-NEXT:    s_clause 0x1
2479; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2480; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x3c
2481; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2482; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
2483; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2484; GFX10-DL-NEXT:    s_clause 0x1
2485; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
2486; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
2487; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
2488; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
2489; GFX10-DL-NEXT:    s_sext_i32_i16 s3, s6
2490; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2491; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
2492; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2493; GFX10-DL-NEXT:    v_bfe_i32 v0, v1, 0, 8
2494; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2495; GFX10-DL-NEXT:    v_perm_b32 v2, v2, v2, 0xc0c0201
2496; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0201
2497; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2498; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v0, s3, s2
2499; GFX10-DL-NEXT:    v_dot4c_i32_i8 v0, v1, v2
2500; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
2501; GFX10-DL-NEXT:    s_endpgm
2502;
2503; GFX11-DL-LABEL: idot4_bad_source:
2504; GFX11-DL:       ; %bb.0: ; %entry
2505; GFX11-DL-NEXT:    s_clause 0x1
2506; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2507; GFX11-DL-NEXT:    s_load_b32 s6, s[4:5], 0x3c
2508; GFX11-DL-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
2509; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2510; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2511; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
2512; GFX11-DL-NEXT:    s_clause 0x1
2513; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
2514; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
2515; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
2516; GFX11-DL-NEXT:    s_sext_i32_i16 s3, s6
2517; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
2518; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
2519; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
2520; GFX11-DL-NEXT:    v_bfe_i32 v2, v1, 0, 8
2521; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
2522; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc0c0201
2523; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0201
2524; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
2525; GFX11-DL-NEXT:    v_mad_i32_i24 v2, v2, s3, s2
2526; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2527; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0]
2528; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[0:1]
2529; GFX11-DL-NEXT:    s_endpgm
2530                                       ptr addrspace(1) %src2,
2531                                       ptr addrspace(1) %src3,
2532                                       i16 %badsource,
2533                                       ptr addrspace(1) nocapture %dst) {
2534entry:
  ; Per-work-item loads of one <4 x i8> from each source array. %vec3 is loaded
  ; here but no later instruction in this function reads it.
2535  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2536  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
2537  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
2538  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
2539  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
2540  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
2541  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
2542
  ; Term 1 multiplies element 0 of %vec1 by the extended i16 argument.
  ; %v2e0 is extracted but has no users; %other takes its place in the mul.
2543  %v1e0 = extractelement <4 x i8> %vec1, i64 0
2544  %cv1e0 = sext i8 %v1e0 to i32
2545  %v2e0 = extractelement <4 x i8> %vec2, i64 0
2546  %other = sext i16 %badsource to i32
2547  %mul1 = mul nuw nsw i32 %cv1e0, %other
2548
  ; Term 2 pairs element 1 of %vec1 with element 1 of %vec2.
2549  %v1e1 = extractelement <4 x i8> %vec1, i64 1
2550  %cv1e1 = sext i8 %v1e1 to i32
2551  %v2e1 = extractelement <4 x i8> %vec2, i64 1
2552  %cv2e1 = sext i8 %v2e1 to i32
2553  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
2554
  ; Term 3 pairs element 2 of %vec1 with element 2 of %vec2.
2555  %v2e2 = extractelement <4 x i8> %vec2, i64 2
2556  %cv2e2 = sext i8 %v2e2 to i32
2557  %v1e2 = extractelement <4 x i8> %vec1, i64 2
2558  %cv1e2 = sext i8 %v1e2 to i32
2559  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
2560
2561
  ; Add the three terms to the i32 currently held at %dst and store the sum back.
2562  %acc = load i32, ptr addrspace(1) %dst, align 4
2563  %mad1 = add i32 %mul1, %acc
2564  %mad2 = add i32 %mad1, %mul2
2565  %mad3 = add i32 %mad2, %mul3
2566
2567  store i32 %mad3, ptr addrspace(1) %dst, align 4
2568  ret void
2569}
2570
2571
; Three-term signed byte dot product of %vec1 and %vec2 over elements 0..2,
; added to the i32 loaded from %dst and stored back. For element 2 the %vec2
; operand is extracted and extended before the %vec1 operand, the reverse of
; elements 0 and 1, while the mul still lists the %vec1 value first.
; In the DL-suffixed check blocks below a single v_dot4 instruction covers all
; three products, with both sources passed through v_perm_b32 first.
2572define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1,
2573; GFX7-LABEL: idot4_commutative:
2574; GFX7:       ; %bb.0: ; %entry
2575; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
2576; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xf
2577; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2578; GFX7-NEXT:    s_mov_b32 s6, 0
2579; GFX7-NEXT:    s_mov_b32 s7, s3
2580; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2581; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
2582; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2583; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2584; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2585; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
2586; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
2587; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
2588; GFX7-NEXT:    s_mov_b32 s2, -1
2589; GFX7-NEXT:    s_waitcnt vmcnt(1)
2590; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
2591; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
2592; GFX7-NEXT:    s_waitcnt vmcnt(0)
2593; GFX7-NEXT:    v_bfe_i32 v4, v0, 0, 8
2594; GFX7-NEXT:    v_bfe_i32 v5, v0, 8, 8
2595; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2596; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v4, s4
2597; GFX7-NEXT:    v_bfe_i32 v0, v0, 16, 8
2598; GFX7-NEXT:    v_bfe_i32 v2, v2, 16, 8
2599; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v5, v1
2600; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
2601; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2602; GFX7-NEXT:    s_endpgm
2603;
2604; GFX8-LABEL: idot4_commutative:
2605; GFX8:       ; %bb.0: ; %entry
2606; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2607; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x3c
2608; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2609; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2610; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2611; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2612; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2613; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2614; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2615; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2616; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2617; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2618; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
2619; GFX8-NEXT:    s_waitcnt vmcnt(1)
2620; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
2621; GFX8-NEXT:    v_bfe_i32 v4, v3, 8, 8
2622; GFX8-NEXT:    v_bfe_i32 v3, v3, 16, 8
2623; GFX8-NEXT:    s_waitcnt vmcnt(0)
2624; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
2625; GFX8-NEXT:    v_bfe_i32 v5, v0, 8, 8
2626; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2627; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s0
2628; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 8
2629; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
2630; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
2631; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2632; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2633; GFX8-NEXT:    flat_store_dword v[0:1], v2
2634; GFX8-NEXT:    s_endpgm
2635;
2636; GFX9-NODL-LABEL: idot4_commutative:
2637; GFX9-NODL:       ; %bb.0: ; %entry
2638; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2639; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
2640; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2641; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2642; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
2643; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
2644; GFX9-NODL-NEXT:    s_load_dword s0, s[6:7], 0x0
2645; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2646; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
2647; GFX9-NODL-NEXT:    v_bfe_i32 v3, v1, 0, 8
2648; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2649; GFX9-NODL-NEXT:    v_bfe_i32 v4, v2, 0, 8
2650; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2651; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
2652; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2653; GFX9-NODL-NEXT:    v_mad_i32_i24 v2, v3, v4, s0
2654; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
2655; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
2656; GFX9-NODL-NEXT:    s_endpgm
2657;
2658; GFX9-DL-LABEL: idot4_commutative:
2659; GFX9-DL:       ; %bb.0: ; %entry
2660; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2661; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
2662; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2663; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2664; GFX9-DL-NEXT:    global_load_dword v1, v0, s[2:3]
2665; GFX9-DL-NEXT:    global_load_dword v2, v0, s[0:1]
2666; GFX9-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
2667; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc020100
2668; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2669; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2670; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v1, s1
2671; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2672; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v2, s1
2673; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2674; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s0
2675; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
2676; GFX9-DL-NEXT:    s_endpgm
2677;
2678; GFX10-DL-LABEL: idot4_commutative:
2679; GFX10-DL:       ; %bb.0: ; %entry
2680; GFX10-DL-NEXT:    s_clause 0x1
2681; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2682; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
2683; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2684; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
2685; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2686; GFX10-DL-NEXT:    s_clause 0x1
2687; GFX10-DL-NEXT:    global_load_dword v1, v0, s[2:3]
2688; GFX10-DL-NEXT:    global_load_dword v2, v0, s[0:1]
2689; GFX10-DL-NEXT:    s_waitcnt_depctr 0xffe3
2690; GFX10-DL-NEXT:    s_load_dword s0, s[6:7], 0x0
2691; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2692; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v1, 0xc020100
2693; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2694; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v2, 0xc020100
2695; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2696; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s0
2697; GFX10-DL-NEXT:    v_dot4c_i32_i8 v2, v1, v0
2698; GFX10-DL-NEXT:    global_store_dword v3, v2, s[6:7]
2699; GFX10-DL-NEXT:    s_endpgm
2700;
2701; GFX11-DL-LABEL: idot4_commutative:
2702; GFX11-DL:       ; %bb.0: ; %entry
2703; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2704; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2705; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x3c
2706; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
2707; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2708; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2709; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
2710; GFX11-DL-NEXT:    s_clause 0x1
2711; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[2:3]
2712; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[0:1]
2713; GFX11-DL-NEXT:    s_load_b32 s0, s[4:5], 0x0
2714; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
2715; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc020100
2716; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
2717; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020100
2718; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
2719; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2720; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
2721; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[4:5]
2722; GFX11-DL-NEXT:    s_endpgm
2723                                       ptr addrspace(1) %src2,
2724                                       ptr addrspace(1) %src3,
2725                                       ptr addrspace(1) nocapture %dst) {
2726entry:
  ; Per-work-item loads of one <4 x i8> from each source array. %vec3 is loaded
  ; here but no later instruction in this function reads it.
2727  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2728  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
2729  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
2730  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
2731  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
2732  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
2733  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
2734
  ; Term 1 pairs element 0 of %vec1 with element 0 of %vec2.
2735  %v1e0 = extractelement <4 x i8> %vec1, i64 0
2736  %cv1e0 = sext i8 %v1e0 to i32
2737  %v2e0 = extractelement <4 x i8> %vec2, i64 0
2738  %cv2e0 = sext i8 %v2e0 to i32
2739  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
2740
  ; Term 2 pairs element 1 of %vec1 with element 1 of %vec2.
2741  %v1e1 = extractelement <4 x i8> %vec1, i64 1
2742  %cv1e1 = sext i8 %v1e1 to i32
2743  %v2e1 = extractelement <4 x i8> %vec2, i64 1
2744  %cv2e1 = sext i8 %v2e1 to i32
2745  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
2746
  ; Term 3 pairs element 2 of both vectors; here the %vec2 value is defined
  ; before the %vec1 value, unlike terms 1 and 2.
2747  %v2e2 = extractelement <4 x i8> %vec2, i64 2
2748  %cv2e2 = sext i8 %v2e2 to i32
2749  %v1e2 = extractelement <4 x i8> %vec1, i64 2
2750  %cv1e2 = sext i8 %v1e2 to i32
2751  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
2752
2753
  ; Add the three terms to the i32 currently held at %dst and store the sum back.
2754  %acc = load i32, ptr addrspace(1) %dst, align 4
2755  %mad1 = add i32 %mul1, %acc
2756  %mad2 = add i32 %mad1, %mul2
2757  %mad3 = add i32 %mad2, %mul3
2758
2759  store i32 %mad3, ptr addrspace(1) %dst, align 4
2760  ret void
2761}
2762
; Kernel under test: signed i8 multiply-accumulate with three products whose
; operands come from three different source vectors.
;   mul1 = sext(vec2[1]) * sext(vec2[1])   (values are named *e0 but extract index 1)
;   mul2 = sext(vec1[1]) * sext(vec2[1])
;   mul3 = sext(vec2[2]) * sext(vec3[2])
;   *dst = *dst + mul1 + mul2 + mul3       (i32 accumulator loaded from %dst)
; Expected codegen, per the autogenerated assertions below: the gfx700 and
; gfx803 runs chain v_mad_i32_i24; the gfx900 run uses two SDWA multiplies, one
; v_mad_i32_i24 and a v_add3_u32; the gfx906, gfx1011/gfx1012 and gfx1100 runs
; pack the bytes with v_perm_b32 / v_or_b32 and emit a single dot4 instruction.
; The assertion lines are produced by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
2763define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
2764; GFX7-LABEL: idot4_acc32_3src_3ele_src0:
2765; GFX7:       ; %bb.0: ; %entry
2766; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
2767; GFX7-NEXT:    s_mov_b32 s11, 0xf000
2768; GFX7-NEXT:    s_mov_b32 s14, 0
2769; GFX7-NEXT:    s_mov_b32 s15, s11
2770; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2771; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2772; GFX7-NEXT:    s_mov_b64 s[12:13], s[0:1]
2773; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2774; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
2775; GFX7-NEXT:    s_mov_b64 s[12:13], s[2:3]
2776; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64
2777; GFX7-NEXT:    s_mov_b64 s[12:13], s[4:5]
2778; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
2779; GFX7-NEXT:    s_load_dword s0, s[6:7], 0x0
2780; GFX7-NEXT:    s_mov_b32 s10, -1
2781; GFX7-NEXT:    s_mov_b32 s8, s6
2782; GFX7-NEXT:    s_mov_b32 s9, s7
2783; GFX7-NEXT:    s_waitcnt vmcnt(2)
2784; GFX7-NEXT:    v_bfe_i32 v1, v2, 8, 8
2785; GFX7-NEXT:    s_waitcnt vmcnt(1)
2786; GFX7-NEXT:    v_bfe_i32 v2, v3, 8, 8
2787; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2788; GFX7-NEXT:    v_mad_i32_i24 v4, v2, v2, s0
2789; GFX7-NEXT:    v_bfe_i32 v3, v3, 16, 8
2790; GFX7-NEXT:    s_waitcnt vmcnt(0)
2791; GFX7-NEXT:    v_bfe_i32 v0, v0, 16, 8
2792; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v2, v4
2793; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v0, v1
2794; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0
2795; GFX7-NEXT:    s_endpgm
2796;
2797; GFX8-LABEL: idot4_acc32_3src_3ele_src0:
2798; GFX8:       ; %bb.0: ; %entry
2799; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
2800; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2801; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2802; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2803; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2804; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2805; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2806; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2807; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2808; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2809; GFX8-NEXT:    flat_load_dword v4, v[0:1]
2810; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2811; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2812; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2813; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2814; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x0
2815; GFX8-NEXT:    s_waitcnt vmcnt(2)
2816; GFX8-NEXT:    v_bfe_i32 v2, v3, 8, 8
2817; GFX8-NEXT:    s_waitcnt vmcnt(1)
2818; GFX8-NEXT:    v_bfe_i32 v1, v4, 8, 8
2819; GFX8-NEXT:    v_bfe_i32 v3, v4, 16, 8
2820; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2821; GFX8-NEXT:    v_mad_i32_i24 v4, v1, v1, s0
2822; GFX8-NEXT:    v_mad_i32_i24 v1, v2, v1, v4
2823; GFX8-NEXT:    s_waitcnt vmcnt(0)
2824; GFX8-NEXT:    v_bfe_i32 v0, v0, 16, 8
2825; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
2826; GFX8-NEXT:    v_mov_b32_e32 v0, s6
2827; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2828; GFX8-NEXT:    flat_store_dword v[0:1], v2
2829; GFX8-NEXT:    s_endpgm
2830;
2831; GFX9-NODL-LABEL: idot4_acc32_3src_3ele_src0:
2832; GFX9-NODL:       ; %bb.0: ; %entry
2833; GFX9-NODL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
2834; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2835; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2836; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[10:11]
2837; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[12:13]
2838; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[8:9]
2839; GFX9-NODL-NEXT:    s_load_dword s0, s[14:15], 0x0
2840; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2841; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
2842; GFX9-NODL-NEXT:    v_bfe_i32 v4, v1, 8, 8
2843; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
2844; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
2845; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2846; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v2, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
2847; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2848; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, v4, v4, s0
2849; GFX9-NODL-NEXT:    v_add3_u32 v1, v3, v2, v1
2850; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[14:15]
2851; GFX9-NODL-NEXT:    s_endpgm
2852;
2853; GFX9-DL-LABEL: idot4_acc32_3src_3ele_src0:
2854; GFX9-DL:       ; %bb.0: ; %entry
2855; GFX9-DL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
2856; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2857; GFX9-DL-NEXT:    s_mov_b32 s0, 0xc06010c
2858; GFX9-DL-NEXT:    s_mov_b32 s1, 0xc0c0c01
2859; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc020101
2860; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2861; GFX9-DL-NEXT:    global_load_dword v1, v0, s[12:13]
2862; GFX9-DL-NEXT:    global_load_dword v2, v0, s[8:9]
2863; GFX9-DL-NEXT:    global_load_dword v3, v0, s[10:11]
2864; GFX9-DL-NEXT:    s_load_dword s3, s[14:15], 0x0
2865; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2866; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2867; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v2, s0
2868; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2869; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s1
2870; GFX9-DL-NEXT:    v_or_b32_e32 v1, v1, v2
2871; GFX9-DL-NEXT:    v_perm_b32 v2, v3, v3, s2
2872; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2873; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v2, v1, s3
2874; GFX9-DL-NEXT:    global_store_dword v0, v1, s[14:15]
2875; GFX9-DL-NEXT:    s_endpgm
2876;
2877; GFX10-DL-LABEL: idot4_acc32_3src_3ele_src0:
2878; GFX10-DL:       ; %bb.0: ; %entry
2879; GFX10-DL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
2880; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2881; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2882; GFX10-DL-NEXT:    s_clause 0x2
2883; GFX10-DL-NEXT:    global_load_dword v1, v0, s[12:13]
2884; GFX10-DL-NEXT:    global_load_dword v2, v0, s[8:9]
2885; GFX10-DL-NEXT:    global_load_dword v3, v0, s[10:11]
2886; GFX10-DL-NEXT:    s_load_dword s0, s[14:15], 0x0
2887; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2888; GFX10-DL-NEXT:    v_perm_b32 v0, v1, v2, 0xc06010c
2889; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2890; GFX10-DL-NEXT:    v_perm_b32 v1, v3, v3, 0xc0c0c01
2891; GFX10-DL-NEXT:    v_perm_b32 v2, v3, v3, 0xc020101
2892; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
2893; GFX10-DL-NEXT:    v_or_b32_e32 v0, v0, v1
2894; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2895; GFX10-DL-NEXT:    v_mov_b32_e32 v1, s0
2896; GFX10-DL-NEXT:    v_dot4c_i32_i8 v1, v2, v0
2897; GFX10-DL-NEXT:    global_store_dword v3, v1, s[14:15]
2898; GFX10-DL-NEXT:    s_endpgm
2899;
2900; GFX11-DL-LABEL: idot4_acc32_3src_3ele_src0:
2901; GFX11-DL:       ; %bb.0: ; %entry
2902; GFX11-DL-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
2903; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2904; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2905; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2906; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
2907; GFX11-DL-NEXT:    s_clause 0x2
2908; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
2909; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[0:1]
2910; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
2911; GFX11-DL-NEXT:    s_load_b32 s0, s[6:7], 0x0
2912; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
2913; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v2, 0xc06010c
2914; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
2915; GFX11-DL-NEXT:    v_perm_b32 v2, v0, v0, 0xc0c0c01
2916; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc020101
2917; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
2918; GFX11-DL-NEXT:    v_or_b32_e32 v1, v1, v2
2919; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
2920; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
2921; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
2922; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[6:7]
2923; GFX11-DL-NEXT:    s_endpgm
2924                                       ptr addrspace(1) %src2,
2925                                       ptr addrspace(1) %src3,
2926                                       ptr addrspace(1) nocapture %dst) {
2927entry:
; Each work-item loads one <4 x i8> from each source, indexed by its X id.
2928  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2929  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
2930  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
2931  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
2932  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
2933  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
2934  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
2935
; mul1: both operands are the same value, the sign-extended element 1 of %vec2.
2936  %v2e0 = extractelement <4 x i8> %vec2, i64 1
2937  %cv2e0 = sext i8 %v2e0 to i32
2938  %mul1 = mul nuw nsw i32 %cv2e0, %cv2e0
2939
; mul2: element 1 of %vec1 times element 1 of %vec2.
2940  %v1e1 = extractelement <4 x i8> %vec1, i64 1
2941  %cv1e1 = sext i8 %v1e1 to i32
2942  %v2e1 = extractelement <4 x i8> %vec2, i64 1
2943  %cv2e1 = sext i8 %v2e1 to i32
2944  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
2945
; mul3: element 2 of %vec2 times element 2 of %vec3.
2946  %v3e2 = extractelement <4 x i8> %vec3, i64 2
2947  %cv3e2 = sext i8 %v3e2 to i32
2948  %v2e2 = extractelement <4 x i8> %vec2, i64 2
2949  %cv2e2 = sext i8 %v2e2 to i32
2950  %mul3 = mul nuw nsw i32 %cv2e2, %cv3e2
2951
2952
; Accumulate the three products onto the i32 already stored at %dst.
2953  %acc = load i32, ptr addrspace(1) %dst, align 4
2954  %mad1 = add i32 %mul1, %acc
2955  %mad2 = add i32 %mad1, %mul2
2956  %mad3 = add i32 %mad2, %mul3
2957
2958  store i32 %mad3, ptr addrspace(1) %dst, align 4
2959  ret void
2960}
2961
; Kernel under test: four signed i8 products, each taken from a different
; source vector. For each of %vec1..%vec4 the product is
;   sext(vec[0]) * sext(vec[1])
; and the four products are added to the i32 loaded from %dst, with the sum
; stored back to %dst.
; Expected codegen, per the autogenerated assertions below: the gfx700 and
; gfx803 runs chain four v_mad_i32_i24; the gfx900 run uses four SDWA
; multiplies and two v_add3_u32; the gfx906, gfx1011/gfx1012 and gfx1100 runs
; build both dot4 operands with v_perm_b32 + v_or_b32 and emit one dot4
; instruction.
; The assertion lines are produced by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
2962define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
2963; GFX7-LABEL: idot4_4src:
2964; GFX7:       ; %bb.0: ; %entry
2965; GFX7-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x9
2966; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2967; GFX7-NEXT:    s_mov_b32 s18, 0
2968; GFX7-NEXT:    s_mov_b32 s19, s3
2969; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2970; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2971; GFX7-NEXT:    s_mov_b64 s[16:17], s[8:9]
2972; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2973; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[16:19], 0 addr64
2974; GFX7-NEXT:    s_mov_b64 s[16:17], s[10:11]
2975; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[16:19], 0 addr64
2976; GFX7-NEXT:    s_mov_b64 s[16:17], s[12:13]
2977; GFX7-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64
2978; GFX7-NEXT:    s_mov_b64 s[16:17], s[14:15]
2979; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[16:19], 0 addr64
2980; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x11
2981; GFX7-NEXT:    s_mov_b32 s2, -1
2982; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2983; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
2984; GFX7-NEXT:    s_waitcnt vmcnt(3)
2985; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
2986; GFX7-NEXT:    v_bfe_i32 v2, v2, 8, 8
2987; GFX7-NEXT:    s_waitcnt vmcnt(2)
2988; GFX7-NEXT:    v_bfe_i32 v5, v3, 0, 8
2989; GFX7-NEXT:    v_bfe_i32 v3, v3, 8, 8
2990; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2991; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v2, s4
2992; GFX7-NEXT:    s_waitcnt vmcnt(1)
2993; GFX7-NEXT:    v_bfe_i32 v2, v4, 0, 8
2994; GFX7-NEXT:    v_bfe_i32 v4, v4, 8, 8
2995; GFX7-NEXT:    v_mad_i32_i24 v1, v5, v3, v1
2996; GFX7-NEXT:    s_waitcnt vmcnt(0)
2997; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 8
2998; GFX7-NEXT:    v_bfe_i32 v0, v0, 8, 8
2999; GFX7-NEXT:    v_mad_i32_i24 v1, v2, v4, v1
3000; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v0, v1
3001; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3002; GFX7-NEXT:    s_endpgm
3003;
3004; GFX8-LABEL: idot4_4src:
3005; GFX8:       ; %bb.0: ; %entry
3006; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
3007; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3008; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
3009; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3010; GFX8-NEXT:    v_mov_b32_e32 v1, s9
3011; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s8, v2
3012; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3013; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3014; GFX8-NEXT:    v_mov_b32_e32 v1, s11
3015; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s10, v2
3016; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3017; GFX8-NEXT:    flat_load_dword v4, v[0:1]
3018; GFX8-NEXT:    v_mov_b32_e32 v1, s13
3019; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s12, v2
3020; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3021; GFX8-NEXT:    flat_load_dword v5, v[0:1]
3022; GFX8-NEXT:    v_mov_b32_e32 v1, s15
3023; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s14, v2
3024; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3025; GFX8-NEXT:    flat_load_dword v0, v[0:1]
3026; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
3027; GFX8-NEXT:    s_waitcnt vmcnt(3)
3028; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
3029; GFX8-NEXT:    v_bfe_i32 v2, v3, 8, 8
3030; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3031; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s2
3032; GFX8-NEXT:    s_waitcnt vmcnt(2)
3033; GFX8-NEXT:    v_bfe_i32 v3, v4, 0, 8
3034; GFX8-NEXT:    v_bfe_i32 v4, v4, 8, 8
3035; GFX8-NEXT:    v_mad_i32_i24 v1, v3, v4, v1
3036; GFX8-NEXT:    s_waitcnt vmcnt(1)
3037; GFX8-NEXT:    v_bfe_i32 v6, v5, 0, 8
3038; GFX8-NEXT:    v_bfe_i32 v5, v5, 8, 8
3039; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v5, v1
3040; GFX8-NEXT:    s_waitcnt vmcnt(0)
3041; GFX8-NEXT:    v_bfe_i32 v7, v0, 0, 8
3042; GFX8-NEXT:    v_bfe_i32 v0, v0, 8, 8
3043; GFX8-NEXT:    v_mad_i32_i24 v2, v7, v0, v1
3044; GFX8-NEXT:    v_mov_b32_e32 v0, s0
3045; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3046; GFX8-NEXT:    flat_store_dword v[0:1], v2
3047; GFX8-NEXT:    s_endpgm
3048;
3049; GFX9-NODL-LABEL: idot4_4src:
3050; GFX9-NODL:       ; %bb.0: ; %entry
3051; GFX9-NODL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
3052; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3053; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
3054; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
3055; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[8:9]
3056; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[10:11]
3057; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[12:13]
3058; GFX9-NODL-NEXT:    global_load_dword v4, v0, s[14:15]
3059; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
3060; GFX9-NODL-NEXT:    s_load_dword s2, s[0:1], 0x0
3061; GFX9-NODL-NEXT:    s_waitcnt vmcnt(3)
3062; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
3063; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
3064; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v2, sext(v2), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
3065; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
3066; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v3), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
3067; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
3068; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
3069; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
3070; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s2, v2
3071; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, v4
3072; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[0:1]
3073; GFX9-NODL-NEXT:    s_endpgm
3074;
3075; GFX9-DL-LABEL: idot4_4src:
3076; GFX9-DL:       ; %bb.0: ; %entry
3077; GFX9-DL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
3078; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3079; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
3080; GFX9-DL-NEXT:    s_mov_b32 s2, 0xc0c0501
3081; GFX9-DL-NEXT:    s_mov_b32 s3, 0x5010c0c
3082; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
3083; GFX9-DL-NEXT:    global_load_dword v1, v0, s[8:9]
3084; GFX9-DL-NEXT:    global_load_dword v2, v0, s[10:11]
3085; GFX9-DL-NEXT:    global_load_dword v3, v0, s[12:13]
3086; GFX9-DL-NEXT:    global_load_dword v4, v0, s[14:15]
3087; GFX9-DL-NEXT:    s_mov_b32 s4, 0xc0c0400
3088; GFX9-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
3089; GFX9-DL-NEXT:    s_mov_b32 s5, 0x4000c0c
3090; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
3091; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
3092; GFX9-DL-NEXT:    v_perm_b32 v5, v2, v1, s2
3093; GFX9-DL-NEXT:    v_perm_b32 v1, v2, v1, s4
3094; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
3095; GFX9-DL-NEXT:    v_perm_b32 v6, v4, v3, s3
3096; GFX9-DL-NEXT:    v_perm_b32 v2, v4, v3, s5
3097; GFX9-DL-NEXT:    v_or_b32_e32 v3, v6, v5
3098; GFX9-DL-NEXT:    v_or_b32_e32 v1, v2, v1
3099; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
3100; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v1, v3, s6
3101; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
3102; GFX9-DL-NEXT:    s_endpgm
3103;
3104; GFX10-DL-LABEL: idot4_4src:
3105; GFX10-DL:       ; %bb.0: ; %entry
3106; GFX10-DL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
3107; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3108; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
3109; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3110; GFX10-DL-NEXT:    s_clause 0x3
3111; GFX10-DL-NEXT:    global_load_dword v1, v0, s[8:9]
3112; GFX10-DL-NEXT:    global_load_dword v2, v0, s[10:11]
3113; GFX10-DL-NEXT:    global_load_dword v3, v0, s[12:13]
3114; GFX10-DL-NEXT:    global_load_dword v4, v0, s[14:15]
3115; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
3116; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
3117; GFX10-DL-NEXT:    v_perm_b32 v0, v2, v1, 0xc0c0501
3118; GFX10-DL-NEXT:    v_perm_b32 v1, v2, v1, 0xc0c0400
3119; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
3120; GFX10-DL-NEXT:    v_perm_b32 v5, v4, v3, 0x5010c0c
3121; GFX10-DL-NEXT:    v_perm_b32 v2, v4, v3, 0x4000c0c
3122; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
3123; GFX10-DL-NEXT:    v_or_b32_e32 v0, v5, v0
3124; GFX10-DL-NEXT:    v_or_b32_e32 v1, v2, v1
3125; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3126; GFX10-DL-NEXT:    v_mov_b32_e32 v2, s2
3127; GFX10-DL-NEXT:    v_dot4c_i32_i8 v2, v1, v0
3128; GFX10-DL-NEXT:    global_store_dword v3, v2, s[0:1]
3129; GFX10-DL-NEXT:    s_endpgm
3130;
3131; GFX11-DL-LABEL: idot4_4src:
3132; GFX11-DL:       ; %bb.0: ; %entry
3133; GFX11-DL-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
3134; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3135; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
3136; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3137; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3138; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
3139; GFX11-DL-NEXT:    s_clause 0x3
3140; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[8:9]
3141; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[10:11]
3142; GFX11-DL-NEXT:    global_load_b32 v3, v0, s[12:13]
3143; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[14:15]
3144; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
3145; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
3146; GFX11-DL-NEXT:    v_perm_b32 v4, v2, v1, 0xc0c0501
3147; GFX11-DL-NEXT:    v_perm_b32 v1, v2, v1, 0xc0c0400
3148; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
3149; GFX11-DL-NEXT:    v_perm_b32 v5, v0, v3, 0x5010c0c
3150; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v3, 0x4000c0c
3151; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3152; GFX11-DL-NEXT:    v_or_b32_e32 v2, v5, v4
3153; GFX11-DL-NEXT:    v_or_b32_e32 v0, v0, v1
3154; GFX11-DL-NEXT:    v_mov_b32_e32 v1, 0
3155; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
3156; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
3157; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v0, v2, s2 neg_lo:[1,1,0]
3158; GFX11-DL-NEXT:    global_store_b32 v1, v0, s[0:1]
3159; GFX11-DL-NEXT:    s_endpgm
3160                                       ptr addrspace(1) %src2,
3161                                       ptr addrspace(1) %src3,
3162                                       ptr addrspace(1) %src4,
3163                                       ptr addrspace(1) nocapture %dst) {
3164entry:
3165  %idx = call i32 @llvm.amdgcn.workitem.id.x()
3166
; Each work-item loads one <4 x i8> from each of the four sources.
3167  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
3168  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
3169  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
3170  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
3171  %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx
3172  %vec3 = load <4 x i8>, ptr addrspace(1) %gep3
3173  %gep4 = getelementptr <4 x i8>, ptr addrspace(1) %src4, i32 %idx
3174  %vec4 = load <4 x i8>, ptr addrspace(1) %gep4
3175
3176
; mul1: element 0 times element 1, both from %vec1.
3177  %v1e0 = extractelement <4 x i8> %vec1, i64 0
3178  %cv1e0 = sext i8 %v1e0 to i32
3179  %v1e1 = extractelement <4 x i8> %vec1, i64 1
3180  %cv1e1 = sext i8 %v1e1 to i32
3181  %mul1 = mul nuw nsw i32 %cv1e0, %cv1e1
3182
; mul2: element 0 times element 1, both from %vec2.
3183  %v2e0 = extractelement <4 x i8> %vec2, i64 0
3184  %cv2e0 = sext i8 %v2e0 to i32
3185  %v2e1 = extractelement <4 x i8> %vec2, i64 1
3186  %cv2e1 = sext i8 %v2e1 to i32
3187  %mul2 = mul nuw nsw i32 %cv2e0, %cv2e1
3188
; mul3: element 0 times element 1, both from %vec3.
3189  %v3e0 = extractelement <4 x i8> %vec3, i64 0
3190  %cv3e0 = sext i8 %v3e0 to i32
3191  %v3e1 = extractelement <4 x i8> %vec3, i64 1
3192  %cv3e1 = sext i8 %v3e1 to i32
3193  %mul3 = mul nuw nsw i32 %cv3e0, %cv3e1
3194
; mul4: element 0 times element 1, both from %vec4.
3195  %v4e0 = extractelement <4 x i8> %vec4, i64 0
3196  %cv4e0 = sext i8 %v4e0 to i32
3197  %v4e1 = extractelement <4 x i8> %vec4, i64 1
3198  %cv4e1 = sext i8 %v4e1 to i32
3199  %mul4 = mul nuw nsw i32 %cv4e0, %cv4e1
3200
3201
; Accumulate the four products onto the i32 already stored at %dst.
3202  %acc = load i32, ptr addrspace(1) %dst, align 4
3203  %mad1 = add i32 %mul1, %acc
3204  %mad2 = add i32 %mad1, %mul2
3205  %mad3 = add i32 %mad2, %mul3
3206  %mad4 = add i32 %mad3, %mul4
3207
3208  store i32 %mad4, ptr addrspace(1) %dst, align 4
3209  ret void
3210}
3211
; Kernel under test: a four-element dot product with mixed extension kinds,
; computed in i16 rather than i32.
; Every byte of %vec1 is sign-extended and every byte of %vec2 is zero-extended
; to i16. The four products are summed in i16 starting from the constant 0, and
; the i16 sum is sign-extended to i32 and stored to %dst. %dst is only written;
; the kernel never loads from it.
; Expected codegen, per the autogenerated assertions below: none of the runs
; contains a dot4 instruction. The gfx700 run uses v_mul_u32_u24 /
; v_mad_u32_u24; the other runs use 16-bit multiplies and multiply-adds
; (v_mul_lo_u16, v_mad_u16 or v_mad_legacy_u16). Every run finishes with a
; 16-bit v_bfe_i32 sign extension before the store.
; The assertion lines are produced by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
3212define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
3213; GFX7-LABEL: idot4_nonstandard_signed:
3214; GFX7:       ; %bb.0: ; %entry
3215; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
3216; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
3217; GFX7-NEXT:    s_mov_b32 s3, 0xf000
3218; GFX7-NEXT:    s_mov_b32 s6, 0
3219; GFX7-NEXT:    s_mov_b32 s7, s3
3220; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3221; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
3222; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3223; GFX7-NEXT:    v_mov_b32_e32 v1, 0
3224; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3225; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
3226; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
3227; GFX7-NEXT:    s_mov_b32 s2, -1
3228; GFX7-NEXT:    s_waitcnt vmcnt(1)
3229; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
3230; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
3231; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3232; GFX7-NEXT:    s_waitcnt vmcnt(0)
3233; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
3234; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 8
3235; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
3236; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
3237; GFX7-NEXT:    v_mul_u32_u24_e32 v1, v1, v5
3238; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
3239; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
3240; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
3241; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v3, v1
3242; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
3243; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v4, v1
3244; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
3245; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
3246; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
3247; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3248; GFX7-NEXT:    s_endpgm
3249;
3250; GFX8-LABEL: idot4_nonstandard_signed:
3251; GFX8:       ; %bb.0: ; %entry
3252; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3253; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
3254; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3255; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
3256; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3257; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3258; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3259; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3260; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3261; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3262; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3263; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3264; GFX8-NEXT:    flat_load_dword v2, v[0:1]
3265; GFX8-NEXT:    v_mov_b32_e32 v0, s4
3266; GFX8-NEXT:    v_mov_b32_e32 v1, s5
3267; GFX8-NEXT:    s_waitcnt vmcnt(1)
3268; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
3269; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
3270; GFX8-NEXT:    v_bfe_i32 v7, v7, 0, 8
3271; GFX8-NEXT:    v_bfe_i32 v5, v5, 0, 8
3272; GFX8-NEXT:    s_waitcnt vmcnt(0)
3273; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
3274; GFX8-NEXT:    v_mul_lo_u16_sdwa v6, sext(v3), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
3275; GFX8-NEXT:    v_and_b32_e32 v8, 0xff, v8
3276; GFX8-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3277; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
3278; GFX8-NEXT:    v_mad_u16 v6, v8, v7, v6
3279; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
3280; GFX8-NEXT:    v_mad_u16 v4, v4, v5, v6
3281; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
3282; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
3283; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 16
3284; GFX8-NEXT:    flat_store_dword v[0:1], v2
3285; GFX8-NEXT:    s_endpgm
3286;
3287; GFX9-NODL-LABEL: idot4_nonstandard_signed:
3288; GFX9-NODL:       ; %bb.0: ; %entry
3289; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3290; GFX9-NODL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
3291; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3292; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
3293; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[0:1]
3294; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[2:3]
3295; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
3296; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
3297; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
3298; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
3299; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
3300; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
3301; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
3302; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
3303; GFX9-NODL-NEXT:    v_bfe_i32 v5, v5, 0, 8
3304; GFX9-NODL-NEXT:    v_and_b32_e32 v6, 0xff, v6
3305; GFX9-NODL-NEXT:    v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3306; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
3307; GFX9-NODL-NEXT:    v_bfe_i32 v3, v3, 0, 8
3308; GFX9-NODL-NEXT:    v_mad_legacy_u16 v4, v6, v5, v4
3309; GFX9-NODL-NEXT:    v_bfe_i32 v1, v1, 0, 8
3310; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v7, v3, v4
3311; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
3312; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
3313; GFX9-NODL-NEXT:    v_bfe_i32 v1, v1, 0, 16
3314; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[6:7]
3315; GFX9-NODL-NEXT:    s_endpgm
3316;
3317; GFX9-DL-LABEL: idot4_nonstandard_signed:
3318; GFX9-DL:       ; %bb.0: ; %entry
3319; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3320; GFX9-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
3321; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3322; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
3323; GFX9-DL-NEXT:    global_load_dword v1, v0, s[0:1]
3324; GFX9-DL-NEXT:    global_load_dword v2, v0, s[2:3]
3325; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
3326; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
3327; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
3328; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
3329; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
3330; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
3331; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
3332; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
3333; GFX9-DL-NEXT:    v_bfe_i32 v5, v5, 0, 8
3334; GFX9-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
3335; GFX9-DL-NEXT:    v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3336; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
3337; GFX9-DL-NEXT:    v_bfe_i32 v3, v3, 0, 8
3338; GFX9-DL-NEXT:    v_mad_legacy_u16 v4, v6, v5, v4
3339; GFX9-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
3340; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v7, v3, v4
3341; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
3342; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
3343; GFX9-DL-NEXT:    v_bfe_i32 v1, v1, 0, 16
3344; GFX9-DL-NEXT:    global_store_dword v0, v1, s[6:7]
3345; GFX9-DL-NEXT:    s_endpgm
3346;
3347; GFX10-DL-LABEL: idot4_nonstandard_signed:
3348; GFX10-DL:       ; %bb.0: ; %entry
3349; GFX10-DL-NEXT:    s_clause 0x1
3350; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3351; GFX10-DL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
3352; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3353; GFX10-DL-NEXT:    v_mov_b32_e32 v6, 0xff
3354; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3355; GFX10-DL-NEXT:    s_clause 0x1
3356; GFX10-DL-NEXT:    global_load_dword v1, v0, s[0:1]
3357; GFX10-DL-NEXT:    global_load_dword v2, v0, s[2:3]
3358; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
3359; GFX10-DL-NEXT:    v_bfe_i32 v0, v1, 0, 8
3360; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
3361; GFX10-DL-NEXT:    v_and_b32_e32 v3, 0xff, v2
3362; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
3363; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
3364; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
3365; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
3366; GFX10-DL-NEXT:    v_mul_lo_u16 v0, v0, v3
3367; GFX10-DL-NEXT:    v_bfe_i32 v3, v4, 0, 8
3368; GFX10-DL-NEXT:    v_and_b32_e32 v4, 0xff, v5
3369; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3370; GFX10-DL-NEXT:    v_bfe_i32 v6, v7, 0, 8
3371; GFX10-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
3372; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
3373; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v3, v0
3374; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v6, v0
3375; GFX10-DL-NEXT:    v_mad_u16 v0, v1, v2, v0
3376; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
3377; GFX10-DL-NEXT:    v_bfe_i32 v0, v0, 0, 16
3378; GFX10-DL-NEXT:    global_store_dword v1, v0, s[6:7]
3379; GFX10-DL-NEXT:    s_endpgm
3380;
3381; GFX11-DL-LABEL: idot4_nonstandard_signed:
3382; GFX11-DL:       ; %bb.0: ; %entry
3383; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3384; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3385; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
3386; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3387; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3388; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
3389; GFX11-DL-NEXT:    s_clause 0x1
3390; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
3391; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
3392; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
3393; GFX11-DL-NEXT:    v_bfe_i32 v2, v1, 0, 8
3394; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
3395; GFX11-DL-NEXT:    v_and_b32_e32 v3, 0xff, v0
3396; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
3397; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
3398; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
3399; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
3400; GFX11-DL-NEXT:    v_mul_lo_u16 v2, v2, v3
3401; GFX11-DL-NEXT:    v_bfe_i32 v3, v4, 0, 8
3402; GFX11-DL-NEXT:    v_and_b32_e32 v4, 0xff, v5
3403; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
3404; GFX11-DL-NEXT:    v_bfe_i32 v5, v6, 0, 8
3405; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v7
3406; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
3407; GFX11-DL-NEXT:    v_mad_u16 v2, v4, v3, v2
3408; GFX11-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
3409; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3410; GFX11-DL-NEXT:    v_mad_u16 v2, v6, v5, v2
3411; GFX11-DL-NEXT:    v_mad_u16 v0, v1, v0, v2
3412; GFX11-DL-NEXT:    v_mov_b32_e32 v1, 0
3413; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
3414; GFX11-DL-NEXT:    v_bfe_i32 v0, v0, 0, 16
3415; GFX11-DL-NEXT:    global_store_b32 v1, v0, s[4:5]
3416; GFX11-DL-NEXT:    s_endpgm
3417                                       ptr addrspace(1) %src2,
3418                                       ptr addrspace(1) nocapture %dst) {
3419entry:
3420  %idx = call i32 @llvm.amdgcn.workitem.id.x()
3421  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
3422  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
3423  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
3424  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
; Element 0: sext(vec1[0]) * zext(vec2[0]); the sum starts from constant 0.
3425  %v1e0 = extractelement <4 x i8> %vec1, i64 0
3426  %v1e0e = sext i8 %v1e0 to i16
3427  %v2e0 = extractelement <4 x i8> %vec2, i64 0
3428  %v2e0e = zext i8 %v2e0 to i16
3429  %mul0 = mul nsw i16 %v1e0e, %v2e0e
3430  %add0 = add i16 %mul0, 0
3431
; Element 1: the multiply operands are written in the opposite order here
; (zero-extended value first).
3432  %v1e1 = extractelement <4 x i8> %vec1, i64 1
3433  %v1e1e = sext i8 %v1e1 to i16
3434  %v2e1 = extractelement <4 x i8> %vec2, i64 1
3435  %v2e1e = zext i8 %v2e1 to i16
3436  %mul1 = mul nsw i16 %v2e1e, %v1e1e
3437  %add1 = add i16 %mul1, %add0
; Element 2: zero-extended operand first again.
3438  %v1e2 = extractelement <4 x i8> %vec1, i64 2
3439  %v1e2e = sext i8 %v1e2 to i16
3440  %v2e2 = extractelement <4 x i8> %vec2, i64 2
3441  %v2e2e = zext i8 %v2e2 to i16
3442  %mul2 = mul nsw i16 %v2e2e, %v1e2e
3443  %add2 = add i16 %mul2, %add1
; Element 3: sign-extended operand first, as for element 0.
3444  %v1e3 = extractelement <4 x i8> %vec1, i64 3
3445  %v1e3e = sext i8 %v1e3 to i16
3446  %v2e3 = extractelement <4 x i8> %vec2, i64 3
3447  %v2e3e = zext i8 %v2e3 to i16
3448  %mul3 = mul nsw i16 %v1e3e, %v2e3e
3449  %add3 = add i16 %mul3, %add2
; Widen the i16 sum to i32 with sign extension before storing it.
3450  %res = sext i16 %add3 to i32
3451  store i32 %res, ptr addrspace(1) %dst, align 4
3452  ret void
3453}
3454
3455
; Declaration of the AMDGPU work-item id (X) intrinsic. The kernels in this
; file call it to obtain %idx, which they use to index the <4 x i8> source
; arrays.
3456declare i32 @llvm.amdgcn.workitem.id.x()
3457